BibTeX-format database of papers in the 1993 DAGS conference.
Dartmouth College, June 1993.

@string{dags93 = "Proceedings of the 1993 DAGS/PC Symposium"}
@string{dags   = "Dartmouth Institute for Advanced Graduate Studies"}

INVITED PAPERS:

@InProceedings{wilkes:lessons,
  author       = "John Wilkes",
  title        = "{DataMesh}, house-building, and distributed systems technology",
  booktitle    = dags93,
  year         = 1993,
  pages        = "1--5",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker. Also appeared in ACM OSR April 1993.",
  keyword      = "file system, parallel I/O, RAID, disk array"
}

@InProceedings{nodine:loadbalance,
  author       = "Mark H. Nodine and Jeffrey Vitter",
  title        = "Load Balancing Paradigms for Optimal Use of Parallel Disks and Parallel Memory Hierarchies",
  booktitle    = dags93,
  year         = 1993,
  pages        = "26--39",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker: Jeffrey Vitter.",
  keyword      = "parallel I/O algorithm, memory hierarchy, load balance, sorting",
  abstract     = {We present several load balancing paradigms pertinent to optimizing I/O performance with disk and processor parallelism. We use sorting as our canonical application to illustrate the paradigms, and we survey a wide variety of applications in computational geometry. The use of parallel disks can help overcome the I/O bottleneck in sorting if the records in each read or write are evenly balanced among the disks. There are three known load balancing paradigms that lead to optimal I/O algorithms: using randomness to assign blocks to disks, using the disks predominantly independently, and deterministically balancing the blocks by matching. In this report, we describe all of these techniques in detail and compare their relative advantages. We show how randomized and deterministic balancing can be extended to provide sorting algorithms that are optimal both in terms of the number of I/Os and the internal processing time for parallel-processing machines with scalable I/O subsystems and with parallel memory hierarchies. We also survey results achieving optimal performance in these models for a large range of online and batch problems in computational geometry.}
}
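A note for orientation (this bound is not stated in the abstract above, and the symbols are the conventional parallel disk model parameters, not defined in this file): the optimal number of parallel I/Os for sorting in this setting is $\Theta\!\left(\frac{N}{DB}\log_{M/B}\frac{N}{B}\right)$, where $N$ is the number of records, $M$ the internal memory size, $B$ the block size, and $D$ the number of disks; this is the target the randomized and deterministic balancing paradigms are said to reach.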
@InProceedings{patterson:tip,
  author       = "R. Hugo Patterson and Garth A. Gibson and M. Satyanarayanan",
  title        = "Informed Prefetching: Converting High Throughput to Low Latency",
  booktitle    = dags93,
  year         = 1993,
  pages        = "41--55",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker: Garth Gibson. Similar paper appeared in ACM OSR April 1993.",
  keyword      = "file system, prefetching, operating system",
  abstract     = {This paper focuses on extending the power of caching and prefetching to reduce file read latencies by exploiting application level hints about future I/O accesses. We argue that systems that disclose high-level knowledge can transfer optimization information across module boundaries in a manner consistent with sound software engineering principles. Such Transparent Informed Prefetching (TIP) systems provide a technique for converting the high throughput of new technologies such as disk arrays and log-structured file systems into low latency for applications. Our preliminary experiments show that even without a high-throughput I/O sub-system TIP yields reduced execution time of up to 30\% for applications obtaining data from a remote file server and up to 13\% for applications obtaining data from a single local disk. These experiments indicate that greater performance benefits will be available when TIP is integrated with low level resource management policies and highly parallel I/O subsystems such as disk arrays.}
}

@InProceedings{aggarwal:latency,
  author       = "Alok Aggarwal and Ashok K. Chandra and Marc Snir",
  title        = "On Communication Latency In {PRAM} Computations",
  booktitle    = dags93,
  year         = 1993,
  pages        = "76--86",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker: Alok Aggarwal. Extended version submitted to FOCS.",
  keyword      = "PRAM, parallel algorithm, memory hierarchy, PRAM models, parallel I/O"
}

@InProceedings{scott:matrix,
  author       = "David S. Scott",
  title        = "Parallel {I/O} and Solving Out of Core Systems of Linear Equations",
  booktitle    = dags93,
  year         = 1993,
  pages        = "123--130",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker.",
  keyword      = "parallel I/O, scientific computing, matrix factorization, Intel",
  abstract     = {Large systems of linear equations arise in a number of scientific and engineering applications. In this paper we describe the implementation of a family of disk based linear equation solvers and the required characteristics of the I/O system needed to support them.}
}

@InProceedings{waltz:database,
  author       = "David L. Waltz",
  title        = "Innovative Massively Parallel {AI} Applications",
  booktitle    = dags93,
  year         = 1993,
  pages        = "132--138",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker.",
  keyword      = "database, AI, artificial intelligence",
  abstract     = {Massively parallel applications must address problems that will be too large for workstations for the next several years, or else it will not make sense to expend development costs on them. Suitable applications include one or more of the following properties: 1) large amounts of data; 2) intensive computations; 3) requirement for very fast response times; 4) ways to trade computations for human effort, as in developing applications using learning methods. Most of the suitable applications that we have found come from the general area of very large databases. Massively parallel machines have proved to be important not only in being able to run large applications, but in accelerating development (allowing the use of simpler algorithms, cutting the time to test performance on realistic databases) and allowing many different algorithms and parameter settings to be tried and compared for a particular task. This paper summarizes four such applications. The applications described are: 1) prediction of credit card "defaulters" (non-payers) and "attritters" (people who didn't renew their cards) from a credit card database; 2) prediction of the continuation of time series, e.g. stock price movements; 3) automatic keyword assignment for news articles; and 4) protein secondary structure prediction. These add to a list identified in an earlier paper [Waltz 90] including: 5) automatic classification of U.S. Census Bureau long forms, using MBR -- Memory-Based Reasoning [Creecy et al 92, Waltz 89, Stanfill \& Waltz 86]; 6) generating catalogs for a mail order company that maximize expected net returns (revenues from orders minus cost of the catalogs and mailings) using genetically-inspired methods; and 7) text-based intelligent systems for information retrieval, decision support, etc.}
}
CONTRIBUTED PAPERS:

@InProceedings{krieger:hfs,
  author       = "Orran Krieger and Michael Stumm",
  title        = "{HFS:} A Flexible File System for large-scale Multiprocessors",
  booktitle    = dags93,
  year         = 1993,
  pages        = "6--14",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "multiprocessor file system, parallel I/O, operating system, shared memory",
  abstract     = {The {H{\sc urricane}} File System (HFS) is a new file system being developed for large-scale shared memory multiprocessors with distributed disks. The main goal of this file system is scalability; that is, the file system is designed to handle demands that are expected to grow linearly with the number of processors in the system. To achieve this goal, HFS is designed using a new structuring technique called Hierarchical Clustering. HFS is also designed to be flexible in supporting a variety of policies for managing file data and for managing file system state. This flexibility is necessary to support in a scalable fashion the diverse workloads we expect for a multiprocessor file system.}
}

@InProceedings{keane:commercial,
  author       = "J. A. Keane and T. N. Franklin and A. J. Grant and R. Sumner and M. Q. Xu",
  title        = "Commercial Users' Requirements for Parallel Systems",
  booktitle    = dags93,
  year         = 1993,
  pages        = "15--25",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "parallel architecture, parallel I/O, databases, commercial requirements",
  abstract     = {This paper reports on part of an on-going analysis of parallel systems for commercial users. The particular focus of this paper is on the requirements that commercial users, in particular users with financial database systems, have of parallel systems. The issues of concern to such users differ from those of concern to science and engineering users. Performance of the parallel system is not the only, or indeed primary, reason for moving to such systems for commercial users. Infra-structure issues are important, such as system availability and inter-working with existing systems. These issues are discussed in the context of a banking customer's requirements. The various technical concerns that these requirements impose are discussed in terms of commercially available systems.}
}

@InProceedings{womble:pario,
  author       = "David Womble and David Greenberg and Stephen Wheat and Rolf Riesen",
  title        = "Beyond Core: Making Parallel Computer {I/O} Practical",
  booktitle    = dags93,
  year         = 1993,
  pages        = "56--63",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "parallel I/O, out-of-core, parallel algorithm, scientific computing, multiprocessor file system",
  abstract     = {The solution of Grand Challenge Problems will require computations which are too large to fit in the memories of even the largest machines. Inevitably, new designs of I/O systems will be necessary to support them. Through our implementations of an out-of-core LU factorization we have learned several important lessons about what I/O systems should be like. In particular we believe that the I/O system must provide the programmer with the ability to explicitly manage storage. One method of doing so is to have a partitioned secondary storage in which each processor owns a logical disk. Along with operating system enhancements which allow overheads such as buffer copying to be avoided, this sort of I/O system meets the needs of high performance computing.}
}
@InProceedings{cormen:integrate,
  author       = "Thomas H. Cormen and David Kotz",
  title        = "Integrating Theory and Practice in Parallel File Systems",
  booktitle    = dags93,
  year         = 1993,
  pages        = "64--74",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "parallel I/O, multiprocessor file systems, algorithm, file system interface",
  abstract     = {Several algorithms for parallel disk systems have appeared in the literature recently, and they are asymptotically optimal in terms of the number of disk accesses. Scalable systems with parallel disks must be able to run these algorithms. We present for the first time a list of capabilities that must be provided by the system to support these optimal algorithms: control over declustering, querying about the configuration, independent I/O, and turning off parity, file caching, and prefetching. We summarize recent theoretical and empirical work that justifies the need for these capabilities. In addition, we sketch an organization for a parallel file interface with low-level primitives and higher-level operations.}
}

@InProceedings{chin:hash,
  author       = "Andrew Chin",
  title        = "Locality-Preserving Hashing",
  booktitle    = dags93,
  year         = 1993,
  pages        = "87--98",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "parallel computing model, hashing, locality, shared memory",
  abstract     = {When simulating shared-memory parallel computations on physically distributed memory, it may be advantageous to hash the address space to prevent network congestion and memory bank contention. The decision whether or not to use hashing depends on the communication latency in the network and the locality of memory accesses in the algorithm. A complexity-theoretic basis for this decision is provided by the Block PRAM model of Aggarwal, Chandra and Snir, a shared-memory model of parallel computation which accounts for communication locality. For this model, we exhibit a universal family of hash functions having optimal locality. The complexity of applying these hash functions to the shared address space of the Block PRAM (i.e., by permuting data elements) is asymptotically equivalent to the complexity of performing a square matrix transpose, and this result is best possible for all pairwise independent universal hash families.}
}

@InProceedings{dixon:arithmetic,
  author       = "B. Dixon and A. K. Lenstra",
  title        = "Fast Massively Parallel Modular Arithmetic",
  booktitle    = dags93,
  year         = 1993,
  pages        = "99--110",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "arithmetic, factoring, parallel algorithm",
  abstract     = {High-performance implementations on massively parallel architectures of many number theoretic algorithms and cryptographic schemes require efficient methods to perform modular arithmetic. This paper shows how this can be achieved on a SIMD array of processors. The goal of our algorithms is to trade parallelism for time, which we are able to do in an optimal fashion. For the purposes of performing modular arithmetic, we can effectively reconfigure a 16K processor massively parallel SIMD computer into a machine having 16K$/k$ processors that are $k$ times faster. This is particularly useful for our high-performance application, where a high degree of slow parallelism is much less efficient than a lower degree of faster parallelism.}
}
Reif", title = "A Data-Parallel Implementation of the Adaptive Fast Multipole Algorithm", booktitle = dags93, year = 1993, pages = "111--123", organization = dags, address = "Hanover, NH", month = jun, keyword = "parallel algorithm, scientific computing, parallel computing, language, prototyping", abstract = { Given an ensemble of $n$ bodies in space whose interaction is governed by a potential function, the N-body problem is to calculate the force on each body in the ensemble that results from its interaction with all other bodies. An efficient algorithm for this problem is critical in the simulation of molecular dynamics, turbulent fluid flow, intergalactic matter and other problems. The fast multipole algorithm (FMA) developed by Greengard approximates the solution with bounded error in time $O(n)$. For non-uniform distributions of bodies, an adaptive variation of the algorithm is required to maintain this time complexity. The parallel execution of the FMA poses complex implementation issues in the decomposition of the problem over processors to reduce communication. As a result the 3D Adaptive FMA has, to our knowledge, never been implemented on a scalable parallel computer. This paper describes several variations on the parallel adaptive 3D FMA algorithm that are expressed using the data-parallel subset of the high-level parallel prototyping language Proteus. These formulations have implicit parallelism that is executed sequentially using the current Proteus execution system to yield some insight into the performance of the variations. Efforts underway will make it possible to directly generate vector code from the formulations, rendering them executable on a broad class of parallel computers. } }