BibTeX-format database of papers in the 1993 DAGS conference.
Dartmouth College, June 1993.

@string{dags93 = "Proceedings of the 1993 DAGS/PC Symposium"}
@string{dags   = "Dartmouth Institute for Advanced Graduate Studies"}

INVITED PAPERS:

@InProceedings{wilkes:lessons,
  author       = "John Wilkes",
  title        = "{DataMesh}, house-building, and distributed systems technology",
  booktitle    = dags93,
  year         = 1993,
  pages        = "1--5",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker. Also appeared in ACM OSR April 1993.",
  keyword      = "file system, parallel I/O, RAID, disk array"
}

@InProceedings{nodine:loadbalance,
  author       = "Mark H. Nodine and Jeffrey Vitter",
  title        = "Load Balancing Paradigms for Optimal Use of Parallel Disks and Parallel Memory Hierarchies",
  booktitle    = dags93,
  year         = 1993,
  pages        = "26--39",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker: Jeffrey Vitter.",
  keyword      = "parallel I/O algorithm, memory hierarchy, load balance, sorting",
  abstract     = {We present several load balancing paradigms pertinent to optimizing I/O performance with disk and processor parallelism. We use sorting as our canonical application to illustrate the paradigms, and we survey a wide variety of applications in computational geometry. The use of parallel disks can help overcome the I/O bottleneck in sorting if the records in each read or write are evenly balanced among the disks. There are three known load balancing paradigms that lead to optimal I/O algorithms: using randomness to assign blocks to disks, using the disks predominantly independently, and deterministically balancing the blocks by matching. In this report, we describe all of these techniques in detail and compare their relative advantages. We show how randomized and deterministic balancing can be extended to provide sorting algorithms that are optimal both in terms of the number of I/Os and the internal processing time for parallel-processing machines with scalable I/O subsystems and with parallel memory hierarchies. We also survey results achieving optimal performance in these models for a large range of online and batch problems in computational geometry.}
}
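A note for orientation (this bound is not stated in the abstract above, and the symbols are the conventional parallel disk model parameters, not defined in this file): the optimal number of parallel I/Os for sorting in this setting is $\Theta\!\left(\frac{N}{DB}\log_{M/B}\frac{N}{B}\right)$, where $N$ is the number of records, $M$ the internal memory size, $B$ the block size, and $D$ the number of disks; this is the target the randomized and deterministic balancing paradigms are said to reach.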
@InProceedings{patterson:tip,
  author       = "R. Hugo Patterson and Garth A. Gibson and M. Satyanarayanan",
  title        = "Informed Prefetching: Converting High Throughput to Low Latency",
  booktitle    = dags93,
  year         = 1993,
  pages        = "41--55",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker: Garth Gibson. Similar paper appeared in ACM OSR April 1993.",
  keyword      = "file system, prefetching, operating system",
  abstract     = {This paper focuses on extending the power of caching and prefetching to reduce file read latencies by exploiting application level hints about future I/O accesses. We argue that systems that disclose high-level knowledge can transfer optimization information across module boundaries in a manner consistent with sound software engineering principles. Such Transparent Informed Prefetching (TIP) systems provide a technique for converting the high throughput of new technologies such as disk arrays and log-structured file systems into low latency for applications. Our preliminary experiments show that even without a high-throughput I/O sub-system TIP yields reduced execution time of up to 30\% for applications obtaining data from a remote file server and up to 13\% for applications obtaining data from a single local disk. These experiments indicate that greater performance benefits will be available when TIP is integrated with low level resource management policies and highly parallel I/O subsystems such as disk arrays.}
}

@InProceedings{aggarwal:latency,
  author       = "Alok Aggarwal and Ashok K. Chandra and Marc Snir",
  title        = "On Communication Latency In {PRAM} Computations",
  booktitle    = dags93,
  year         = 1993,
  pages        = "76--86",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker: Alok Aggarwal. Extended version submitted to FOCS.",
  keyword      = "PRAM, parallel algorithm, memory hierarchy, PRAM models, parallel I/O"
}

@InProceedings{scott:matrix,
  author       = "David S. Scott",
  title        = "Parallel {I/O} and Solving Out of Core Systems of Linear Equations",
  booktitle    = dags93,
  year         = 1993,
  pages        = "123--130",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker.",
  keyword      = "parallel I/O, scientific computing, matrix factorization, Intel",
  abstract     = {Large systems of linear equations arise in a number of scientific and engineering applications. In this paper we describe the implementation of a family of disk based linear equation solvers and the required characteristics of the I/O system needed to support them.}
}

@InProceedings{waltz:database,
  author       = "David L. Waltz",
  title        = "Innovative Massively Parallel {AI} Applications",
  booktitle    = dags93,
  year         = 1993,
  pages        = "132--138",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  comment      = "Invited speaker.",
  keyword      = "database, AI, artificial intelligence",
  abstract     = {Massively parallel applications must address problems that will be too large for workstations for the next several years, or else it will not make sense to expend development costs on them. Suitable applications include one or more of the following properties: 1) large amounts of data; 2) intensive computations; 3) requirement for very fast response times; 4) ways to trade computations for human effort, as in developing applications using learning methods. Most of the suitable applications that we have found come from the general area of very large databases. Massively parallel machines have proved to be important not only in being able to run large applications, but in accelerating development (allowing the use of simpler algorithms, cutting the time to test performance on realistic databases) and allowing many different algorithms and parameter settings to be tried and compared for a particular task. This paper summarizes four such applications. The applications described are: 1) prediction of credit card "defaulters" (non-payers) and "attritters" (people who didn't renew their cards) from a credit card database; 2) prediction of the continuation of time series, e.g. stock price movements; 3) automatic keyword assignment for news articles; and 4) protein secondary structure prediction. These add to a list identified in an earlier paper [Waltz 90] including: 5) automatic classification of U.S. Census Bureau long forms, using MBR -- Memory-Based Reasoning [Creecy et al 92, Waltz 89, Stanfill \& Waltz 86]; 6) generating catalogs for a mail order company that maximize expected net returns (revenues from orders minus cost of the catalogs and mailings) using genetically-inspired methods; and 7) text-based intelligent systems for information retrieval, decision support, etc.}
}
CONTRIBUTED PAPERS:

@InProceedings{krieger:hfs,
  author       = "Orran Krieger and Michael Stumm",
  title        = "{HFS:} A Flexible File System for large-scale Multiprocessors",
  booktitle    = dags93,
  year         = 1993,
  pages        = "6--14",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "multiprocessor file system, parallel I/O, operating system, shared memory",
  abstract     = {The {H{\sc urricane}} File System (HFS) is a new file system being developed for large-scale shared memory multiprocessors with distributed disks. The main goal of this file system is scalability; that is, the file system is designed to handle demands that are expected to grow linearly with the number of processors in the system. To achieve this goal, HFS is designed using a new structuring technique called Hierarchical Clustering. HFS is also designed to be flexible in supporting a variety of policies for managing file data and for managing file system state. This flexibility is necessary to support in a scalable fashion the diverse workloads we expect for a multiprocessor file system.}
}

@InProceedings{keane:commercial,
  author       = "J. A. Keane and T. N. Franklin and A. J. Grant and R. Sumner and M. Q. Xu",
  title        = "Commercial Users' Requirements for Parallel Systems",
  booktitle    = dags93,
  year         = 1993,
  pages        = "15--25",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "parallel architecture, parallel I/O, databases, commercial requirements",
  abstract     = {This paper reports on part of an on-going analysis of parallel systems for commercial users. The particular focus of this paper is on the requirements that commercial users, in particular users with financial database systems, have of parallel systems. The issues of concern to such users differ from those of concern to science and engineering users. Performance of the parallel system is not the only, or indeed primary, reason for moving to such systems for commercial users. Infra-structure issues are important, such as system availability and inter-working with existing systems. These issues are discussed in the context of a banking customer's requirements. The various technical concerns that these requirements impose are discussed in terms of commercially available systems.}
}

@InProceedings{womble:pario,
  author       = "David Womble and David Greenberg and Stephen Wheat and Rolf Riesen",
  title        = "Beyond Core: Making Parallel Computer {I/O} Practical",
  booktitle    = dags93,
  year         = 1993,
  pages        = "56--63",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "parallel I/O, out-of-core, parallel algorithm, scientific computing, multiprocessor file system",
  abstract     = {The solution of Grand Challenge Problems will require computations which are too large to fit in the memories of even the largest machines. Inevitably, new designs of I/O systems will be necessary to support them. Through our implementations of an out-of-core LU factorization we have learned several important lessons about what I/O systems should be like. In particular we believe that the I/O system must provide the programmer with the ability to explicitly manage storage. One method of doing so is to have a partitioned secondary storage in which each processor owns a logical disk. Along with operating system enhancements which allow overheads such as buffer copying to be avoided, this sort of I/O system meets the needs of high performance computing.}
}
@InProceedings{cormen:integrate,
  author       = "Thomas H. Cormen and David Kotz",
  title        = "Integrating Theory and Practice in Parallel File Systems",
  booktitle    = dags93,
  year         = 1993,
  pages        = "64--74",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "parallel I/O, multiprocessor file systems, algorithm, file system interface",
  abstract     = {Several algorithms for parallel disk systems have appeared in the literature recently, and they are asymptotically optimal in terms of the number of disk accesses. Scalable systems with parallel disks must be able to run these algorithms. We present for the first time a list of capabilities that must be provided by the system to support these optimal algorithms: control over declustering, querying about the configuration, independent I/O, and turning off parity, file caching, and prefetching. We summarize recent theoretical and empirical work that justifies the need for these capabilities. In addition, we sketch an organization for a parallel file interface with low-level primitives and higher-level operations.}
}

@InProceedings{chin:hash,
  author       = "Andrew Chin",
  title        = "Locality-Preserving Hashing",
  booktitle    = dags93,
  year         = 1993,
  pages        = "87--98",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "parallel computing model, hashing, locality, shared memory",
  abstract     = {When simulating shared-memory parallel computations on physically distributed memory, it may be advantageous to hash the address space to prevent network congestion and memory bank contention. The decision whether or not to use hashing depends on the communication latency in the network and the locality of memory accesses in the algorithm. A complexity-theoretic basis for this decision is provided by the Block PRAM model of Aggarwal, Chandra and Snir, a shared-memory model of parallel computation which accounts for communication locality. For this model, we exhibit a universal family of hash functions having optimal locality. The complexity of applying these hash functions to the shared address space of the Block PRAM (i.e., by permuting data elements) is asymptotically equivalent to the complexity of performing a square matrix transpose, and this result is best possible for all pairwise independent universal hash families.}
}

@InProceedings{dixon:arithmetic,
  author       = "B. Dixon and A. K. Lenstra",
  title        = "Fast Massively Parallel Modular Arithmetic",
  booktitle    = dags93,
  year         = 1993,
  pages        = "99--110",
  organization = dags,
  address      = "Hanover, NH",
  month        = jun,
  keyword      = "arithmetic, factoring, parallel algorithm",
  abstract     = {High-performance implementations on massively parallel architectures of many number theoretic algorithms and cryptographic schemes require efficient methods to perform modular arithmetic. This paper shows how this can be achieved on a SIMD array of processors. The goal of our algorithms is to trade parallelism for time, which we are able to do in an optimal fashion. For the purposes of performing modular arithmetic, we can effectively reconfigure a 16K processor massively parallel SIMD computer into a machine having 16K$/k$ processors that are $k$ times faster. This is particularly useful for our high-performance application, where a high degree of slow parallelism is much less efficient than a lower degree of faster parallelism.}
}
Reif", title = "A Data-Parallel Implementation of the Adaptive Fast Multipole Algorithm", booktitle = dags93, year = 1993, pages = "111--123", organization = dags, address = "Hanover, NH", month = jun, keyword = "parallel algorithm, scientific computing, parallel computing, language, prototyping", abstract = { Given an ensemble of $n$ bodies in space whose interaction is governed by a potential function, the N-body problem is to calculate the force on each body in the ensemble that results from its interaction with all other bodies. An efficient algorithm for this problem is critical in the simulation of molecular dynamics, turbulent fluid flow, intergalactic matter and other problems. The fast multipole algorithm (FMA) developed by Greengard approximates the solution with bounded error in time $O(n)$. For non-uniform distributions of bodies, an adaptive variation of the algorithm is required to maintain this time complexity. The parallel execution of the FMA poses complex implementation issues in the decomposition of the problem over processors to reduce communication. As a result the 3D Adaptive FMA has, to our knowledge, never been implemented on a scalable parallel computer. This paper describes several variations on the parallel adaptive 3D FMA algorithm that are expressed using the data-parallel subset of the high-level parallel prototyping language Proteus. These formulations have implicit parallelism that is executed sequentially using the current Proteus execution system to yield some insight into the performance of the variations. Efforts underway will make it possible to directly generate vector code from the formulations, rendering them executable on a broad class of parallel computers. } }