This file contains a BibTeX bibliography for the papers that appear in
IOPADS '96, the Fourth Workshop on Input/Output in Parallel and
Distributed Systems. Please leave this header on the file; bibtex won't
mind. You can find more about this conference at
http://www.cs.dartmouth.edu/iopads96

@string{iopads96 = "Fourth Workshop on Input/Output in Parallel and Distributed Systems"}

@InProceedings{moore:detection,
  author = "Jason A. Moore and Phil Hatcher and Michael J. Quinn",
  title = "Efficient Data-Parallel Files via Automatic Mode Detection",
  pages = "1--14",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, data parallelism",
  abstract = {Parallel languages rarely specify parallel I/O constructs, and existing commercial systems provide the programmer with a low-level I/O interface. We present design principles for integrating I/O into languages and show how these principles are applied to a virtual-processor-oriented language. We illustrate how machine-independent modes are used to support both high performance and generality. We describe an automatic mode detection technique that saves the programmer from extra syntax and low-level file system details. We show how virtual processor file operations, typically small by themselves, are combined into efficient large-scale file system calls. Finally, we present a variety of benchmark results detailing design tradeoffs and the performance of various modes.}
}

@InProceedings{acharya:tuning,
  author = "Anurag Acharya and Mustafa Uysal and Robert Bennett and Assaf Mendelson and Michael Beynon and Jeffrey K. Hollingsworth and Joel Saltz and Alan Sussman",
  title = "Tuning the Performance of {I/O} Intensive Parallel Applications",
  pages = "15--27",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, filesystem workload, parallel application",
  abstract = {Getting good I/O performance from parallel programs is a critical problem for many application domains. In this paper, we report our experience tuning the I/O performance of four application programs from the areas of satellite-data processing and linear algebra. After tuning, three of the four applications achieve application-level I/O rates of over 100 MB/s on 16 processors. The total volume of I/O required by the programs ranged from about 75 MB to over 200 GB. We report the lessons learned in achieving high I/O performance from these applications, including the need for code restructuring, local disks on every node and knowledge of future I/O requests. We also report our experience on achieving high performance on peer-to-peer configurations.
    Finally, we comment on the necessity of complex I/O interfaces like collective I/O and strided requests to achieve high performance.}
}

@InProceedings{toledo:solar,
  author = "Sivan Toledo and Fred G. Gustavson",
  title = "The Design and Implementation of {SOLAR}, a Portable Library for Scalable Out-of-Core Linear Algebra Computations",
  pages = "28--40",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, out-of-core, linear algebra",
  abstract = {SOLAR is a portable high-performance library for out-of-core dense matrix computations. It combines portability with high performance by using existing high-performance in-core subroutine libraries and by using an optimized matrix input-output library. SOLAR works on parallel computers, workstations, and personal computers. It supports in-core computations on both shared-memory and distributed-memory machines, and its matrix input-output library supports both conventional I/O interfaces and parallel I/O interfaces. This paper discusses the overall design of SOLAR, its interfaces, and the design of several important subroutines. Experimental results show that SOLAR can factor on a single workstation an out-of-core positive-definite symmetric matrix at a rate exceeding 215 Mflops, and an out-of-core general matrix at a rate exceeding 195 Mflops. Less than 16\% of the running time is spent on I/O in these computations. These results indicate that SOLAR's portability does not compromise its performance. We expect that the combination of portability, modularity, and the use of a high-level I/O interface will make the library an important platform for research on out-of-core algorithms and on parallel I/O.}
}

@InProceedings{schwabe:layouts,
  author = "Eric J. Schwabe and Ian M. Sutherland and Bruce K. Holmer",
  title = "Evaluating Approximately Balanced Parity-Declustered Data Layouts for Disk Arrays",
  pages = "41--54",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, disk array, parity, RAID",
  abstract = {Parity declustering has been used to reduce the time required to reconstruct a failed disk in a disk array. Most existing work on parity declustering uses BIBD-based data layouts, which distribute the workload of reconstructing a failed disk over the remaining disks of the array with perfect balance. For certain array sizes, however, there is no known BIBD-based layout. In this paper, we evaluate data layouts that are approximately balanced --- that is, that distribute the reconstruction workload over the disks of the array with only approximate balance. Approximately balanced layouts are considerably easier to construct than perfectly balanced layouts. We consider three methods for generating approximately balanced layouts: randomization, simulated annealing, and perturbing a BIBD-based layout whose size is near the desired size. We compare the performance of these approximately balanced layouts with that of perfectly balanced layouts using a disk array simulator. We conclude that, on uniform workloads, approximately balanced data layouts have performance nearly identical to that of perfectly balanced layouts.
    Approximately balanced layouts therefore provide the reconstruction performance benefits of perfectly balanced layouts for arrays where perfectly balanced layouts are either not known, or do not exist.}
}

@InProceedings{purakayastha:enwrich,
  author = "Apratim Purakayastha and Carla Ellis and David Kotz",
  title = "{ENWRICH}: A Compute-Processor Write Caching Scheme for Parallel File Systems",
  pages = "55--68",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel file system, file caching",
  abstract = {Many parallel scientific applications need high-performance I/O\@. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is {\em collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed {\em disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.}
}

@InProceedings{soloviev:prefetching,
  author = "Valery V. Soloviev",
  title = "Prefetching in Segmented Disk Cache for Multi-Disk Systems",
  pages = "69--82",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, prefetching, disk cache, disk array",
  abstract = {This paper investigates the performance of a multi-disk storage system equipped with a segmented disk cache processing a workload of multiple relational scans. Prefetching is a popular method of improving the performance of scans. Many modern disks have a multisegment cache which can be used for prefetching. We observe that, exploiting declustering as a data placement method, prefetching in a segmented cache causes a load imbalance among several disks. A single disk becomes a bottleneck, degrading performance of the entire system. A variation in disk queue length is a primary factor of the imbalance. Using a precise simulation model, we investigate several approaches to achieving better balancing.
    Our metrics are a scan response time for the closed-end system and an ability to sustain a workload without saturating for the open-end system. We arrive at two main conclusions: (1) Prefetching in main memory is inexpensive and effective for balancing and can supplement or substitute prefetching in disk cache. (2) Disk-level prefetching provides about the same performance as main memory prefetching if request queues are managed in the disk controllers rather than in the host. Checking the disk cache before queuing requests provides not only better request response time but also drastically improves balancing. A single cache performs better than a segmented cache for this method.}
}

@InProceedings{nieuwejaar:galley,
  author = "Nils Nieuwejaar and David Kotz",
  title = "Performance of the {Galley} Parallel File System",
  pages = "83--94",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel file system",
  abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. This interface conceals the parallelism within the file system, which increases the ease of programmability, but makes it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. Furthermore, most current parallel file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic parallel workloads. Initial experiments, reported in this paper, indicate that Galley is capable of providing high-performance I/O to applications that access data in patterns that have been observed to be common.}
}

@InProceedings{krieger:hfs,
  author = "Orran Krieger and Michael Stumm",
  title = "{HFS}: A Performance-Oriented Flexible File System Based on Building-Block Compositions",
  pages = "95--108",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel file system, object-oriented",
  abstract = {The Hurricane File System (HFS) is designed for (potentially large-scale) shared memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies and I/O interfaces. Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. For example, a file's structure can be optimized for concurrent random-access write-only operations by ten processes. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most existing parallel file systems support a single file structure and a small set of policies. \par We have implemented HFS as part of the Hurricane operating system running on the Hector shared memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead.
    We also show that for a number of file access patterns HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.}
}

@InProceedings{chen:panda,
  author = "Y. Chen and M. Winslett and K. E. Seamons and S. Kuo and Y. Cho and M. Subramaniam",
  title = "Scalable Message Passing in {Panda}",
  pages = "109--121",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel file system",
  abstract = {To provide high performance for applications with a wide variety of i/o requirements and to support many different parallel platforms, the design of a parallel i/o system must provide for efficient utilization of available bandwidth both for disk traffic and for message passing. In this paper we discuss the message-passing scalability of the server-directed i/o architecture of Panda, a library for synchronized i/o of multidimensional arrays on parallel platforms. We show how to improve i/o performance in situations where message-passing is a bottleneck, by combining the server-directed i/o strategy for highly efficient use of available disk bandwidth with new mechanisms to minimize internal communication and computation overhead in Panda. We present experimental results that show that with these improvements, Panda will provide high i/o performance for a wider range of applications, such as applications running with slow interconnects, applications performing i/o operations on large numbers of arrays, or applications that require drastic data rearrangements as data are moved between memory and disk (e.g., array transposition). We also argue that in the future, the improved approach to message-passing will allow Panda to support applications that are not closely synchronized or that run in heterogeneous environments.}
}

@InProceedings{armen:disk-model,
  author = "Chris Armen",
  title = "Bounds on the Separation of Two Parallel Disk Models",
  pages = "122--127",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, theory, parallel I/O algorithm",
  abstract = {The single-disk, D-head model of parallel I/O was introduced by Aggarwal and Vitter to analyze algorithms for problem instances that are too large to fit in primary memory. Subsequently Vitter and Shriver proposed a more realistic model in which the disk space is partitioned into D disks, with a single head per disk. To date, each problem for which there is a known optimal algorithm for both models has the same asymptotic bounds on both models. Therefore, it has been unknown whether the models are equivalent or whether the single-disk model is strictly more powerful. \par In this paper we provide evidence that the single-disk model is strictly more powerful. We prove a lower bound on any general simulation of the single-disk model on the multi-disk model and establish randomized and deterministic upper bounds. Let $N$ be the problem size and let $T$ be the number of parallel I/Os required by a program on the single-disk model. Then any simulation of this program on the multi-disk model will require $\Omega\left(T \frac{\log(N/D)}{\log \log(N/D)}\right)$ parallel I/Os. This lower bound holds even if replication is allowed in the multi-disk model. We also show an $O\left(\frac{\log D}{\log \log D}\right)$ randomized upper bound and an $O\left(\log D (\log \log D)^2\right)$ deterministic upper bound.
    These results exploit an interesting analogy between the disk models and the PRAM and DCM models of parallel computation.}
}

@InProceedings{wisniewski:in-place,
  author = "Leonard F. Wisniewski",
  title = "Structured Permuting in Place on Parallel Disk Systems",
  pages = "128--139",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel I/O algorithm, permutation, out-of-core",
  abstract = {The ability to perform permutations of large data sets in place reduces the amount of necessary available disk storage. The simplest way to perform a permutation often is to read the records of a data set from a source portion of data storage, permute them in memory, and write them to a separate target portion of the same size. It can be quite expensive, however, to provide disk storage that is twice the size of very large data sets. Permuting in place reduces the expense by using only a small amount of extra disk storage beyond the size of the data set. \par \newcommand{\ceil}[1]{\lceil #1\rceil} \newcommand{\rank}[1]{\mathop{\rm rank}\nolimits #1} This paper features in-place algorithms for commonly used structured permutations. We have developed an asymptotically optimal algorithm for performing BMMC (bit-matrix-multiply/complement) permutations in place that requires at most $\frac{2N}{BD}\left( 2\ceil{\frac{\rank{\gamma}}{\lg (M/B)}} + \frac{7}{2}\right)$ parallel disk accesses, as long as $M \geq 2BD$, where $N$ is the number of records in the data set, $M$ is the number of records that can fit in memory, $D$ is the number of disks, $B$ is the number of records in a block, and $\gamma$ is the lower left $\lg (N/B) \times \lg B$ submatrix of the characteristic matrix for the permutation. This algorithm uses $N+M$ records of disk storage and requires only a constant factor more parallel disk accesses and insignificant additional computation than a previously published asymptotically optimal algorithm that uses $2N$ records of disk storage. \par We also give algorithms to perform mesh and torus permutations on a $d$-dimensional mesh. The in-place algorithm for mesh permutations requires at most $3\ceil{N/BD}$ parallel I/Os and the in-place algorithm for torus permutations uses at most $4dN/BD$ parallel I/Os. The algorithms for mesh and torus permutations require no extra disk space as long as the memory size $M$ is at least $3BD$. The torus algorithm improves upon the previous best algorithm in terms of both time and space.}
}
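
What follows is a minimal LaTeX usage sketch, left as trailing comment text
that bibtex ignores (just like the header above). The filename iopads96.bib
and the choice of the plain bibliography style are illustrative assumptions,
not something this file prescribes; the cite keys are the ones defined above.

  \documentclass{article}
  \begin{document}
  Automatic mode detection for data-parallel files is presented by Moore,
  Hatcher, and Quinn~\cite{moore:detection}, and the Galley parallel file
  system is evaluated by Nieuwejaar and Kotz~\cite{nieuwejaar:galley}.
  \bibliographystyle{plain}  % assumption: any standard style works
  \bibliography{iopads96}    % assumption: this file is saved as iopads96.bib
  \end{document}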