% BibTeX bibliography file @InProceedings{acharya:tuning, author = {Anurag Acharya and Mustafa Uysal and Robert Bennett and Assaf Mendelson and Michael Beynon and Jeffrey K. Hollingsworth and Joel Saltz and Alan Sussman}, title = {Tuning the Performance of {I/O} Intensive Parallel Applications}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {15--27}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, filesystem workload, parallel application, pario-bib}, abstract = {Getting good I/O performance from parallel programs is a critical problem for many application domains. In this paper, we report our experience tuning the I/O performance of four application programs from the areas of satellite-data processing and linear algebra. After tuning, three of the four applications achieve application-level I/O rates of over 100 MB/s on 16 processors. The total volume of I/O required by the programs ranged from about 75 MB to over 200 GB. We report the lessons learned in achieving high I/O performance from these applications, including the need for code restructuring, local disks on every node and knowledge of future I/O requests. We also report our experience on achieving high performance on peer-to-peer configurations. Finally, we comment on the necessity of complex I/O interfaces like collective I/O and strided requests to achieve high performance.} } @InProceedings{ap:enwrich, author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz}, title = {{ENWRICH:} A Compute-Processor Write Caching Scheme for Parallel File Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {55--68}, publisher = {ACM Press}, address = {Philadelphia}, earlier = {ap:enwrich-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/ap:enwrich.ps.Z}, keyword = {parallel file system, parallel I/O, caching, pario-bib, dfk}, abstract = {Many parallel scientific applications need high-performance I/O. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is {\em collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed {\em disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk-layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. 
ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.} } @InProceedings{armen:disk-model, author = {Chris Armen}, title = {Bounds on the Separation of Two Parallel Disk Models}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {122--127}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, theory, parallel I/O algorithm, pario-bib}, abstract = {The single-disk, D-head model of parallel I/O was introduced by Aggarwal and Vitter to analyze algorithms for problem instances that are too large to fit in primary memory. Subsequently Vitter and Shriver proposed a more realistic model in which the disk space is partitioned into D disks, with a single head per disk. To date, each problem for which there is a known optimal algorithm for both models has the same asymptotic bounds on both models. Therefore, it has been unknown whether the models are equivalent or whether the single-disk model is strictly more powerful. \par In this paper we provide evidence that the single-disk model is strictly more powerful. We prove a lower bound on any general simulation of the single-disk model on the multi-disk model and establish randomized and deterministic upper bounds. Let $N$ be the problem size and let $T$ be the number of parallel I/Os required by a program on the single-disk model. Then any simulation of this program on the multi-disk model will require $\Omega\left(T \frac{\log(N/D)}{\log \log(N/D)}\right)$ parallel I/Os. This lower bound holds even if replication is allowed in the multi-disk model. We also show an $O\left(\frac{\log D}{\log \log D}\right)$ randomized upper bound and an $O\left(\log D (\log \log D)^2\right)$ deterministic upper bound. These results exploit an interesting analogy between the disk models and the PRAM and DCM models of parallel computation.} } @InProceedings{arpaci-dusseau:river, author = {Remzi H. Arpaci-Dusseau and Eric Anderson and Noah Treuhaft and David E. Culler and Joseph M. Hellerstein and David Patterson and Kathy Yelick}, title = {Cluster {I/O} with {River}: Making the Fast Case Common}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {10--22}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Remzi.ps}, keyword = {cluster computing, parallel I/O, pario-bib}, abstract = {We introduce River, a data-flow programming environment and I/O substrate for clusters of computers. River is designed to provide maximum performance in the common case---even in the face of non-uniformities in hardware, software, and workload.
River is based on two simple design features: a high-performance distributed queue, and a storage redundancy mechanism called graduated declustering. We have implemented a number of data-intensive applications on River, which validate our design with near-ideal performance in a variety of non-uniform performance scenarios.} } @InProceedings{barve:competitive2, author = {Rakesh Barve and Mahesh Kallahalla and Peter J. Varman and Jeffrey Scott Vitter}, title = {Competitive Parallel Disk Prefetching and Buffer Management}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {47--56}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {disk prefetching, file caching, parallel I/O, pario-bib}, abstract = {We provide a competitive analysis framework for online prefetching and buffer management algorithms in parallel I/O systems, using a read-once model of block references. This has widespread applicability to key I/O-bound applications such as external merging and concurrent playback of multiple video streams. Two realistic lookahead models, global lookahead and local lookahead, are defined. Algorithms NOM and GREED based on these two forms of lookahead are analyzed for shared buffer and distributed buffer configurations, both of which occur frequently in existing systems. An important aspect of our work is that we show how to implement both the models of lookahead in practice using the simple techniques of forecasting and flushing. \par Given a D-disk parallel I/O system and a globally shared I/O buffer that can hold up to M disk blocks, we derive a lower bound of $\Omega(\sqrt{D})$ on the competitive ratio of any deterministic online prefetching algorithm with O(M) lookahead. NOM is shown to match the lower bound using global M-block lookahead. In contrast, using only local lookahead results in an $\Omega(D)$ competitive ratio. When the buffer is distributed into D portions of M/D blocks each, the algorithm GREED based on local lookahead is shown to be optimal, and NOM is within a constant factor of optimal. Thus we provide a theoretical basis for the intuition that global lookahead is more valuable for prefetching in the case of a shared buffer configuration whereas it is enough to provide local lookahead in case of the distributed configuration. Finally, we analyze the performance of these algorithms for reference strings generated by a uniformly-random stochastic process and we show that they achieve the minimal expected number of I/Os. These results also give bounds on the worst-case expected performance of algorithms which employ randomization in the data layout.}, comment = {See also barve:competitive. They propose two methods for scheduling prefetch operations in the situation where the access pattern is largely known in advance, in such a way as to minimize the total number of parallel I/Os. The two methods are quite straightforward, and yet match the optimum lower bound for an on-line algorithm.} } @InProceedings{barve:round, author = {Rakesh Barve and Phillip B. Gibbons and Bruce K.
Hillyer and Yossi Matias and Elizabeth Shriver and Jeffrey Scott Vitter}, title = {Round-like Behavior in Multiple Disks on a Bus}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {1--9}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Shriver.ps}, keyword = {disk, I/O bus, parallel I/O, pario-bib}, abstract = {In modern I/O architectures, multiple disk drives are attached to each I/O bus. Under I/O-intensive workloads, the disk latency for a request can be overlapped with the disk latency and data transfers of requests to other disks, potentially resulting in an aggregate I/O throughput at nearly bus bandwidth. This paper reports on a performance impairment that results from a previously unknown form of convoy behavior in disk I/O, which we call rounds. In rounds, independent requests to distinct disks convoy, so that each disk services one request before any disk services its next request. We analyze log files to describe read performance of multiple Seagate Wren-7 disks that share a SCSI bus under a heavy workload, demonstrating the rounds behavior and quantifying its performance impact.} } @InProceedings{bester:gass, author = {Joseph Bester and Ian Foster and Carl Kesselman and Jean Tedesco and Steven Tuecke}, title = {{GASS}: A Data Movement and Access Service for Wide Area Computing Systems}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {78--88}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Tedesco.ps}, keyword = {wide-area network, parallel I/O, pario-bib}, abstract = {In wide area computing, programs frequently execute at sites that are distant from their data. Data access mechanisms are required that place limited functionality demands on an application or host system yet permit high-performance implementations. To address these requirements, we propose a data movement and access service called Global Access to Secondary Storage (GASS). This service defines a global name space via Uniform Resource Locators and allows applications to access remote files via standard I/O interfaces. High performance is achieved by incorporating default data movement strategies that are specialized for I/O patterns common in wide area applications and by providing support for programmer management of data movement. GASS forms part of the Globus toolkit, a set of services for high-performance distributed computing. GASS itself makes use of Globus services for security and communication, and other Globus components use GASS services for executable staging and real-time remote monitoring. Application experiences demonstrate that the library has practical utility.} } @InProceedings{burns:delta, author = {Randal C. Burns and Darrell D. E. Long}, title = {Efficient Distributed Backup with Delta Compression}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {26--36}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {distributed file system, file system backup, tertiary storage, compression}, abstract = {Inexpensive storage and more powerful processors have resulted in a proliferation of data that needs to be reliably backed up. 
Network resource limitations make it increasingly difficult to back up a distributed file system on a nightly or even weekly basis. By using delta compression algorithms, which minimally encode a version of a file using only the bytes that have changed, a backup system can compress the data sent to a server. With the delta backup technique, we can achieve significant savings in network transmission time over previous techniques. \par Our measurements indicate that file system data may, on average, be compressed to within 10\% of its original size with this method and that approximately 45\% of all changed files have also been backed up in the previous week. Based on our measurements, we conclude that a small file store on the client that contains copies of previously backed-up files can be used to retain versions in order to generate delta files. \par To reduce the load on the backup server, we implement a modified version storage architecture, {\em version jumping}, that allows us to restore delta-encoded file versions with at most two accesses to tertiary storage. This minimizes server workload and network transmission time on file restore.}, comment = {Use delta compression to save a set of differences between versions rather than to back up a file system by copying whole files. They need to keep a reference version (a complete file), and the deltas. Since deltas are computed off of intermediate versions, they depend on each other; to recover a version, you need to rebuild all intermediate versions from the base file. They propose version jumping, by computing deltas not from the previous version but from a common earlier version. Thus, you can reconstruct a version from one of the reference versions, and one delta. That's good when you need to get these off of tape!} } @InProceedings{chang:reuse, author = {Tai-Sheng Chang and Sangyup Shim and David H.~C. Du}, title = {The Scalability of Spatial Reuse Based Serial Storage Interfaces}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {93--101}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {I/O interface, I/O network, I/O architecture, parallel I/O, pario-bib}, abstract = {Due to the growing popularity of emerging applications such as digital libraries, Video-On-Demand, distance learning, and the Internet World-Wide Web, multimedia servers with a large capacity and high performance storage subsystem are in high demand. Serial storage interfaces are emerging technologies designed to improve the performance of such storage subsystems. They provide high bandwidth, fault tolerance, fair bandwidth sharing and long distance connection capability. All of these issues are critical in designing a scalable and high performance storage subsystem. Some of the serial storage interfaces provide the spatial reuse feature which allows multiple concurrent transmissions. That is, multiple hosts can access disks concurrently with full link bandwidth if their access paths are disjoint. Spatial reuse provides a way to build a storage subsystem whose aggregate bandwidth may be scaled up with the number of hosts. However, it is not clear how much the performance of a storage subsystem could be improved by the spatial reuse with different configurations and traffic scenarios. Both the limitations and the capabilities of this scalability need to be investigated.
To understand their fundamental performance characteristics, we derive an analytic model for the serial storage interfaces with the spatial reuse feature. Based on this model, we investigate the maximum aggregate throughput from different system configurations and load distributions. We show how the number of disks needed to saturate a loop varies with different numbers of hosts and different load scenarios. We also show how load balancing by uniformly distributing the load to all the disks on a loop may incur high overhead. This is because the accesses to far-away disks need to go through many links and consume the bandwidth of each link they go through. The results show the achievable throughput may be reduced by more than half in some cases.} } @InProceedings{chen:panda, author = {Y. Chen and M. Winslett and K. E. Seamons and S. Kuo and Y. Cho and M. Subramaniam}, title = {Scalable Message Passing in {Panda}}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {109--121}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, parallel file system, pario-bib}, abstract = {To provide high performance for applications with a wide variety of i/o requirements and to support many different parallel platforms, the design of a parallel i/o system must provide for efficient utilization of available bandwidth both for disk traffic and for message passing. In this paper we discuss the message-passing scalability of the server-directed i/o architecture of Panda, a library for synchronized i/o of multidimensional arrays on parallel platforms. We show how to improve i/o performance in situations where message-passing is a bottleneck, by combining the server-directed i/o strategy for highly efficient use of available disk bandwidth with new mechanisms to minimize internal communication and computation overhead in Panda. We present experimental results that show that with these improvements, Panda will provide high i/o performance for a wider range of applications, such as applications running with slow interconnects, applications performing i/o operations on large numbers of arrays, or applications that require drastic data rearrangements as data are moved between memory and disk (e.g., array transposition). We also argue that in the future, the improved approach to message-passing will allow Panda to support applications that are not closely synchronized or that run in heterogeneous environments.}, comment = {See seamons:panda. This paper goes further with some communication improvements.} } @InProceedings{cho:local, author = {Yong Cho and Marianne Winslett and Mahesh Subramaniam and Ying Chen and Szu-wen Kuo and Kent E. Seamons}, title = {Exploiting Local Data in Parallel Array {I/O} on a Practical Network of Workstations}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {1--13}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {parallel I/O, distributed system, pario-bib}, abstract = {A cost-effective way to run a parallel application is to use existing workstations connected by a local area network such as Ethernet or FDDI. In this paper, we present an approach for parallel I/O of multidimensional arrays on small networks of workstations with a shared-media interconnect, using the Panda I/O library.
\par In such an environment, the message passing throughput per node is lower than the throughput obtainable from a fast disk and it is not easy for users to determine the configuration which will yield the best I/O performance. \par We introduce an I/O strategy that exploits local data to reduce the amount of data that must be shipped across the network, present experimental results, and analyze the results using an analytical performance model and predict the best choice of I/O parameters. \par Our experiments show that the new strategy results in a factor of 1.2--2.1 speedup in response time compared to the Panda version originally developed for the IBM SP2, depending on the array sizes, distributions and compute and I/O node meshes. Further, the performance model predicts the results within a 13\% margin of error.}, comment = {They examine a system that supports nodes that are both compute and I/O nodes. The assumption is that the application is writing data to a new file, and does not care to which disks the data goes. They are trying to decide which nodes should be used for I/O, given the distribution of data on compute nodes and the distribution desired across disks. They use a Hungarian algorithm to solve a weighted optimization problem on a bipartite graph connecting I/O nodes to compute nodes, in an attempt to minimize the data flow across the network. But there is no attempt to make a decision that might be sensible for a future read operation that may want to read in a different pattern.} } @InProceedings{cormen:fft3, author = {Thomas H. Cormen and Jake Wegmann and David M. Nicol}, title = {Multiprocessor Out-of-Core {FFTs} with Distributed Memory and Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {68--78}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {out of core, parallel I/O, pario-bib}, abstract = {This paper extends an earlier out-of-core Fast Fourier Transform (FFT) method for a uniprocessor with the Parallel Disk Model (PDM) to use multiple processors. Four out-of-core multiprocessor methods are examined. Operationally, these methods differ in the size of ``mini-butterfly'' computed in memory and how the data are organized on the disks and in the distributed memory of the multiprocessor. The methods also perform differing amounts of I/O and communication. Two of them have the remarkable property that even though they are computing the FFT on a multiprocessor, all interprocessor communication occurs outside the mini-butterfly computations; communication that ordinarily occurs in a butterfly is folded into other data-movement operations. An analysis program shows that the two methods that use no butterfly communication usually use less communication overall than the other methods. The analysis program is fast enough that it can be invoked at run time to determine which of the four methods uses the least communication. One set of performance results on a small workstation cluster indicates that the methods without butterfly communication are approximately 9.5\% faster. Moreover, they are much easier to implement.}, comment = {They find a way to move the interprocessor communication involved in the out-of-core FFT into a single BMMC permutation between ``super-levels'', where each super-level involves log(M) stages of the FFT. This usually leads to less communication and to better overall performance.
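(A rough sketch of the arithmetic, mine rather than the paper's: an $N$-point FFT has $\lg N$ butterfly stages, so if each super-level performs $\lg M$ of them in memory, only $\lceil \lg N / \lg M \rceil$ super-levels, and hence that many inter-super-level BMMC permutations, are needed; for $N = 2^{30}$ and $M = 2^{20}$, that is 2 permutations in place of communication at each of 30 stages.)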
See also cormen:fft and cormen:fft2.} } @InProceedings{cortes:cooperative, author = {Toni Cortes and Sergi Girona and Jes\'us Labarta}, title = {Design Issues of a Cooperative Cache with no Coherence Problems}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {37--46}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {cooperative caching, distributed file system, parallel I/O, pario-bib}, abstract = {In this paper, we examine some of the important problems observed in the design of cooperative caches. Solutions to the coherence, load-balancing and fault-tolerance problems are presented. These solutions have been implemented as a part of PAFS, a parallel/distributed file system, and its performance has been compared to that achieved by xFS. Using the comparison results, we have observed that the proposed ideas not only solve the main problems of cooperative caches, but also increase the overall system performance. Although the solutions presented in this paper were targeted to a parallel machine, reasonably good results have also been obtained for networks of workstations.}, comment = {They make the claim that it is better not to replicate data into local client caches; rather, it is better to simply make remote read and write requests to the cached block in whatever memory it may be. That reduces the overhead (space and time) of replication and coherency, and leads to better performance. They also present a range of parity-based fault-tolerance mechanisms, and a load-balancing technique that reassigns cache buffers to cache-manager processes.} } @InProceedings{foster:remote-io, author = {Ian Foster and David {Kohr, Jr.} and Rakesh Krishnaiyer and Jace Mogill}, title = {Remote {I/O}: Fast Access to Distant Storage}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {14--25}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {parallel I/O, distributed file system, pario-bib}, abstract = {As high-speed networks make it easier to use distributed resources, it becomes increasingly common that applications and their data are not colocated. Users have traditionally addressed this problem by manually staging data to and from remote computers. We argue instead for a remote I/O paradigm in which programs use familiar parallel I/O interfaces to access remote filesystems. In addition to simplifying remote execution, remote I/O can improve performance relative to staging by overlapping computation and data transfer or by reducing communication requirements. However, remote I/O also introduces new technical challenges in the areas of portability, performance, and integration with distributed computing systems. We propose techniques designed to address these challenges and describe a remote I/O library called RIO that we are developing to evaluate the effectiveness of these techniques. RIO addresses issues of portability by adopting the quasi-standard MPI-IO interface and by defining a RIO device and RIO server within the ADIO abstract I/O device architecture. It addresses performance issues by providing traditional I/O optimizations such as asynchronous operations and through implementation techniques such as buffering and message forwarding to offload communication overheads.
Microbenchmarks and application experiments demonstrate that our techniques can improve turnaround time relative to staging.}, comment = {They want to support users that have datasets at different locations on the Internet, but need to access the data on parallel supercomputers. Rather than staging data in and out, they want to provide remote access. Issues: naming, dynamic loads, heterogeneity, security, fault-tolerance. All traffic goes through a ``forwarder node'' that funnels all the traffic into the network. They use URLs for pathnames (e.g., ``x-rio://...''). They find that non-blocking ops are important, as is collective I/O. They think that buffering will be important. Limited experiments.} } @InProceedings{kallahalla:read-once, author = {Mahesh Kallahalla and Peter J. Varman}, title = {Optimal Read-Once Parallel Disk Scheduling}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {68--77}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Kallahalla.ps}, keyword = {disk scheduling, parallel I/O, pario-bib}, abstract = {We present an optimal algorithm, L-OPT, for prefetching and I/O scheduling in parallel I/O systems using a read-once model of block reference. The algorithm uses knowledge of the next L block references, L-block lookahead, to schedule I/Os in an on-line manner. It uses a dynamic priority assignment scheme to decide when blocks should be prefetched, so as to minimize the total number of I/Os. The parallel disk model of an I/O system is used to study the performance of L-OPT. We show that L-OPT is comparable to the best on-line algorithm with the same amount of lookahead; the ratio of the length of its schedule to the length of the optimal schedule is within a constant factor of the best possible. Specifically, we show that the competitive ratio of L-OPT is $\Theta(\sqrt{MD/L})$, which matches the lower bound on the competitive ratio of any prefetching algorithm with L-block lookahead. In addition we show that when the lookahead consists of the entire reference string, L-OPT performs the minimum possible number of I/Os; hence L-OPT is the optimal off-line algorithm. Finally, using synthetic traces we empirically study the performance characteristics of L-OPT.} } @InProceedings{kandemir:locality, author = {M. Kandemir and A. Choudhary and J. Ramanujam and M. Kandaswamy}, title = {A Unified Compiler Algorithm for Optimizing Locality, Parallelism, and Communication in Out-of-Core Computations}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {79--92}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {compiler, out of core, parallel I/O, pario-bib}, abstract = {This paper presents compiler algorithms to optimize out-of-core programs. These algorithms consider loop and data layout transformations in a unified framework. The performance of an out-of-core loop nest containing many references can be improved by a combination of restructuring the loops and file layouts. This approach considers array references one-by-one and attempts to optimize each reference for parallelism and locality. When there are references for which parallelism optimizations do not work, communication is vectorized so that data transfer can be performed before the innermost tiling loop.
Preliminary results from hand-compiles on IBM SP-2 and Intel Paragon show that this approach reduces the execution time, improves the bandwidth speedup and overall speedup. In addition, we extend the base algorithm to work with file layout constraints and show how it can be used for optimizing programs consisting of multiple loop nests.} } @InProceedings{kotz:explore, author = {David Kotz and Ting Cai}, title = {Exploring the use of {I/O} Nodes for Computation in a {MIMD} Multiprocessor}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {78--89}, earlier = {kotz:explore-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/kotz:explore.ps.Z}, keyword = {parallel I/O, multiprocessor file system, dfk, pario-bib}, abstract = {As parallel systems move into the production scientific-computing world, the emphasis will be on cost-effective solutions that provide high throughput for a mix of applications. Cost-effective solutions demand that a system make effective use of all of its resources. Many MIMD multiprocessors today, however, distinguish between ``compute'' and ``I/O'' nodes, the latter having attached disks and being dedicated to running the file-system server. This static division of responsibilities simplifies system management but does not necessarily lead to the best performance in workloads that need a different balance of computation and I/O. \par Of course, computational processes sharing a node with a file-system service may receive less CPU time, network bandwidth, and memory bandwidth than they would on a computation-only node. In this paper we begin to examine this issue experimentally. We found that high-performance I/O does not necessarily require substantial CPU time, leaving plenty of time for application computation. There were some complex file-system requests, however, which left little CPU time available to the application. (The impact on network and memory bandwidth still needs to be determined.) For applications (or users) that cannot tolerate an occasional interruption, we recommend that they continue to use only compute nodes. For tolerant applications needing more cycles than those provided by the compute nodes, we recommend that they take full advantage of {\em both\/} compute and I/O nodes for computation, and that operating systems should make this possible.} } @InProceedings{krieger:hfs2, author = {Orran Krieger and Michael Stumm}, title = {{HFS}: A Performance-Oriented Flexible File System Based on Building-Block Compositions}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {95--108}, publisher = {ACM Press}, address = {Philadelphia}, earlier = {krieger:hfs}, later = {krieger:hfs3}, keyword = {parallel I/O, parallel file system, object-oriented, pario-bib}, abstract = {The Hurricane File System (HFS) is designed for (potentially large-scale) shared memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies and I/O interfaces. Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. 
For example, a file's structure can be optimized for concurrent random-access write-only operations by ten processes. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most existing parallel file systems support a single file structure and a small set of policies. \par We have implemented HFS as part of the Hurricane operating system running on the Hector shared memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. We also show that for a number of file access patterns HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.}, comment = {A published form of krieger:hfs and the thesis krieger:thesis. Their main point is that the file system is constructed from building-block objects. When you create a file you choose a few building blocks, for example, a replication block that mirrors the file, and some distribution blocks that distribute each replica across a set of disks. When you open the file you plug in some more building blocks, e.g., to do prefetching or to provide the kind of interface that you want to use. They point out that this flexibility is critical to be able to get good performance, because different file-access patterns need different structures and policies. They found that mapped files minimize copying costs and improve performance. They were able to obtain full disk bandwidth. Great paper.} } @InProceedings{kuo:efficient, author = {S. Kuo and M. Winslett and Y. Cho and J. Lee and Y. Chen}, title = {Efficient Input and Output for Scientific Simulations}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {33--44}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Kuo.ps}, keyword = {scientific computing, simulation, parallel I/O, pario-bib}, abstract = {Large simulations which run for hundreds of hours on parallel computers often periodically generate snapshots of states, which are later post-processed to visualize the simulated physical phenomenon. For many applications, fast I/O during post-processing, which is dependent on an efficient organization of data on disk, is as important as minimizing computation-time I/O. In this paper we propose optimizations to support efficient parallel I/O for scientific simulations and subsequent visualizations. We present an ordering mechanism to linearize data on disk, a performance model to help to choose a proper stripe unit size, and a scheduling algorithm to minimize communication contention. Our experiments on an IBM SP show that the combination of these strategies provides a 20-25\% performance boost.} } @InProceedings{mache:spatial, author = {Jens Mache and Virginia Lo and Marilynn Livingston and Sharad Garg}, title = {The Impact of Spatial Layout of Jobs on Parallel {I/O} Performance}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {45--56}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Mache.ps}, keyword = {parallel I/O, pario-bib}, abstract = {Input/Output is a big obstacle to effective use of teraflops-scale computing systems. 
Motivated by earlier parallel I/O measurements on an Intel TFLOPS machine, we conduct studies to determine the sensitivity of parallel I/O performance on multi-programmed mesh-connected machines with respect to number of I/O nodes, number of compute nodes, network link bandwidth, I/O node bandwidth, spatial layout of jobs, and read or write demands of applications. \par Our extensive simulations and analytical modeling yield important insights into the limitations on parallel I/O performance due to network contention, and into the possible gains in parallel I/O performance that can be achieved by tuning the spatial layout of jobs. \par Applying these results, we devise a new processor allocation strategy that is sensitive to parallel I/O traffic and the resulting network contention. In performance evaluations driven by synthetic workloads and by a real workload trace captured at the San Diego Supercomputing Center, the new strategy improves the average response time of parallel I/O intensive jobs by up to a factor of 4.5.} } @InProceedings{madhyastha:classification, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Input/Output Access Pattern Classification Using Hidden {Markov} Models}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {57--67}, publisher = {ACM Press}, address = {San Jose, CA}, later = {madhyastha:thesis}, keyword = {workload characterization, file access pattern, parallel I/O, pario-bib}, abstract = {Input/output performance on current parallel file systems is sensitive to a good match of application access pattern to file system capabilities. Automatic input/output access classification can determine application access patterns at execution time, guiding adaptive file system policies. In this paper we examine a new method for access pattern classification that uses hidden Markov models, trained on access patterns from previous executions, to create a probabilistic model of input/output accesses. We compare this approach to a neural network classification framework, presenting performance results from parallel and sequential benchmarks and applications.}, comment = {The most interesting thing in this paper is the use of a Hidden Markov Model to understand the access pattern of an application to a file. After running the application on the file once, and simultaneously training their HMM, they use the result to tune the system for the next execution (cache size, cache partitioning, prefetching, Intel file mode, etc). They get much better performance in future runs. See also madhyastha:thesis, and related papers.} } @InProceedings{moore:detection, author = {Jason A. Moore and Philip J. Hatcher and Michael J. Quinn}, title = {Efficient Data-Parallel Files via Automatic Mode Detection}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {1--14}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, data parallelism, pario-bib}, abstract = {Parallel languages rarely specify parallel I/O constructs, and existing commercial systems provide the programmer with a low-level I/O interface. We present design principles for integrating I/O into languages and show how these principles are applied to a virtual-processor-oriented language. We illustrate how machine-independent modes are used to support both high performance and generality. 
We describe an automatic mode detection technique that saves the programmer from extra syntax and low-level file system details. We show how virtual processor file operations, typically small by themselves, are combined into efficient large-scale file system calls. Finally, we present a variety of benchmark results detailing design tradeoffs and the performance of various modes.}, comment = {Updated version of TR 95-80-9. See moore:stream. Interesting approach, where they permit a fairly normal fread and fwrite kind of interface, with each VP having its own stream. They choose their own format for the file, and switch between formats (and internal buffering) depending on the particulars of the fread and fwrite parameters. They seem to have good performance, and a familiar interface. They are left with a non-standard file format.} } @InProceedings{nieuwejaar:galley-perf, author = {Nils Nieuwejaar and David Kotz}, title = {Performance of the {Galley} Parallel File System}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {83--94}, publisher = {ACM Press}, address = {Philadelphia}, later = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:galley-perf.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. This interface conceals the parallelism within the file system, which increases the ease of programmability, but makes it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. Furthermore, most current parallel file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic parallel workloads. Initial experiments, reported in this paper, indicate that Galley is capable of providing high-performance I/O to applications that access data in patterns that have been observed to be common.}, comment = {See also nieuwejaar:galley.} } @InProceedings{prabhakar:browsing, author = {Sunil Prabhakar and Divyakant Agrawal and Amr {El Abbadi} and Ambuj Singh and Terence Smith}, title = {Browsing and Placement of Multiresolution Images on Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {102--113}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {multimedia, parallel I/O, pario-bib}, abstract = {With rapid advances in computer and communication technologies, there is an increasing demand to build and maintain large image repositories. In order to reduce the demands on I/O and network resources, multiresolution representations are being proposed for the storage organization of images. Image decomposition techniques such as {\em wavelets} can be used to provide these multiresolution images. The original image is represented by several coefficients, one of them with visual similarity to the original image, but at a lower resolution. 
These visually similar coefficients can be thought of as {\em thumbnails} or {\em icons} of the original image. This paper addresses the problem of storing these multiresolution coefficients on disks so that thumbnail browsing as well as image reconstruction can be performed efficiently. Several strategies are evaluated to store the image coefficients on parallel disks. These strategies can be classified into two broad classes depending on whether the access pattern of the images is used in the placement. Disk simulation is used to evaluate the performance of these strategies. Simulation results are validated with results from experiments with real disks and are found to be in good agreement. The results indicate that significant performance improvements can be achieved with as few as four disks by placing image coefficients based upon browsing access patterns.}, comment = {They use simulation to study several different placement policies for the thumbnail and varying-resolution versions of images on a disk array.} } @InProceedings{schwabe:layouts, author = {Eric J. Schwabe and Ian M. Sutherland and Bruce K. Holmer}, title = {Evaluating Approximately Balanced Parity-Declustered Data Layouts for Disk Arrays}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {41--54}, publisher = {ACM Press}, address = {Philadelphia}, later = {schwabe:jlayouts}, keyword = {parallel I/O, disk array, parity, RAID, pario-bib}, abstract = {Parity declustering has been used to reduce the time required to reconstruct a failed disk in a disk array. Most existing work on parity declustering uses BIBD-based data layouts, which distribute the workload of reconstructing a failed disk over the remaining disks of the array with perfect balance. For certain array sizes, however, there is no known BIBD-based layout. In this paper, we evaluate data layouts that are approximately balanced --- that is, that distribute the reconstruction workload over the disks of the array with only approximate balance. Approximately balanced layouts are considerably easier to construct than perfectly balanced layouts. We consider three methods for generating approximately balanced layouts: randomization, simulated annealing, and perturbing a BIBD-based layout whose size is near the desired size. We compare the performance of these approximately balanced layouts with that of perfectly balanced layouts using a disk array simulator. We conclude that, on uniform workloads, approximately balanced data layouts have performance nearly identical to that of perfectly balanced layouts. Approximately balanced layouts therefore provide the reconstruction performance benefits of perfectly balanced layouts for arrays where perfectly balanced layouts are either not known, or do not exist.} } @InProceedings{soloviev:prefetching, author = {Valery V. Soloviev}, title = {Prefetching in Segmented Disk Cache for Multi-Disk Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {69--82}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, prefetching, disk cache, disk array, pario-bib}, abstract = {This paper investigates the performance of a multi-disk storage system equipped with a segmented disk cache processing a workload of multiple relational scans. Prefetching is a popular method of improving the performance of scans. 
Many modern disks have a multisegment cache which can be used for prefetching. We observe that, exploiting declustering as a data placement method, prefetching in a segmented cache causes a load imbalance among several disks. A single disk becomes a bottleneck, degrading performance of the entire system. A variation in disk queue length is a primary factor of the imbalance. Using a precise simulation model, we investigate several approaches to achieving better balancing. Our metrics are a scan response time for the closed-end system and an ability to sustain a workload without saturating for the open-end system. We arrive at two main conclusions: (1) Prefetching in main memory is inexpensive and effective for balancing and can supplement or substitute prefetching in disk cache. (2) Disk-level prefetching provides about the same performance as main memory prefetching if request queues are managed in the disk controllers rather than in the host. Checking the disk cache before queuing requests provides not only better request response time but also drastically improves balancing. A single cache performs better than a segmented cache for this method.}, comment = {An interesting paper about disk-controller cache management in database workloads. Actually, the workloads are sequential scans of partitioned files, which could occur in many kinds of workloads. The declustering pattern (partitioning) is a little unusual for most scientific parallel I/O veterans, who are used to striping. And the cache-management algorithms seem a bit strange, particularly the fact that the cache appears to be used only for explicit prefetch requests. Turns out that it is best to put the prefetching and disk queueing in the same place, either on the controller or in main memory, to avoid load imbalance that arises from randomness in the workload, which is accentuated into a big bottleneck and a convoy effect.} } @InProceedings{thakur:mpi-io-implement, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {On Implementing {MPI-IO} Portably and with High Performance}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {23--32}, publisher = {ACM Press}, address = {Atlanta, GA}, earlier = {thakur:mpi-io-implement-tr}, URL = {http://www.mcs.anl.gov/~thakur/papers/mpio-impl.ps.gz}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {We discuss the issues involved in implementing MPI-IO portably on multiple machines and file systems and also achieving high performance. One way to implement MPI-IO portably is to implement it on top of the basic Unix I/O functions ({\tt open}, {\tt lseek}, {\tt read}, {\tt write}, and {\tt close}), which are themselves portable. We argue that this approach has limitations in both functionality and performance. We instead advocate an implementation approach that combines a large portion of portable code and a small portion of code that is optimized separately for different machines and file systems. We have used such an approach to develop a high-performance, portable MPI-IO implementation, called ROMIO. \par In addition to basic I/O functionality, we consider the issues of supporting other MPI-IO features, such as 64-bit file sizes, noncontiguous accesses, collective I/O, asynchronous I/O, consistency and atomicity semantics, user-supplied hints, shared file pointers, portable data representation, and file preallocation.
We describe how we implemented each of these features on various machines and file systems. The machines we consider are the HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, SGI Origin2000, and networks of workstations; and the file systems we consider are HP HFS, IBM PIOFS, Intel PFS, NEC SFS, SGI XFS, NFS, and any general Unix file system (UFS). \par We also present our thoughts on how a file system can be designed to better support MPI-IO. We provide a list of features desired from a file system that would help in implementing MPI-IO correctly and with high performance.} } @InProceedings{toledo:solar, author = {Sivan Toledo and Fred G. Gustavson}, title = {The Design and Implementation of {SOLAR}, a Portable Library for Scalable Out-of-Core Linear Algebra Computations}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {28--40}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, out-of-core, linear algebra, pario-bib}, abstract = {SOLAR is a portable high-performance library for out-of-core dense matrix computations. It combines portability with high performance by using existing high-performance in-core subroutine libraries and by using an optimized matrix input-output library. SOLAR works on parallel computers, workstations, and personal computers. It supports in-core computations on both shared-memory and distributed-memory machines, and its matrix input-output library supports both conventional I/O interfaces and parallel I/O interfaces. This paper discusses the overall design of SOLAR, its interfaces, and the design of several important subroutines. Experimental results show that SOLAR can factor on a single workstation an out-of-core positive-definite symmetric matrix at a rate exceeding 215 Mflops, and an out-of-core general matrix at a rate exceeding 195 Mflops. Less than 16\% of the running time is spent on I/O in these computations. These results indicate that SOLAR's portability does not compromise its performance. We expect that the combination of portability, modularity, and the use of a high-level I/O interface will make the library an important platform for research on out-of-core algorithms and on parallel I/O.}, comment = {Sounds great. Library package that supports LAPACK-like functionality on in-core and out-of-core matrices. Good performance. Good portability (IBM workstation, IBM SP-2, and OS/2 laptop). They separate the matrix algorithms from the underlying I/O routines in an interesting way (read and write submatrices), leaving just enough information to allow the I/O system to do some higher-level optimizations.} } @InProceedings{weissman:smart, author = {Jon B. Weissman}, title = {Smart File Objects: A Remote File Access Paradigm}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {89--97}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Weissman.ps}, keyword = {object, parallel I/O, pario-bib}, abstract = {This paper describes a new scheme for remote file access called Smart File Objects (SFO). The SFO is an object-oriented application-specific file access paradigm designed to attack the bottleneck imposed by high latency, low bandwidth networks such as wide-area and wireless networks. 
The SFO uses application and network information to adaptively prefetch needed data in parallel with the execution of the application. The SFO can offer additional advantages such as non-blocking I/O, bulk I/O, improved file access APIs, and increased reliability. We describe the SFO concept, a prototype implementation in the Mentat system, and the results obtained with a distributed gene sequence application running across the Internet and vBNS. The results show the potential of the SFO approach to improve application performance.} } @InProceedings{wisniewski:in-place, author = {Leonard F. Wisniewski}, title = {Structured Permuting in Place on Parallel Disk Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {128--139}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, parallel I/O algorithm, permutation, out-of-core, pario-bib}, abstract = {The ability to perform permutations of large data sets in place reduces the amount of necessary available disk storage. The simplest way to perform a permutation often is to read the records of a data set from a source portion of data storage, permute them in memory, and write them to a separate target portion of the same size. It can be quite expensive, however, to provide disk storage that is twice the size of very large data sets. Permuting in place reduces the expense by using only a small amount of extra disk storage beyond the size of the data set. \par \newcommand{\ceil}[1]{\lceil #1\rceil} \newcommand{\rank}[1]{\mathop{\rm rank}\nolimits #1} This paper features in-place algorithms for commonly used structured permutations. We have developed an asymptotically optimal algorithm for performing BMMC (bit-matrix-multiply/complement) permutations in place that requires at most $\frac{2N}{BD}\left( 2\ceil{\frac{\rank{\gamma}}{\lg (M/B)}} + \frac{7}{2}\right)$ parallel disk accesses, as long as $M \geq 2BD$, where $N$ is the number of records in the data set, $M$ is the number of records that can fit in memory, $D$ is the number of disks, $B$ is the number of records in a block, and $\gamma$ is the lower left $\lg (N/B) \times \lg B$ submatrix of the characteristic matrix for the permutation. This algorithm uses $N+M$ records of disk storage and requires only a constant factor more parallel disk accesses and insignificant additional computation than a previously published asymptotically optimal algorithm that uses $2N$ records of disk storage. \par We also give algorithms to perform mesh and torus permutations on a $d$-dimensional mesh. The in-place algorithm for mesh permutations requires at most $3\ceil{N/BD}$ parallel I/Os and the in-place algorithm for torus permutations uses at most $4dN/BD$ parallel I/Os. The algorithms for mesh and torus permutations require no extra disk space as long as the memory size $M$ is at least $3BD$. The torus algorithm improves upon the previous best algorithm in terms of both time and space.} } @InProceedings{zhou:threads, author = {Yuanyuan Zhou and Limin Wang and Douglas W. 
Clark and Kai Li}, title = {Thread Scheduling for Out-of-Core Applications with Memory Server on Multicomputers}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {57--67}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Zhou.ps}, keyword = {threads, scheduling, memory, out-of-core application, parallel I/O, pario-bib}, abstract = {Out-of-core applications perform poorly in paged virtual memory (VM) systems because demand paging involves slow disk I/O accesses. Much research has been done on reducing the I/O overhead in such applications by either reducing the number of I/Os or lowering the cost of each I/O operation. In this paper, we investigate a method that combines fine-grained threading with a memory server model to improve the performance of out-of-core applications on multicomputers. The memory server model decreases the average cost of I/O operations by paging to remote memory, while the fine-grained thread scheduling reduces the number of I/O accesses by improving the data locality of applications. We have evaluated this method on an Intel Paragon with 7 applications. Our results show that the memory server system performs better than the VM disk paging by a factor of 5 for sequential applications and a factor of 1.5 to 2.2 for parallel applications. The fine-grained threading alone improves the VM disk paging performance by a factor of 10 and 1.2 to 3 respectively for sequential and parallel applications. Overall, the combination of these two techniques outperforms the VM disk paging by more than a factor of 12 for sequential applications and a factor of 3 to 6 for parallel applications.} }
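% The zhou:threads entry above combines fine-grained threading with a memory
% server that pages to remote memory before falling back to disk. A minimal,
% self-contained C sketch of that fault path (my illustration, not code from
% the paper; memserver_fetch and disk_read are hypothetical stubs):
%
%   #include <stdlib.h>
%
%   /* Stub: try to fetch the page from a remote memory server; 0 on hit. */
%   static int memserver_fetch(unsigned long page, void *frame) {
%       (void)page; (void)frame;
%       return -1; /* pretend the remote memory pool missed */
%   }
%
%   /* Stub: read the page from the local paging disk (the slow path). */
%   static void disk_read(unsigned long page, void *frame) {
%       (void)page; (void)frame;
%   }
%
%   /* Resolve a page fault: a remote-memory hit avoids disk I/O entirely,
%      which is the source of the speedups the abstract reports over plain
%      VM disk paging. */
%   void *resolve_fault(unsigned long page, size_t page_size) {
%       void *frame = malloc(page_size);
%       if (frame != NULL && memserver_fetch(page, frame) != 0)
%           disk_read(page, frame); /* miss: fall back to VM disk paging */
%       return frame;
%   }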