BibTeX for papers by David Kotz; for complete/updated list see https://www.cs.dartmouth.edu/~kotz/research/papers.html

@InCollection{kotz:bdiskdir,
  author = {David Kotz},
  title = {{Disk-directed I/O for MIMD Multiprocessors}},
  booktitle = {{High Performance Mass Storage and Parallel I/O: Technologies and Applications}},
  editor = {Hai Jin and Toni Cortes and Rajkumar Buyya},
  year = 2001,
  month = {September},
  chapter = 35,
  pages = {513--535},
  publisher = {Wiley-IEEE Press},
  copyright = {Wiley-IEEE Press},
  ISBN13 = {978-0-471-20809-9},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-bdiskdir/index.html},
  abstract = {Many scientific applications that run on today's multiprocessors, such as weather forecasting and seismic analysis, are bottlenecked by their file-I/O needs. Even if the multiprocessor is configured with sufficient I/O hardware, the file-system software often fails to provide the available bandwidth to the application. Although libraries and enhanced file-system interfaces can make a significant improvement, we believe that fundamental changes are needed in the file-server software. We propose a new technique, disk-directed I/O, to allow the disk servers to determine the flow of data for maximum performance. Our simulations show that tremendous performance gains are possible both for simple reads and writes and for an out-of-core application. Indeed, our disk-directed I/O technique provided consistent high performance that was largely independent of data distribution, obtained up to 93\% of peak disk bandwidth, and was as much as 18 times faster than the traditional technique.},
}

@Article{kotz:jdiskdir,
  author = {David Kotz},
  title = {{Disk-directed I/O for MIMD Multiprocessors}},
  journal = {ACM Transactions on Computer Systems},
  year = 1997,
  month = {February},
  volume = 15,
  number = 1,
  pages = {41--74},
  publisher = {ACM},
  copyright = {ACM},
  DOI = {10.1145/244764.244766},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-jdiskdir/index.html},
  abstract = {Many scientific applications that run on today's multiprocessors, such as weather forecasting and seismic analysis, are bottlenecked by their file-I/O needs. Even if the multiprocessor is configured with sufficient I/O hardware, the file-system software often fails to provide the available bandwidth to the application. Although libraries and enhanced file-system interfaces can make a significant improvement, we believe that fundamental changes are needed in the file-server software. We propose a new technique, disk-directed I/O, to allow the disk servers to determine the flow of data for maximum performance. Our simulations show that tremendous performance gains are possible both for simple reads and writes and for an out-of-core application.
  Indeed, our disk-directed I/O technique provided consistent high performance that was largely independent of data distribution, obtained up to 93\% of peak disk bandwidth, and was as much as 18 times faster than the traditional technique.},
}

@InProceedings{ap:enwrich,
  author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz},
  title = {{ENWRICH: A Compute-Processor Write Caching Scheme for Parallel File Systems}},
  booktitle = {{Proceedings of the Workshop on Input/Output in Parallel and Distributed Systems (IOPADS)}},
  year = 1996,
  month = {May},
  pages = {55--68},
  publisher = {ACM},
  copyright = {ACM},
  address = {Philadelphia},
  DOI = {10.1145/236017.236034},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/ap-enwrich/index.html},
  abstract = {Many parallel scientific applications need high-performance I/O. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is \emph{collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed \emph{disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. ENWRICH combines low-overhead write caching at the compute processors with high-performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.},
}

@TechReport{kotz:tuning,
  author = {David Kotz},
  title = {{Tuning STARFISH}},
  institution = {Dartmouth Computer Science},
  year = 1996,
  month = {October},
  number = {PCS-TR96-296},
  copyright = {David Kotz},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-tuning/index.html},
  abstract = {STARFISH is a parallel file-system simulator we built for our research into the concept of disk-directed I/O. In this report, we detail steps taken to tune the file systems supported by STARFISH, which include a traditional parallel file system (with caching) and a disk-directed I/O system. In particular, we added support for two-phase I/O, adopted smarter disk scheduling, increased the maximum number of outstanding requests that a compute processor may make to each disk, and added gather/scatter block transfer.
  We also present results of the experiments driving the tuning effort.},
}

@Misc{kotz:starfish-sw,
  author = {David Kotz},
  title = {{STARFISH parallel file-system simulator}},
  howpublished = {The basis for my research on disk-directed I/O; used by at least two other research groups},
  year = 1996,
  month = {October},
  copyright = {the author},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-starfish-sw/index.html},
  note = {Third release},
  abstract = {STARFISH is a simulator for experimenting with concepts in parallel file systems. It is based on Eric Brewer's Proteus simulator from MIT, version 3.01, and runs only on (MIPS-based) DECstations. I have used this simulator in experiments for several research papers about disk-directed I/O.},
}

@TechReport{ap:enwrich-tr,
  author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz},
  title = {{ENWRICH: A Compute-Processor Write Caching Scheme for Parallel File Systems}},
  institution = {Dept. of Computer Science, Duke University},
  year = 1995,
  month = {October},
  number = {CS-1995-22},
  copyright = {the authors},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/ap-enwrich-tr/index.html},
  abstract = {Many parallel scientific applications need high-performance I/O. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is \emph{collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed \emph{disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. ENWRICH combines low-overhead write caching at the compute processors with high-performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests.
  We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.},
}

@TechReport{kotz:expand-tr,
  author = {David Kotz},
  title = {{Expanding the Potential for Disk-Directed I/O}},
  institution = {Dartmouth Computer Science},
  year = 1995,
  month = {March},
  number = {PCS-TR95-254},
  copyright = {David Kotz},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-expand-tr/index.html},
  abstract = {As parallel computers are increasingly used to run scientific applications with large data sets, and as processor speeds continue to increase, it becomes more important to provide fast, effective parallel file systems for data storage and for temporary files. In an earlier work we demonstrated that a technique we call disk-directed I/O has the potential to provide consistent high performance for large, collective, structured I/O requests. In this paper we expand on this potential by demonstrating the ability of a disk-directed I/O system to read irregular subsets of data from a file, and to filter and distribute incoming data according to data-dependent functions.},
}

@InProceedings{kotz:expand,
  author = {David Kotz},
  title = {{Expanding the Potential for Disk-Directed I/O}},
  booktitle = {{Proceedings of the IEEE Symposium on Parallel and Distributed Processing (SPDP)}},
  year = 1995,
  month = {October},
  pages = {490--495},
  publisher = {IEEE},
  copyright = {IEEE},
  address = {San Antonio, TX},
  DOI = {10.1109/SPDP.1995.530723},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-expand/index.html},
  abstract = {As parallel computers are increasingly used to run scientific applications with large data sets, and as processor speeds continue to increase, it becomes more important to provide fast, effective parallel file systems for data storage and for temporary files. In an earlier work we demonstrated that a technique we call disk-directed I/O has the potential to provide consistent high performance for large, collective, structured I/O requests. In this paper we expand on this potential by demonstrating the ability of a disk-directed I/O system to read irregular subsets of data from a file, and to filter and distribute incoming data according to data-dependent functions.},
}

@InProceedings{kotz:explore,
  author = {David Kotz and Ting Cai},
  title = {{Exploring the use of I/O Nodes for Computation in a MIMD Multiprocessor}},
  booktitle = {{Proceedings of the IPPS Workshop on Input/Output in Parallel and Distributed Systems (IOPADS)}},
  year = 1995,
  month = {April},
  pages = {78--89},
  publisher = {ACM},
  copyright = {the authors},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-explore/index.html},
  abstract = {As parallel systems move into the production scientific-computing world, the emphasis will be on cost-effective solutions that provide high throughput for a mix of applications. Cost-effective solutions demand that a system make effective use of all of its resources. Many MIMD multiprocessors today, however, distinguish between ``compute'' and ``I/O'' nodes, the latter having attached disks and being dedicated to running the file-system server. This static division of responsibilities simplifies system management but does not necessarily lead to the best performance in workloads that need a different balance of computation and I/O. \par Of course, computational processes sharing a node with a file-system service may receive less CPU time, network bandwidth, and memory bandwidth than they would on a computation-only node.
  In this paper we begin to examine this issue experimentally. We found that high-performance I/O does not necessarily require substantial CPU time, leaving plenty of time for application computation. There were some complex file-system requests, however, which left little CPU time available to the application. (The impact on network and memory bandwidth still needs to be determined.) For applications (or users) that cannot tolerate an occasional interruption, we recommend that they continue to use only compute nodes. For tolerant applications needing more cycles than those provided by the compute nodes, we recommend that they take full advantage of \emph{both} compute and I/O nodes for computation, and that operating systems should make this possible.},
}

@TechReport{kotz:int-ddio,
  author = {David Kotz},
  title = {{Interfaces for Disk-Directed I/O}},
  institution = {Dartmouth Computer Science},
  year = 1995,
  month = {September},
  number = {PCS-TR95-270},
  copyright = {David Kotz},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-int-ddio/index.html},
  abstract = {In other papers I propose the idea of disk-directed I/O for multiprocessor file systems. Those papers focus on the performance advantages and capabilities of disk-directed I/O, but say little about the application-programmer's interface or about the interface between the compute processors and I/O processors. In this short note I discuss the requirements for these interfaces, and look at many existing interfaces for parallel file systems. I conclude that many of the existing interfaces could be adapted for use in a disk-directed I/O system.},
}

@TechReport{kotz:lu-tr,
  author = {David Kotz},
  title = {{Disk-directed I/O for an Out-of-core Computation}},
  institution = {Dartmouth Computer Science},
  year = 1995,
  month = {January},
  number = {PCS-TR95-251},
  copyright = {David Kotz},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-lu-tr/index.html},
  abstract = {New file systems are critical to obtain good I/O performance on large multiprocessors. Several researchers have suggested the use of \emph{collective} file-system operations, in which all processes in an application cooperate in each I/O request. Others have suggested that the traditional low-level interface (\emph{read, write, seek}) be augmented with various higher-level requests (e.g., \emph{read matrix}), allowing the programmer to express a complex transfer in a single (perhaps collective) request. Collective, high-level requests permit techniques like \emph{two-phase I/O} and \emph{disk-directed I/O} to significantly improve performance over traditional file systems and interfaces. Neither of these techniques has been tested on anything other than simple benchmarks that read or write matrices. Many applications, however, intersperse computation and I/O to work with data sets that cannot fit in main memory. In this paper, we present the results of experiments with an ``out-of-core'' LU-decomposition program, comparing a traditional interface and file system with a system that has a high-level, collective interface and disk-directed I/O. We found that a collective interface was awkward in some places, and forced additional synchronization.
  Nonetheless, disk-directed I/O was able to obtain much better performance than the traditional system.},
}

@InProceedings{kotz:lu,
  author = {David Kotz},
  title = {{Disk-directed I/O for an Out-of-core Computation}},
  booktitle = {{Proceedings of the IEEE International Symposium on High Performance Distributed Computing (HPDC)}},
  year = 1995,
  month = {August},
  pages = {159--166},
  publisher = {IEEE},
  copyright = {IEEE},
  DOI = {10.1109/HPDC.1995.518706},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-lu/index.html},
  abstract = {New file systems are critical to obtain good I/O performance on large multiprocessors. Several researchers have suggested the use of \emph{collective} file-system operations, in which all processes in an application cooperate in each I/O request. Others have suggested that the traditional low-level interface (\emph{read, write, seek}) be augmented with various higher-level requests (e.g., \emph{read matrix}). Collective, high-level requests permit a technique called \emph{disk-directed I/O} to significantly improve performance over traditional file systems and interfaces, at least on simple I/O benchmarks. In this paper, we present the results of experiments with an ``out-of-core'' LU-decomposition program. Although its collective interface was awkward in some places, and forced additional synchronization, disk-directed I/O was able to obtain much better overall performance than the traditional system.},
}

@TechReport{kotz:diskdir-tr,
  author = {David Kotz},
  title = {{Disk-directed I/O for MIMD Multiprocessors}},
  institution = {Dartmouth Computer Science},
  year = 1994,
  month = {July},
  number = {PCS-TR94-226},
  copyright = {David Kotz},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-diskdir-tr/index.html},
  note = {Revised November 8, 1994},
  abstract = {Many scientific applications that run on today's multiprocessors are bottlenecked by their file I/O needs. Even if the multiprocessor is configured with sufficient I/O hardware, the file-system software often fails to provide the available bandwidth to the application. Although libraries and improved file-system interfaces can make a significant improvement, we believe that fundamental changes are needed in the file-server software. We propose a new technique, \emph{disk-directed I/O}, that flips the usual relationship between server and client to allow the disks (actually, disk servers) to determine the flow of data for maximum performance. Our simulations show that tremendous performance gains are possible. Indeed, disk-directed I/O provided consistent high performance that was largely independent of data distribution, and close to the maximum disk bandwidth.},
}

@InProceedings{kotz:diskdir,
  author = {David Kotz},
  title = {{Disk-directed I/O for MIMD Multiprocessors}},
  booktitle = {{Proceedings of the Symposium on Operating Systems Design and Implementation (OSDI)}},
  year = 1994,
  month = {November},
  pages = {61--74},
  publisher = {USENIX Association},
  copyright = {David Kotz},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-diskdir/index.html},
  note = {Updated as Dartmouth TR PCS-TR94-226 on November 8, 1994},
  abstract = {Many scientific applications that run on today's multiprocessors are bottlenecked by their file I/O needs. Even if the multiprocessor is configured with sufficient I/O hardware, the file-system software often fails to provide the available bandwidth to the application.
  Although libraries and improved file-system interfaces can make a significant improvement, we believe that fundamental changes are needed in the file-server software. We propose a new technique, \emph{disk-directed I/O}, that flips the usual relationship between server and client to allow the disks (actually, disk servers) to determine the flow of data for maximum performance. Our simulations show that tremendous performance gains are possible. Indeed, disk-directed I/O provided consistent high performance that was largely independent of data distribution, and close to the maximum disk bandwidth.},
}

@TechReport{kotz:diskmodel,
  author = {David Kotz and Song Bac Toh and Sriram Radhakrishnan},
  title = {{A Detailed Simulation Model of the HP 97560 Disk Drive}},
  institution = {Dartmouth Computer Science},
  year = 1994,
  month = {July},
  number = {PCS-TR94-220},
  copyright = {the authors},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-diskmodel/index.html},
  abstract = {We implemented a detailed model of the HP 97560 disk drive, to replicate a model devised by Ruemmler and Wilkes (both of Hewlett-Packard, HP). Our model simulates one or more disk drives attached to one or more SCSI buses, using a small discrete-event simulation module included in our implementation. The design is broken into three components: a test driver, the disk model itself, and the discrete-event simulation support. Thus, the disk model can be easily extracted and used in other simulation environments. We validated our model using traces obtained from HP, using the same ``demerit'' measure as Ruemmler and Wilkes. We obtained a demerit figure of 3.9\%, indicating that our model was extremely accurate. This paper describes our implementation, and is meant for those wishing to understand our model or to implement their own.},
}

@TechReport{kotz:explore-tr,
  author = {David Kotz and Ting Cai},
  title = {{Exploring the use of I/O Nodes for Computation in a MIMD Multiprocessor}},
  institution = {Dartmouth Computer Science},
  year = 1994,
  month = {October},
  number = {PCS-TR94-232},
  copyright = {the authors},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-explore-tr/index.html},
  note = {Revised 2/20/95},
  abstract = {Most MIMD multiprocessors today are configured with two distinct types of processor nodes: those that have disks attached, which are dedicated to file I/O, and those that do not have disks attached, which are used for running applications. Several architectural trends have led some to propose configuring systems so that all processors are used for application processing, even those with disks attached. We examine this idea experimentally, focusing on the impact of remote I/O requests on local computational processes. We found that in an efficient file system the I/O processors can transfer data at near peak speeds with little CPU overhead, leaving substantial CPU power for running applications. On the other hand, we found that some complex file-system features could require substantial CPU overhead.
  Thus, for a multiprocessor system to obtain good I/O and computational performance on a mix of applications, the file system (both operating system and libraries) must be prepared to adapt its policies to changing conditions.},
}

@Misc{kotz:diskmodel-sw,
  author = {David Kotz},
  title = {{HP 97560 disk simulation module}},
  howpublished = {Used in STARFISH and several other research projects},
  year = 1994,
  copyright = {the author},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-diskmodel-sw/index.html},
  abstract = {We implemented a detailed model of the HP 97560 disk drive, to replicate a model devised by Ruemmler and Wilkes (both of Hewlett-Packard).},
}

@Article{kotz:diskdir2,
  author = {David Kotz},
  title = {{Disk-directed I/O for MIMD Multiprocessors}},
  journal = {Bulletin of the IEEE Technical Committee on Operating Systems and Application Environments},
  year = 1994,
  month = {Autumn},
  pages = {29--42},
  publisher = {IEEE},
  copyright = {David Kotz},
  URL = {https://www.cs.dartmouth.edu/~kotz/research/kotz-diskdir2/index.html},
  abstract = {Many scientific applications that run on today's multiprocessors are bottlenecked by their file I/O needs. Even if the multiprocessor is configured with sufficient I/O hardware, the file-system software often fails to provide the available bandwidth to the application. Although libraries and improved file-system interfaces can make a significant improvement, we believe that fundamental changes are needed in the file-server software. We propose a new technique, \emph{disk-directed I/O}, that flips the usual relationship between server and client to allow the disks (actually, disk servers) to determine the flow of data for maximum performance. Our simulations show that tremendous performance gains are possible. Indeed, disk-directed I/O provided consistent high performance that was largely independent of data distribution, and close to the maximum disk bandwidth.},
}