@PhdThesis{nitzberg:thesis,
  author   = {William J. Nitzberg},
  title    = {Collective Parallel {I/O}},
  year     = {1995},
  month    = {December},
  school   = {Department of Computer and Information Science, University of Oregon},
  keywords = {parallel I/O, parallel algorithm, file system interface, pario-bib},
  abstract = {Parallel I/O, the process of transferring a global data structure distributed among compute nodes to a file striped across storage devices, can be quite complicated and involve a significant amount of data movement. Optimizing parallel I/O with respect to data distribution, file layout, and machine architecture is critical for performance. In this work, we propose a solution to both the performance and portability problems plaguing the wide acceptance of distributed memory parallel computers for scientific computing: a collective parallel I/O interface and efficient algorithms to implement it. A collective interface allows the programmer to specify a file access as a high-level global operation rather than as a series of seeks and writes. This not only provides a more natural interface for the programmer, but also provides the system with both the opportunity and the semantic information necessary to optimize the file operation. \par We attack this problem in three steps: we evaluate an early parallel I/O system, the Intel iPSC/860 Concurrent File System; we design and analyze the performance of two classes of algorithms taking advantage of collective parallel I/O; and we design MPI-IO, a collective parallel I/O interface likely to become the standard for portable parallel I/O. \par The collective I/O algorithms fall into two broad categories: data block scheduling and collective buffering. Data block scheduling algorithms attempt to schedule the individual data transfers to minimize resource contention and to optimize for particular hardware characteristics. We develop and evaluate three data block scheduling algorithms: Grouping, Random, and Sliding Window. The data block scheduling algorithms improved performance by as much as a factor of eight. The collective buffering algorithms permute the data before writing or after reading in order to combine small file accesses into large blocks. We design and test a series of four collective buffering algorithms and demonstrate improvement in performance by two orders of magnitude over naive file I/O for the hardest, three-dimensional distributions.},
  comment  = {See also nitzberg:cfs and corbett:mpi-overview.}
}