@inproceedings{thakur:romio,
  author    = {Rajeev Thakur and William Gropp and Ewing Lusk},
  title     = {Data Sieving and Collective {I/O} in {ROMIO}},
  booktitle = {Proceedings of the Seventh Symposium on the Frontiers of Massively Parallel Computation},
  year      = {1999},
  month     = feb,
  pages     = {182--189},
  publisher = {IEEE Computer Society Press},
  earlier   = {thakur:romio-tr},
  url       = {http://www.mcs.anl.gov/~thakur/papers/romio-coll.ps},
  keywords  = {parallel I/O, collective I/O, application programmer interface, pario-bib},
  abstract  = {The I/O access patterns of parallel programs often consist of accesses to a large number of small, noncontiguous pieces of data. If an application's I/O needs are met by making many small, distinct I/O requests, however, the I/O performance degrades drastically. To avoid this problem, MPI-IO allows users to access a noncontiguous data set with a single I/O function call. This feature provides MPI-IO implementations an opportunity to optimize data access. \par We describe how our MPI-IO implementation, ROMIO, delivers high performance in the presence of noncontiguous requests. We explain in detail the two key optimizations ROMIO performs: data sieving for noncontiguous requests from one process and collective I/O for noncontiguous requests from multiple processes. We describe how one can implement these optimizations portably on multiple machines and file systems, control their memory requirements, and also achieve high performance. We demonstrate the performance and portability with performance results for three applications---an astrophysics-application template (DIST3D), the NAS BTIO benchmark, and an unstructured code (UNSTRUC)---on five different parallel machines: HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, and SGI Origin2000.},
  comment   = {They describe how ROMIO, their MPI-IO implementation, delivers high performance through the use of data sieving and collective I/O. The paper discusses several specific optimizations.
They have results from five major parallel platforms. The paper confirms that the UNIX interface is terrible for many parallel access patterns, and that collective I/O is an important solution.},
}