@Article{thakur:noncontiguous,
  author   = {Rajeev Thakur and William Gropp and Ewing Lusk},
  title    = {Optimizing Noncontiguous Accesses in {MPI-IO}},
  journal  = {Parallel Computing},
  year     = {2002},
  month    = {January},
  volume   = {28},
  number   = {1},
  pages    = {83--105},
  URL      = {http://www.mcs.anl.gov/~thakur/papers/mpi-io-noncontig.ps},
  keywords = {parallel I/O, MPI-IO, collective I/O, data sieving, pario-bib},
  abstract = {The I/O access patterns of many parallel applications consist of
    accesses to a large number of small, noncontiguous pieces of data. If an
    application's I/O needs are met by making many small, distinct I/O
    requests, however, the I/O performance degrades drastically. To avoid
    this problem, MPI-IO allows users to access noncontiguous data with a
    single I/O function call, unlike in Unix I/O. In this paper, we explain
    how critical this feature of MPI-IO is for high performance and how it
    enables implementations to perform optimizations. We first provide a
    classification of the different ways of expressing an application's I/O
    needs in MPI-IO---we classify them into four {\em levels}, called
    level~0 through level~3. We demonstrate that, for applications with
    noncontiguous access patterns, the I/O performance improves dramatically
    if users write their applications to make level-3 requests
    (noncontiguous, collective) rather than level-0 requests (Unix style).
    We then describe how our MPI-IO implementation, ROMIO, delivers high
    performance for noncontiguous requests. We explain in detail the two key
    optimizations ROMIO performs: data sieving for noncontiguous requests
    from one process and collective I/O for noncontiguous requests from
    multiple processes. We describe how we have implemented these
    optimizations portably on multiple machines and file systems, controlled
    their memory requirements, and also achieved high performance. We
    demonstrate the performance and portability with performance results for
    three applications---an astrophysics-application template (DIST3D), the
    NAS BTIO benchmark, and an unstructured code (UNSTRUC)---on five
    different parallel machines: HP Exemplar, IBM SP, Intel Paragon,
    NEC SX-4, and SGI Origin2000.}
}
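
The level-0 versus level-3 distinction in the abstract is easiest to see in
code. The sketch below is not from the paper; it is a minimal C/MPI example
of a level-3 request, where each process describes its noncontiguous file
region with a derived datatype and all processes write with one collective
call, which is what exposes the access pattern to optimizations such as
ROMIO's collective I/O. The file name, array sizes, and row-block
decomposition are illustrative assumptions.

/* Level-3 MPI-IO write: noncontiguous file view + collective call.
 * Assumes the global array dimensions and row-block decomposition
 * below; these are illustrative, not taken from the paper. */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* Illustrative 1024x1024 global array, block-distributed by rows
     * (assumes nprocs evenly divides 1024). */
    int gsizes[2] = {1024, 1024};
    int lsizes[2] = {1024 / nprocs, 1024};
    int starts[2] = {rank * lsizes[0], 0};

    /* Derived datatype describing this process's (noncontiguous)
     * region of the file as a subarray of the global array. */
    MPI_Datatype filetype;
    MPI_Type_create_subarray(2, gsizes, lsizes, starts,
                             MPI_ORDER_C, MPI_INT, &filetype);
    MPI_Type_commit(&filetype);

    int *local = malloc((size_t)lsizes[0] * lsizes[1] * sizeof(int));

    MPI_File fh;
    MPI_File_open(MPI_COMM_WORLD, "out.dat",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);

    /* The file view makes the noncontiguous layout visible to the
     * MPI-IO implementation instead of hiding it behind many small
     * Unix-style requests. */
    MPI_File_set_view(fh, 0, MPI_INT, filetype, "native", MPI_INFO_NULL);

    /* One collective write per process: a level-3 request. */
    MPI_File_write_all(fh, local, lsizes[0] * lsizes[1], MPI_INT,
                       MPI_STATUS_IGNORE);

    MPI_File_close(&fh);
    MPI_Type_free(&filetype);
    free(local);
    MPI_Finalize();
    return 0;
}

A level-0 version of the same operation would instead loop over the local
rows, calling an independent seek-and-write per row; the paper's measurements
show why the single collective call above performs far better.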