% Journal article (JPDC 61(8), 2001) evaluating collective I/O implementation
% techniques; the doi field duplicates the DOI embedded in the legacy URL.
@Article{dickens:evaluation,
  author = {Phillip M. Dickens and Rajeev Thakur},
  title = {Evaluation of Collective {I/O} Implementations on Parallel
  Architectures},
  journal = {Journal of Parallel and Distributed Computing},
  year = {2001},
  month = aug,
  volume = {61},
  number = {8},
  pages = {1052--1076},
  publisher = {Academic Press},
  copyright = {Academic Press},
  doi = {10.1006/jpdc.2000.1733},
  URL = {http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1733},
  keywords = {parallel I/O, collective I/O, pario-bib, parallel architecture},
  abstract = {In this paper, we evaluate the impact on performance of various
  implementation techniques for collective I/O operations, and we do so across
  four important parallel architectures. We show that a naive implementation of
  collective I/O does not result in significant performance gains for any of
  the architectures, but that an optimized implementation does provide
  excellent performance across all of the platforms under study. Furthermore,
  we demonstrate that there exists a single implementation strategy that
  provides the best performance for all four computational platforms. Next, we
  evaluate implementation techniques for thread-based collective I/O
  operations. We show that the most obvious implementation technique, which is
  to spawn a thread to execute the whole collective I/O operation in the
  background, frequently provides the worst performance, often performing much
  worse than just executing the collective I/O routine entirely in the
  foreground. To improve performance, we explore an alternate approach where
  part of the collective I/O operation is performed in the background, and part
  is performed in the foreground. We demonstrate that this implementation
  technique can provide significant performance gains, offering up to a 50\%
  improvement over implementations that do not attempt to overlap collective
  I/O and computation.}
}