% Dickens and Thakur (1999): conference paper on thread-based collective I/O, with abstract and annotator's comment.
@InProceedings{dickens:threads,
  author = {Dickens, Phillip and Thakur, Rajeev},
  title = {Improving Collective {I/O} Performance Using Threads},
  booktitle = {Proceedings of the Joint International Parallel Processing
  Symposium and {IEEE} Symposium on Parallel and Distributed Processing},
  year = {1999},
  month = apr,
  pages = {38--45},
  URL = {http://www.mcs.anl.gov/~thakur/papers/ipps99-thread-coll.ps},
  keywords = {parallel I/O, multithread programming, collective I/O,
  disk-directed I/O, two-phase I/O, pario-bib},
  abstract = {Massively parallel computers are increasingly being used to solve
  large, I/O intensive applications in many different fields. For such
  applications, the I/O requirements quite often present a significant obstacle
  in the way of achieving good performance, and an important area of current
  research is the development of techniques by which these costs can be
  reduced. One such approach is \emph{collective I/O}, where the processors
  cooperatively develop an I/O strategy that reduces the number, and increases
  the size, of I/O requests, making a much better use of the I/O subsystem.
  Collective I/O has been shown to significantly reduce the cost of performing
  I/O in many large, parallel applications, and for this reason serves as an
  important base upon which we can explore other mechanisms which can further
  reduce these costs. One promising approach is to use threads to perform the
  collective I/O \emph{in the background} while the main thread continues with
  other computation in the foreground. \par In this paper, we explore the
  issues associated with implementing collective I/O in the background using
  threads. The most natural approach is to simply spawn off an I/O thread to
  perform the collective I/O in the background while the main thread continues
  with other computation. However, our research demonstrates that this approach
  is frequently the \emph{worst} implementation option, often performing much
  more poorly than just executing collective I/O completely in the foreground.
  To improve the performance of thread-based collective I/O, we developed an
  alternate approach where \emph{part} of the collective I/O operation is
  performed in the background, and part is performed in the foreground. We
  demonstrate that this new technique can significantly improve the performance
  of thread-based collective I/O, providing up to an 80\% improvement over
  sequential collective I/O (where there is no attempt to overlap computation
  with I/O). Also, we discuss one very important application of this research
  which is the implementation of the \emph{split-collective} parallel I/O
  operations defined in MPI 2.0.},
  comment = {They examine an implementation of collective I/O in MPI2 such that
  the collective I/O is done in the background, using a thread, while the
  computation continues. They found that the performance can be quite
  disappointing, because of the competition for the CPU between the
  computational thread and the background thread executing the redistribution
  phase of the I/O operation. They get better results by doing the
  redistribution in the foreground, making the computation wait, and then doing
  the I/O in the background thread while the computation continues. Results
  from four major parallel platforms, but only for write operations.}
}