@inproceedings{thakur:runtime,
  author    = {Thakur, R. and Bordawekar, R. and Choudhary, A.},
  title     = {Compiler and Runtime Support for Out-of-Core {HPF} Programs},
  booktitle = {Proceedings of the 8th {ACM} International Conference on Supercomputing},
  year      = {1994},
  month     = jul,
  pages     = {382--391},
  publisher = {ACM Press},
  address   = {Manchester, UK},
  url       = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/ics94-out-of-core-hpf.ps.Z},
  keywords  = {parallel I/O, pario-bib},
  abstract  = {This paper describes the design of a compiler which can translate out-of-core programs written in a data parallel language like HPF. Such a compiler is required for compiling large scale scientific applications, such as the Grand Challenge applications, which deal with enormous quantities of data. We propose a framework by which a compiler together with appropriate runtime support can translate an out-of-core HPF program to a message passing node program with explicit parallel I/O. We describe the basic model of the compiler and the various transformations made by the compiler. We also discuss the runtime routines used by the compiler for I/O and communication. In order to minimize I/O, the runtime support system can reuse data already fetched into memory. The working of the compiler is illustrated using two out-of-core applications, namely a Laplace equation solver and LU Decomposition, together with performance results on the Intel Touchstone Delta.},
  comment   = {They describe ways to make HPF handle out-of-core arrays. Basically, they add directives to say which arrays are out of core, and how much memory to devote to the in-core portion of the array. Then the compiler distributes the array across processors, as in HPF, to form local arrays. Each local array is broken into slabs, where each slab can fit in local memory. The local array is kept in a local array file, from which slabs are loaded and stored. Ghost nodes are also handled. They were careful to avoid double I/O when one slab is another slab's ghost node. They found it most convenient to do all the communication between iterations, then do all the computation for that iteration, where the iteration itself required a loop including both computation and I/O. This means that there may need to be I/O during the communication phase, to store ghost nodes coming in from other places. They do not mention use of asynchronous I/O for overlap. See also bordawekar:efficient.},
}