@TechReport{bordawekar:efficient,
  author = {Rajesh Bordawekar and Rajeev Thakur and Alok Choudhary},
  title = {Efficient Compilation of Out-of-core Data Parallel Programs},
  year = {1994},
  month = {April},
  number = {SCCS-622},
  institution = {NPAC},
  later = {bordawekar:reorganize},
  URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/access_reorg.ps.Z},
  keywords = {parallel I/O, compiler, pario-bib},
  abstract = {Large-scale scientific applications, such as the Grand Challenge applications, deal with very large quantities of data. The amount of main memory in distributed-memory machines is usually not large enough to solve problems of realistic size. This limitation results in the need for system and application software support to provide efficient parallel I/O for out-of-core programs. This paper describes techniques for translating out-of-core programs written in a data-parallel language like HPF into message-passing node programs with explicit parallel I/O. We describe the basic compilation model and the various steps involved in the compilation. The compilation process is explained with the help of an out-of-core matrix multiplication program. We first discuss how an out-of-core program can be translated by extending the method used for translating in-core programs. We then describe how the compiler can optimize the code by estimating the I/O costs associated with different array access patterns and selecting the method with the least I/O cost. This optimization can reduce the amount of I/O by as much as an order of magnitude. Performance results on the Intel Touchstone Delta are presented and analyzed.},
  comment = {Revised as bordawekar:reorganize. This is actually fairly different from thakur:runtime. They describe the same basic compiler technique, in which arrays are distributed across processors and each processor has a local array file holding the data from its local partitions. The I/O needed for a loop is then broken into slabs, and the program proceeds as an alternation of (read slabs, compute, write slabs). The big new thing here is that the compiler tries different ways to form slabs (e.g., by row or by column), estimates the number of I/O requests and the amount of data moved for each case, and chooses the case with the least I/O cost. They also mention how the amount of memory allocated to each array affects the amount of I/O, but give no algorithm other than "try all the possibilities."}
}
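
The slab-orientation choice described in the comment amounts to a small I/O cost model. Below is a minimal Python sketch of that idea, not the paper's actual algorithm: the function names, the row-major local-file layout, and the (request count, bytes moved) cost measure are all assumptions made for illustration.

    # Sketch of the slab-orientation cost comparison described above.
    # Assumed (not from the paper): each processor's local array is an
    # R x C block stored row-major in its local array file, and a slab
    # schedule costs (number of read requests, bytes transferred).
    import math

    def cost_row_slabs(rows, cols, mem_elems, elem_size):
        """Slabs of full rows: contiguous on disk, one request per slab."""
        rows_per_slab = max(1, mem_elems // cols)  # rows that fit in core
        num_slabs = math.ceil(rows / rows_per_slab)
        requests = num_slabs                       # one contiguous read each
        bytes_moved = rows * cols * elem_size
        return requests, bytes_moved

    def cost_col_slabs(rows, cols, mem_elems, elem_size):
        """Slabs of full columns: strided on disk, one request per row per slab."""
        cols_per_slab = max(1, mem_elems // rows)
        num_slabs = math.ceil(cols / cols_per_slab)
        requests = num_slabs * rows                # every row is a separate read
        bytes_moved = rows * cols * elem_size
        return requests, bytes_moved

    def choose_slabbing(rows, cols, mem_elems, elem_size=8):
        """Pick the orientation with the smaller (requests, bytes) cost."""
        candidates = {
            "row":    cost_row_slabs(rows, cols, mem_elems, elem_size),
            "column": cost_col_slabs(rows, cols, mem_elems, elem_size),
        }
        return min(candidates.items(), key=lambda kv: kv[1])

    # Example: a 4096 x 4096 local block with room for 2**20 elements in core.
    # Row slabs need 16 reads; column slabs move the same bytes in 65536 reads,
    # the kind of order-of-magnitude gap the annotation mentions.
    print(choose_slabbing(4096, 4096, 2**20))

Per the annotation, the remaining degree of freedom, how much memory to allocate to each array, is searched exhaustively rather than by any such closed-form rule.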