@inproceedings{thakur:runtime,
  author    = {Thakur, R. and Bordawekar, R. and Choudhary, A.},
  title     = {Compiler and Runtime Support for Out-of-Core {HPF} Programs},
  booktitle = {Proceedings of the 8th {ACM} International Conference on Supercomputing},
  year      = {1994},
  month     = jul,
  pages     = {382--391},
  publisher = {ACM Press},
  address   = {Manchester, UK},
  url       = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/ics94-out-of-core-hpf.ps.Z},
  keywords  = {parallel I/O, pario-bib},
  abstract  = {This paper describes the design of a compiler which can translate out-of-core programs written in a data parallel language like HPF. Such a compiler is required for compiling large scale scientific applications, such as the Grand Challenge applications, which deal with enormous quantities of data. We propose a framework by which a compiler together with appropriate runtime support can translate an out-of-core HPF program to a message passing node program with explicit parallel I/O. We describe the basic model of the compiler and the various transformations made by the compiler. We also discuss the runtime routines used by the compiler for I/O and communication. In order to minimize I/O, the runtime support system can reuse data already fetched into memory. The working of the compiler is illustrated using two out-of-core applications, namely a Laplace equation solver and LU Decomposition, together with performance results on the Intel Touchstone Delta.},
  comment   = {They describe ways to make HPF handle out-of-core arrays. Basically, they add directives to say which arrays are out of core, and how much memory to devote to the in-core portion of the array. Then the compiler distributes the array across processors, as in HPF, to form local arrays. Each local array is broken into slabs, where each slab can fit in local memory. The local array is kept in a local array file, from which slabs are loaded and stored. Ghost nodes are also handled. They were careful to avoid double I/O when one slab is another slab's ghost node. They found it most convenient to do all the communication between iterations, then do all the computation for that iteration, where the iteration itself required a loop including both computation and I/O. This means that there may need to be I/O during the communication phase, to store ghost nodes coming in from other places. They do not mention use of asynchronous I/O for overlap. See also bordawekar:efficient.},
}