@TechReport{bordawekar:efficient,
  author = {Rajesh Bordawekar and Rajeev Thakur and Alok Choudhary},
  title = {Efficient Compilation of Out-of-core Data Parallel Programs},
  year = {1994},
  month = {April},
  number = {SCCS-622},
  institution = {NPAC},
  later = {bordawekar:reorganize},
  URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/access_reorg.ps.Z},
  keywords = {parallel I/O, compiler, pario-bib},
  abstract = {Large-scale scientific applications, such as the Grand Challenge applications, deal with very large quantities of data. The amount of main memory in distributed-memory machines is usually not large enough to solve problems of realistic size. This limitation results in the need for system and application software support to provide efficient parallel I/O for out-of-core programs. This paper describes techniques for translating out-of-core programs written in a data-parallel language like HPF into message-passing node programs with explicit parallel I/O. We describe the basic compilation model and the various steps involved in the compilation. The compilation process is explained with the help of an out-of-core matrix multiplication program. We first discuss how an out-of-core program can be translated by extending the method used for translating in-core programs. We then describe how the compiler can optimize the code by estimating the I/O costs associated with different array access patterns and selecting the method with the least I/O cost. This optimization can reduce the amount of I/O by as much as an order of magnitude. Performance results on the Intel Touchstone Delta are presented and analyzed.},
  comment = {Revised as bordawekar:reorganize. This is actually fairly different from thakur:runtime. They describe the same basic compiler technique, in which arrays are distributed across processors and each processor has a local array file holding the data from its local partitions. The I/O needed for a loop is then broken into slabs, and the program proceeds as an alternation of (read slabs, compute, write slabs). The big new thing here is that the compiler tries different ways to form slabs (e.g., by row or by column), estimates the number of I/O requests and the amount of data moved for each case, and chooses the case with the least I/O cost. They also mention how the amount of memory allocated to each array affects the amount of I/O, but give no algorithm other than "try all the possibilities."}
}
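
The slab-orientation choice described in the comment amounts to a small I/O cost model. Below is a minimal Python sketch of that idea, not the paper's actual algorithm: the function names, the row-major local-file layout, and the (request count, bytes moved) cost measure are all assumptions made for illustration.

    # Sketch of the slab-orientation cost comparison described above.
    # Assumed (not from the paper): each processor's local array is an
    # R x C block stored row-major in its local array file, and a slab
    # schedule costs (number of read requests, bytes transferred).
    import math

    def cost_row_slabs(rows, cols, mem_elems, elem_size):
        """Slabs of full rows: contiguous on disk, one request per slab."""
        rows_per_slab = max(1, mem_elems // cols)  # rows that fit in core
        num_slabs = math.ceil(rows / rows_per_slab)
        requests = num_slabs                       # one contiguous read each
        bytes_moved = rows * cols * elem_size
        return requests, bytes_moved

    def cost_col_slabs(rows, cols, mem_elems, elem_size):
        """Slabs of full columns: strided on disk, one request per row per slab."""
        cols_per_slab = max(1, mem_elems // rows)
        num_slabs = math.ceil(cols / cols_per_slab)
        requests = num_slabs * rows                # every row is a separate read
        bytes_moved = rows * cols * elem_size
        return requests, bytes_moved

    def choose_slabbing(rows, cols, mem_elems, elem_size=8):
        """Pick the orientation with the smaller (requests, bytes) cost."""
        candidates = {
            "row":    cost_row_slabs(rows, cols, mem_elems, elem_size),
            "column": cost_col_slabs(rows, cols, mem_elems, elem_size),
        }
        return min(candidates.items(), key=lambda kv: kv[1])

    # Example: a 4096 x 4096 local block with room for 2**20 elements in core.
    # Row slabs need 16 reads; column slabs move the same bytes in 65536 reads,
    # the kind of order-of-magnitude gap the annotation mentions.
    print(choose_slabbing(4096, 4096, 2**20))

Per the annotation, the remaining degree of freedom, how much memory to allocate to each array, is searched exhaustively rather than by any such closed-form rule.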