@techreport{li:recursive-tr,
  author      = {Li, Zhiyong and Reif, John H. and Gupta, Sandeep K. S.},
  title       = {Synthesizing Efficient Out-of-Core Programs for Block Recursive Algorithms using Block-Cyclic Data Distributions},
  institution = {Dept. of Computer Science, Duke University},
  number      = {96-04},
  year        = {1996},
  month       = mar,
  later       = {li:recursive},
  url         = {ftp://ftp.cs.duke.edu/pub/zli/papers/TR-96-04.ps.gz},
  keywords    = {parallel I/O, out-of-core algorithm, pario-bib},
  abstract    = {In this paper, we present a framework for synthesizing I/O efficient out-of-core programs for block recursive algorithms, such as the fast Fourier transform (FFT) and block matrix transposition algorithms. Our framework uses an algebraic representation which is based on tensor products and other matrix operations. The programs are optimized for the striped Vitter and Shriver's two-level memory model in which data can be distributed using various cyclic($B$) distributions in contrast to the normally used \emph{physical track} distribution cyclic($B_d$), where $B_d$ is the physical disk block size. \par We first introduce tensor bases to capture the semantics of block-cyclic data distributions of out-of-core data and also data access patterns to out-of-core data. We then present program generation techniques for tensor products and matrix transposition. We accurately represent the number of parallel I/O operations required for the synthesized programs for tensor products and matrix transposition as a function of tensor bases and data distributions. We introduce an algorithm to determine the data distribution which optimizes the performance of the synthesized programs. Further, we formalize the procedure of synthesizing efficient out-of-core programs for tensor product formulas with various block-cyclic distributions as a dynamic programming problem. \par We demonstrate the effectiveness of our approach through several examples. We show that the choice of an appropriate data distribution can reduce the number of passes to access out-of-core data by as large as eight times for a tensor product, and the dynamic programming approach can largely reduce the number of passes to access out-of-core data for the overall tensor product formulas.},
}