% Conference paper on out-of-core LU/QR factorization; also issued as a
% Harvard technical report (see note). The month is given as the standard
% BibTeX macro so each bibliography style can format it itself.
@InProceedings{brunet:factor,
  author = {Jean-Philippe Brunet and Palle Pedersen and S.~Lennart Johnsson},
  title = {Load-Balanced {LU} and {QR} Factor and Solve Routines for Scalable
  Processors with Scalable {I/O}},
  booktitle = {Proceedings of the 17th {IMACS} World Congress},
  year = {1994},
  month = jul,
  address = {Atlanta, GA},
  note = {Also available as Harvard University Computer Science Technical
  Report TR-20-94.},
  URL = {ftp://das-ftp.harvard.edu/techreports/tr-20-94.ps.gz},
  keywords = {parallel I/O, linear algebra, out-of-core, pario-bib},
  abstract = {The concept of block-cyclic order elimination can be applied to
  out-of-core $LU$ and $QR$ matrix factorizations on distributed memory
  architectures equipped with a parallel I/O system. This elimination scheme
  provides load balanced computation in both the factor and solve phases and
  further optimizes the use of the network bandwidth to perform I/O operations.
  Stability of LU factorization is enforced by full column pivoting.
  Performance results are presented for the Connection Machine system CM-5.},
  comment = {Short, not many details. Performance results show about 3.5
  Gflops for all problem sizes, both in-core on small N and out-of-core on
  large N.}
}