@TechReport{nitzberg:cfs,
  author =      {Bill Nitzberg},
  title =       {Performance of the {iPSC/860 Concurrent File System}},
  year =        {1992},
  month =       {December},
  number =      {RND-92-020},
  institution = {NAS Systems Division, NASA Ames},
  later =       {krystynak:pario},
  URL =         {http://www.nas.nasa.gov/NAS/TechReports/RNDreports/RND-92-020/RND-92-020.html},
  keywords =    {Intel, parallel file system, performance measurement, parallel I/O, pario-bib},
  abstract =    {Typical scientific applications require vast amounts of processing power coupled with significant I/O capacity. Highly parallel computer systems can provide processing power at low cost, but tend to lack I/O capacity. By evaluating the performance and scalability of the Intel iPSC/860 Concurrent File System (CFS), we can get an idea of the current state of parallel I/O performance. I ran three types of tests on the iPSC/860 system at the Numerical Aerodynamic Simulation facility (NAS): broadcast, simulating initial data loading; partitioned, simulating reading and writing a one-dimensional decomposition; and interleaved, simulating reading and writing a two-dimensional decomposition. \par The CFS at NAS can sustain up to 7 megabytes per second writing and 8 megabytes per second reading. However, due to the limited disk cache size, partitioned read performance drops sharply to less than 1 megabyte per second on 128 nodes. In addition, interleaved read and write performance show a similar drop for small block sizes. Although the CFS can sustain 70--80\% of peak I/O throughput, the I/O performance does not scale with the number of nodes. \par Obtaining maximum performance may require significant programming effort: pre-allocating files, overlapping computation and I/O, using large block sizes, and limiting I/O parallelism. A better approach would be to attack the problem by either fixing the CFS (e.g., add more cache to the I/O nodes), or hiding its idiosyncrasies (e.g., implement a parallel I/O library).},
  comment =     {Straightforward measurements of an iPSC/860 with 128 compute nodes, 10 I/O nodes, and 10 disks. This is a bigger system than had been measured before. Gives basic MB/s measurements for several features in Tables 1--2. A CFS bug prevents more than 2 asynchronous requests at a time; another bug forced random writes to use preallocated files. With a low number of procs, they were not able to pull the full disk bandwidth. Cache thrashing caused problems with a large number of procs, because each read prefetched 8 blocks, which were then flushed by another proc's subsequent read. They worked around this by synchronizing procs to limit concurrency. Increasing the cache size is the right answer, but it is not scalable.}
}
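
The workaround noted in the comment (synchronizing procs to limit I/O concurrency so that prefetched cache blocks are not evicted by other readers) can be illustrated with a minimal sketch. This is not the report's code: it uses MPI and POSIX file I/O as stand-ins for the iPSC/860's NX message passing and CFS calls, and the file path, block size, and concurrency limit below are hypothetical.

/* Sketch: stagger readers so that at most MAX_CONCURRENT processes
 * read the shared file at once, avoiding disk-cache thrashing.
 * MPI and POSIX stdio are assumptions standing in for NX and CFS. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_CONCURRENT 10        /* assumed limit, e.g. one reader per I/O node */
#define BLOCK_SIZE (1 << 20)     /* 1 MB blocks; large blocks help throughput */

int main(int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    char *buf = malloc(BLOCK_SIZE);
    int rounds = (nprocs + MAX_CONCURRENT - 1) / MAX_CONCURRENT;

    for (int r = 0; r < rounds; r++) {
        if (rank / MAX_CONCURRENT == r) {
            /* This process's turn: read its partition of the shared file. */
            FILE *f = fopen("/cfs/data.bin", "rb");   /* hypothetical path */
            if (f) {
                fseek(f, (long)rank * BLOCK_SIZE, SEEK_SET);
                fread(buf, 1, BLOCK_SIZE, f);
                fclose(f);
            }
        }
        /* Everyone waits, so only one group of readers is active at a time. */
        MPI_Barrier(MPI_COMM_WORLD);
    }

    free(buf);
    MPI_Finalize();
    return 0;
}

The barrier-per-round structure trades some parallelism for cache locality, which matches the report's observation that limiting I/O parallelism was one of the steps needed to obtain maximum performance.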