@techreport{thakur:astrophysics,
  author      = {Thakur, Rajeev and Lusk, Ewing and Gropp, William},
  title       = {{I/O} Characterization of a Portable Astrophysics Application on the {IBM SP} and {Intel Paragon}},
  institution = {Argonne National Laboratory},
  year        = {1995},
  month       = aug,
  number      = {MCS-P534-0895},
  note        = {Revised October 1995},
  url         = {http://www.mcs.anl.gov/~thakur/papers/astro.ps},
  keywords    = {file access pattern, workload characterization, parallel I/O, pario-bib},
  abstract    = {Many large-scale applications on parallel machines are bottlenecked by the I/O performance rather than the CPU or communication performance of the system. To improve the I/O performance, it is first necessary for system designers to understand the I/O requirements of various applications. This paper presents the results of a study of the I/O characteristics and performance of a real, I/O-intensive, portable, parallel application in astrophysics, on two different parallel machines---the IBM SP and the Intel Paragon. We instrumented the source code to record all I/O activity, and analyzed the resulting trace files. Our results show that, for this application, the I/O consists of fairly large writes, and writing data to files is faster on the Paragon, whereas opening and closing files are faster on the SP. We also discuss how the I/O performance of this application could be improved; particularly, we believe that this application would benefit from using collective I/O.},
  comment     = {Adds another data point to the collection of parallel scientific applications whose I/O has been characterized, a collection started in earnest by crandall:iochar. It's a pretty straightforward application; it just writes its matrices every few timesteps. The application writes whole matrices; the OS sees request sizes that are more a factor of the Chameleon library than of the application. Most of the I/O itself is not implemented in parallel, because they used UniTree on the SP, and because the Chameleon library sequentializes this kind of I/O through one node. Other numbers from the paper don't add much insight into the workload. Revised slightly in October 1995; the abstract represents that revision.},
}