@TechReport{nitzberg:cfs,
  author =      {Bill Nitzberg},
  title =       {Performance of the {iPSC/860 Concurrent File System}},
  year =        {1992},
  month =       {December},
  number =      {RND-92-020},
  institution = {NAS Systems Division, NASA Ames},
  later =       {krystynak:pario},
  URL =         {http://www.nas.nasa.gov/NAS/TechReports/RNDreports/RND-92-020/RND-92-020.html},
  keywords =    {Intel, parallel file system, performance measurement, parallel I/O, pario-bib},
  abstract =    {Typical scientific applications require vast amounts of processing power coupled with significant I/O capacity. Highly parallel computer systems can provide processing power at low cost, but tend to lack I/O capacity. By evaluating the performance and scalability of the Intel iPSC/860 Concurrent File System (CFS), we can get an idea of the current state of parallel I/O performance. I ran three types of tests on the iPSC/860 system at the Numerical Aerodynamic Simulation facility (NAS): broadcast, simulating initial data loading; partitioned, simulating reading and writing a one-dimensional decomposition; and interleaved, simulating reading and writing a two-dimensional decomposition. \par The CFS at NAS can sustain up to 7 megabytes per second writing and 8 megabytes per second reading. However, due to the limited disk cache size, partitioned read performance drops sharply to less than 1 megabyte per second on 128 nodes. In addition, interleaved read and write performance show a similar drop for small block sizes. Although the CFS can sustain 70--80\% of peak I/O throughput, the I/O performance does not scale with the number of nodes. \par Obtaining maximum performance may require significant programming effort: pre-allocating files, overlapping computation and I/O, using large block sizes, and limiting I/O parallelism. A better approach would be to attack the problem by either fixing the CFS (e.g., add more cache to the I/O nodes), or hiding its idiosyncrasies (e.g., implement a parallel I/O library).},
  comment =     {Straightforward measurements of an iPSC/860 with 128 compute nodes, 10 I/O nodes, and 10 disks. This is a bigger system than had been measured before. Gives basic MB/s measurements for several features in Tables 1--2. A CFS bug prevents more than 2 asynchronous requests at a time; another bug forced random writes to use preallocated files. With a low number of procs, they were not able to pull the full disk bandwidth. Cache thrashing caused problems with a large number of procs, because each read prefetched 8 blocks, which were then flushed by another proc's subsequent read. They worked around this by synchronizing procs to limit concurrency. Increasing the cache size is the right answer, but it is not scalable.}
}
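
The workaround noted in the comment (synchronizing procs to limit I/O concurrency so that prefetched cache blocks are not evicted by other readers) can be illustrated with a minimal sketch. This is not the report's code: it uses MPI and POSIX file I/O as stand-ins for the iPSC/860's NX message passing and CFS calls, and the file path, block size, and concurrency limit below are hypothetical.

/* Sketch: stagger readers so that at most MAX_CONCURRENT processes
 * read the shared file at once, avoiding disk-cache thrashing.
 * MPI and POSIX stdio are assumptions standing in for NX and CFS. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_CONCURRENT 10        /* assumed limit, e.g. one reader per I/O node */
#define BLOCK_SIZE (1 << 20)     /* 1 MB blocks; large blocks help throughput */

int main(int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    char *buf = malloc(BLOCK_SIZE);
    int rounds = (nprocs + MAX_CONCURRENT - 1) / MAX_CONCURRENT;

    for (int r = 0; r < rounds; r++) {
        if (rank / MAX_CONCURRENT == r) {
            /* This process's turn: read its partition of the shared file. */
            FILE *f = fopen("/cfs/data.bin", "rb");   /* hypothetical path */
            if (f) {
                fseek(f, (long)rank * BLOCK_SIZE, SEEK_SET);
                fread(buf, 1, BLOCK_SIZE, f);
                fclose(f);
            }
        }
        /* Everyone waits, so only one group of readers is active at a time. */
        MPI_Barrier(MPI_COMM_WORLD);
    }

    free(buf);
    MPI_Finalize();
    return 0;
}

The barrier-per-round structure trades some parallelism for cache locality, which matches the report's observation that limiting I/O parallelism was one of the steps needed to obtain maximum performance.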