% BibTeX bibliography file @InProceedings{acharya:tuning, author = {Anurag Acharya and Mustafa Uysal and Robert Bennett and Assaf Mendelson and Michael Beynon and Jeffrey K. Hollingsworth and Joel Saltz and Alan Sussman}, title = {Tuning the Performance of {I/O} Intensive Parallel Applications}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {15--27}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, filesystem workload, parallel application, pario-bib}, abstract = {Getting good I/O performance from parallel programs is a critical problem for many application domains. In this paper, we report our experience tuning the I/O performance of four application programs from the areas of satellite-data processing and linear algebra. After tuning, three of the four applications achieve application-level I/O rates of over 100 MB/s on 16 processors. The total volume of I/O required by the programs ranged from about 75 MB to over 200 GB. We report the lessons learned in achieving high I/O performance from these applications, including the need for code restructuring, local disks on every node and knowledge of future I/O requests. We also report our experience on achieving high performance on peer-to-peer configurations. Finally, we comment on the necessity of complex I/O interfaces like collective I/O and strided requests to achieve high performance.} } @InProceedings{ap:enwrich, author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz}, title = {{ENWRICH:} A Compute-Processor Write Caching Scheme for Parallel File Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {55--68}, publisher = {ACM Press}, address = {Philadelphia}, earlier = {ap:enwrich-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/ap:enwrich.ps.Z}, keyword = {parallel file system, parallel I/O, caching, pario-bib, dfk}, abstract = {Many parallel scientific applications need high-performance I/O. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is {\em collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed {\em disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk-layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. 
ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.} } @InProceedings{armen:disk-model, author = {Chris Armen}, title = {Bounds on the Separation of Two Parallel Disk Models}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {122--127}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, theory, parallel I/O algorithm, pario-bib}, abstract = {The single-disk, D-head model of parallel I/O was introduced by Aggarwal and Vitter to analyze algorithms for problem instances that are too large to fit in primary memory. Subsequently Vitter and Shriver proposed a more realistic model in which the disk space is partitioned into D disks, with a single head per disk. To date, each problem for which there is a known optimal algorithm for both models has the same asymptotic bounds on both models. Therefore, it has been unknown whether the models are equivalent or whether the single-disk model is strictly more powerful. \par In this paper we provide evidence that the single-disk model is strictly more powerful. We prove a lower bound on any general simulation of the single-disk model on the multi-disk model and establish randomized and deterministic upper bounds. Let $N$ be the problem size and let $T$ be the number of parallel I/Os required by a program on the single-disk model. Then any simulation of this program on the multi-disk model will require $\Omega\left(T \frac{\log(N/D)}{\log \log(N/D)}\right)$ parallel I/Os. This lower bound holds even if replication is allowed in the multi-disk model. We also show an $O\left(\frac{\log D}{\log \log D}\right)$ randomized upper bound and an $O\left(\log D (\log \log D)^2\right)$ deterministic upper bound. These results exploit an interesting analogy between the disk models and the PRAM and DCM models of parallel computation.} } @InProceedings{arpaci-dusseau:river, author = {Remzi H. Arpaci-Dusseau and Eric Anderson and Noah Treuhaft and David E. Culler and Joseph M. Hellerstein and David Patterson and Kathy Yelick}, title = {Cluster {I/O} with {River}: Making the Fast Case Common}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {10--22}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Remzi.ps}, keyword = {cluster computing, parallel I/O, pario-bib}, abstract = {We introduce River, a data-flow programming environment and I/O substrate for clusters of computers. River is designed to provide maximum performance in the common case---even in the face of non-uniformities in hardware, software, and workload.
River is based on two simple design features: a high-performance distributed queue, and a storage redundancy mechanism called graduated declustering. We have implemented a number of data-intensive applications on River, which validate our design with near-ideal performance in a variety of non-uniform performance scenarios.} } @InProceedings{barve:competitive2, author = {Rakesh Barve and Mahesh Kallahalla and Peter J. Varman and Jeffrey Scott Vitter}, title = {Competitive Parallel Disk Prefetching and Buffer Management}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {47--56}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {disk prefetching, file caching, parallel I/O, pario-bib}, abstract = {We provide a competitive analysis framework for online prefetching and buffer management algorithms in parallel I/O systems, using a read-once model of block references. This has widespread applicability to key I/O-bound applications such as external merging and concurrent playback of multiple video streams. Two realistic lookahead models, global lookahead and local lookahead, are defined. Algorithms NOM and GREED based on these two forms of lookahead are analyzed for shared buffer and distributed buffer configurations, both of which occur frequently in existing systems. An important aspect of our work is that we show how to implement both the models of lookahead in practice using the simple techniques of forecasting and flushing. \par Given a D-disk parallel I/O system and a globally shared I/O buffer that can hold up to M disk blocks, we derive a lower bound of $\Omega(\sqrt{D})$ on the competitive ratio of any deterministic online prefetching algorithm with O(M) lookahead. NOM is shown to match the lower bound using global M-block lookahead. In contrast, using only local lookahead results in an $\Omega(D)$ competitive ratio. When the buffer is distributed into D portions of M/D blocks each, the algorithm GREED based on local lookahead is shown to be optimal, and NOM is within a constant factor of optimal. Thus we provide a theoretical basis for the intuition that global lookahead is more valuable for prefetching in the case of a shared buffer configuration whereas it is enough to provide local lookahead in case of the distributed configuration. Finally, we analyze the performance of these algorithms for reference strings generated by a uniformly-random stochastic process and we show that they achieve the minimal expected number of I/Os. These results also give bounds on the worst-case expected performance of algorithms which employ randomization in the data layout.}, comment = {See also barve:competitive. They propose two methods for scheduling prefetch operations in the situation where the access pattern is largely known in advance, in such a way as to minimize the total number of parallel I/Os. The two methods are quite straightforward, and yet match the optimum lower bound for an on-line algorithm.} } @InProceedings{barve:round, author = {Rakesh Barve and Phillip B. Gibbons and Bruce K.
Hillyer and Yossi Matias and Elizabeth Shriver and Jeffrey Scott Vitter}, title = {Round-like Behavior in Multiple Disks on a Bus}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {1--9}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Shriver.ps}, keyword = {disk, I/O bus, parallel I/O, pario-bib}, abstract = {In modern I/O architectures, multiple disk drives are attached to each I/O bus. Under I/O-intensive workloads, the disk latency for a request can be overlapped with the disk latency and data transfers of requests to other disks, potentially resulting in an aggregate I/O throughput at nearly bus bandwidth. This paper reports on a performance impairment that results from a previously unknown form of convoy behavior in disk I/O, which we call rounds. In rounds, independent requests to distinct disks convoy, so that each disk services one request before any disk services its next request. We analyze log files to describe read performance of multiple Seagate Wren-7 disks that share a SCSI bus under a heavy workload, demonstrating the rounds behavior and quantifying its performance impact.} } @InProceedings{bester:gass, author = {Joseph Bester and Ian Foster and Carl Kesselman and Jean Tedesco and Steven Tuecke}, title = {{GASS}: A Data Movement and Access Service for Wide Area Computing Systems}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {78--88}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Tedesco.ps}, keyword = {wide-area network, parallel I/O, pario-bib}, abstract = {In wide area computing, programs frequently execute at sites that are distant from their data. Data access mechanisms are required that place limited functionality demands on an application or host system yet permit high-performance implementations. To address these requirements, we propose a data movement and access service called Global Access to Secondary Storage (GASS). This service defines a global name space via Uniform Resource Locators and allows applications to access remote files via standard I/O interfaces. High performance is achieved by incorporating default data movement strategies that are specialized for I/O patterns common in wide area applications and by providing support for programmer management of data movement. GASS forms part of the Globus toolkit, a set of services for high-performance distributed computing. GASS itself makes use of Globus services for security and communication, and other Globus components use GASS services for executable staging and real-time remote monitoring. Application experiences demonstrate that the library has practical utility.} } @InProceedings{burns:delta, author = {Randal C. Burns and Darrell D. E. Long}, title = {Efficient Distributed Backup with Delta Compression}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {26--36}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {distributed file system, file system backup, tertiary storage, compression}, abstract = {Inexpensive storage and more powerful processors have resulted in a proliferation of data that needs to be reliably backed up. 
Network resource limitations make it increasingly difficult to back up a distributed file system on a nightly or even weekly basis. By using delta compression algorithms, which minimally encode a version of a file using only the bytes that have changed, a backup system can compress the data sent to a server. With the delta backup technique, we can achieve significant savings in network transmission time over previous techniques. \par Our measurements indicate that file system data may, on average, be compressed to within 10\% of its original size with this method and that approximately 45\% of all changed files have also been backed up in the previous week. Based on our measurements, we conclude that a small file store on the client that contains copies of previously backed-up files can be used to retain versions in order to generate delta files. \par To reduce the load on the backup server, we implement a modified version storage architecture, {\em version jumping}, that allows us to restore delta-encoded file versions with at most two accesses to tertiary storage. This minimizes server workload and network transmission time on file restore.}, comment = {Use delta compression to save a set of differences between versions rather than to back up a file system by copying whole files. They need to keep a reference version (a complete file), and the deltas. Since deltas are computed off of intermediate versions, they depend on each other; to recover a version, you need to rebuild all intermediate versions from the base file. They propose version jumping, by computing deltas not from the previous version but from a common earlier version. Thus, you can reconstruct a version from one of the reference versions, and one delta. That's good when you need to get these off of tape!} } @InProceedings{chang:reuse, author = {Tai-Sheng Chang and Sangyup Shim and David H.~C. Du}, title = {The Scalability of Spatial Reuse Based Serial Storage Interfaces}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {93--101}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {I/O interface, I/O network, I/O architecture, parallel I/O, pario-bib}, abstract = {Due to the growing popularity of emerging applications such as digital libraries, Video-On-Demand, distance learning, and the Internet World-Wide Web, multimedia servers with a large capacity and high performance storage subsystem are in high demand. Serial storage interfaces are emerging technologies designed to improve the performance of such storage subsystems. They provide high bandwidth, fault tolerance, fair bandwidth sharing and long distance connection capability. All of these issues are critical in designing a scalable and high performance storage subsystem. Some of the serial storage interfaces provide the spatial reuse feature which allows multiple concurrent transmissions. That is, multiple hosts can access disks concurrently with full link bandwidth if their access paths are disjoint. Spatial reuse provides a way to build a storage subsystem whose aggregate bandwidth may be scaled up with the number of hosts. However, it is not clear how much the performance of a storage subsystem could be improved by the spatial reuse with different configurations and traffic scenarios. Both the limitations and the capabilities of this scalability need to be investigated.
To understand their fundamental performance characteristics, we derive an analytic model for the serial storage interfaces with the spatial reuse feature. Based on this model, we investigate the maximum aggregate throughput from different system configurations and load distributions. We show how the number of disks needed to saturate a loop varies with different numbers of hosts and different load scenarios. We also show how load balancing by uniformly distributing the load to all the disks on a loop may incur high overhead. This is because the accesses to far-away disks need to go through many links and consume the bandwidth of each link they go through. The results show the achievable throughput may be reduced by more than half in some cases.} } @InProceedings{chen:panda, author = {Y. Chen and M. Winslett and K. E. Seamons and S. Kuo and Y. Cho and M. Subramaniam}, title = {Scalable Message Passing in {Panda}}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {109--121}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, parallel file system, pario-bib}, abstract = {To provide high performance for applications with a wide variety of i/o requirements and to support many different parallel platforms, the design of a parallel i/o system must provide for efficient utilization of available bandwidth both for disk traffic and for message passing. In this paper we discuss the message-passing scalability of the server-directed i/o architecture of Panda, a library for synchronized i/o of multidimensional arrays on parallel platforms. We show how to improve i/o performance in situations where message-passing is a bottleneck, by combining the server-directed i/o strategy for highly efficient use of available disk bandwidth with new mechanisms to minimize internal communication and computation overhead in Panda. We present experimental results that show that with these improvements, Panda will provide high i/o performance for a wider range of applications, such as applications running with slow interconnects, applications performing i/o operations on large numbers of arrays, or applications that require drastic data rearrangements as data are moved between memory and disk (e.g., array transposition). We also argue that in the future, the improved approach to message-passing will allow Panda to support applications that are not closely synchronized or that run in heterogeneous environments.}, comment = {See seamons:panda. This paper goes further with some communication improvements.} } @InProceedings{cho:local, author = {Yong Cho and Marianne Winslett and Mahesh Subramaniam and Ying Chen and Szu-wen Kuo and Kent E. Seamons}, title = {Exploiting Local Data in Parallel Array {I/O} on a Practical Network of Workstations}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {1--13}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {parallel I/O, distributed system, pario-bib}, abstract = {A cost-effective way to run a parallel application is to use existing workstations connected by a local area network such as Ethernet or FDDI. In this paper, we present an approach for parallel I/O of multidimensional arrays on small networks of workstations with a shared-media interconnect, using the Panda I/O library.
\par In such an environment, the message passing throughput per node is lower than the throughput obtainable from a fast disk and it is not easy for users to determine the configuration which will yield the best I/O performance. \par We introduce an I/O strategy that exploits local data to reduce the amount of data that must be shipped across the network, present experimental results, and analyze the results using an analytical performance model and predict the best choice of I/O parameters. \par Our experiments show that the new strategy results in a factor of 1.2--2.1 speedup in response time compared to the Panda version originally developed for the IBM SP2, depending on the array sizes, distributions and compute and I/O node meshes. Further, the performance model predicts the results within a 13\% margin of error.}, comment = {They examine a system that supports nodes that are both compute and I/O nodes. The assumption is that the application is writing data to a new file, and does not care to which disks the data goes. They are trying to decide which nodes should be used for I/O, given the distribution of data on compute nodes and the distribution desired across disks. They use a Hungarian algorithm to solve a weighted optimization problem on a bipartite graph connecting I/O nodes to compute nodes, in an attempt to minimize the data flow across the network. But there is no attempt to make a decision that might be sensible for a future read operation that may want to read in a different pattern.} } @InProceedings{cormen:fft3, author = {Thomas H. Cormen and Jake Wegmann and David M. Nicol}, title = {Multiprocessor Out-of-Core {FFTs} with Distributed Memory and Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {68--78}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {out of core, parallel I/O, pario-bib}, abstract = {This paper extends an earlier out-of-core Fast Fourier Transform (FFT) method for a uniprocessor with the Parallel Disk Model (PDM) to use multiple processors. Four out-of-core multiprocessor methods are examined. Operationally, these methods differ in the size of ``mini-butterfly'' computed in memory and how the data are organized on the disks and in the distributed memory of the multiprocessor. The methods also perform differing amounts of I/O and communication. Two of them have the remarkable property that even though they are computing the FFT on a multiprocessor, all interprocessor communication occurs outside the mini-butterfly computations; communication that ordinarily occurs in a butterfly is folded into other data-movement operations. An analysis program shows that the two methods that use no butterfly communication usually use less communication overall than the other methods. The analysis program is fast enough that it can be invoked at run time to determine which of the four methods uses the least communication. One set of performance results on a small workstation cluster indicates that the methods without butterfly communication are approximately 9.5\% faster. Moreover, they are much easier to implement.}, comment = {They find a way to move the interprocessor communication involved in the out-of-core FFT into a single BMMC permutation between ``super-levels'', where each super-level involves log(M) stages of the FFT. This usually leads to less communication and to better overall performance.
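(A rough sketch of the arithmetic, mine rather than the paper's: an $N$-point FFT has $\lg N$ butterfly stages, so if each super-level performs $\lg M$ of them in memory, only $\lceil \lg N / \lg M \rceil$ super-levels, and hence that many inter-super-level BMMC permutations, are needed; for $N = 2^{30}$ and $M = 2^{20}$, that is 2 permutations in place of communication at each of 30 stages.)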
See also cormen:fft and cormen:fft2.} } @InProceedings{cortes:cooperative, author = {Toni Cortes and Sergi Girona and Jes\'us Labarta}, title = {Design Issues of a Cooperative Cache with no Coherence Problems}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {37--46}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {cooperative caching, distributed file system, parallel I/O, pario-bib}, abstract = {In this paper, we examine some of the important problems observed in the design of cooperative caches. Solutions to the coherence, load-balancing and fault-tolerance problems are presented. These solutions have been implemented as a part of PAFS, a parallel/distributed file system, and its performance has been compared to that achieved by xFS. Using the comparison results, we have observed that the proposed ideas not only solve the main problems of cooperative caches, but also increase the overall system performance. Although the solutions presented in this paper were targeted to a parallel machine, reasonably good results have also been obtained for networks of workstations.}, comment = {They make the claim that it is better not to replicate data into local client caches; rather, it is better to simply make remote read and write requests to the cached block in whatever memory it may be. That reduces the overhead (space and time) of replication and coherency, and leads to better performance. They also present a range of parity-based fault-tolerance mechanisms, and a load-balancing technique that reassigns cache buffers to cache-manager processes.} } @InProceedings{foster:remote-io, author = {Ian Foster and David {Kohr, Jr.} and Rakesh Krishnaiyer and Jace Mogill}, title = {Remote {I/O}: Fast Access to Distant Storage}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {14--25}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {parallel I/O, distributed file system, pario-bib}, abstract = {As high-speed networks make it easier to use distributed resources, it becomes increasingly common that applications and their data are not colocated. Users have traditionally addressed this problem by manually staging data to and from remote computers. We argue instead for a remote I/O paradigm in which programs use familiar parallel I/O interfaces to access remote filesystems. In addition to simplifying remote execution, remote I/O can improve performance relative to staging by overlapping computation and data transfer or by reducing communication requirements. However, remote I/O also introduces new technical challenges in the areas of portability, performance, and integration with distributed computing systems. We propose techniques designed to address these challenges and describe a remote I/O library called RIO that we are developing to evaluate the effectiveness of these techniques. RIO addresses issues of portability by adopting the quasi-standard MPI-IO interface and by defining a RIO device and RIO server within the ADIO abstract I/O device architecture. It addresses performance issues by providing traditional I/O optimizations such as asynchronous operations and through implementation techniques such as buffering and message forwarding to offload communication overheads.
Microbenchmarks and application experiments demonstrate that our techniques can improve turnaround time relative to staging.}, comment = {They want to support users that have datasets at different locations on the Internet, but need to access the data on parallel supercomputers. Rather than staging data in and out, they want to provide remote access. Issues: naming, dynamic loads, heterogeneity, security, fault-tolerance. All traffic goes through a ``forwarder node'' that funnels all the traffic into the network. They use URLs for pathnames (e.g., ``x-rio://...''). They find that non-blocking ops are important, as is collective I/O. They think that buffering will be important. Limited experiments.} } @InProceedings{kallahalla:read-once, author = {Mahesh Kallahalla and Peter J. Varman}, title = {Optimal Read-Once Parallel Disk Scheduling}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {68--77}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Kallahalla.ps}, keyword = {disk scheduling, parallel I/O, pario-bib}, abstract = {We present an optimal algorithm, L-OPT, for prefetching and I/O scheduling in parallel I/O systems using a read-once model of block reference. The algorithm uses knowledge of the next L block references, L-block lookahead, to schedule I/Os in an on-line manner. It uses a dynamic priority assignment scheme to decide when blocks should be prefetched, so as to minimize the total number of I/Os. The parallel disk model of an I/O system is used to study the performance of L-OPT. We show that L-OPT is comparable to the best on-line algorithm with the same amount of lookahead; the ratio of the length of its schedule to the length of the optimal schedule is within a constant factor of the best possible. Specifically, we show that the competitive ratio of L-OPT is $\Theta(\sqrt{MD/L})$, which matches the lower bound on the competitive ratio of any prefetching algorithm with L-block lookahead. In addition we show that when the lookahead consists of the entire reference string, L-OPT performs the minimum possible number of I/Os; hence L-OPT is the optimal off-line algorithm. Finally, using synthetic traces we empirically study the performance characteristics of L-OPT.} } @InProceedings{kandemir:locality, author = {M. Kandemir and A. Choudhary and J. Ramanujam and M. Kandaswamy}, title = {A Unified Compiler Algorithm for Optimizing Locality, Parallelism, and Communication in Out-of-Core Computations}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {79--92}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {compiler, out of core, parallel I/O, pario-bib}, abstract = {This paper presents compiler algorithms to optimize out-of-core programs. These algorithms consider loop and data layout transformations in a unified framework. The performance of an out-of-core loop nest containing many references can be improved by a combination of restructuring the loops and file layouts. This approach considers array references one-by-one and attempts to optimize each reference for parallelism and locality. When there are references for which parallelism optimizations do not work, communication is vectorized so that data transfer can be performed before the innermost tiling loop.
Preliminary results from hand-compiles on IBM SP-2 and Intel Paragon show that this approach reduces the execution time, improves the bandwidth speedup and overall speedup. In addition, we extend the base algorithm to work with file layout constraints and show how it can be used for optimizing programs consisting of multiple loop nests.} } @InProceedings{kotz:explore, author = {David Kotz and Ting Cai}, title = {Exploring the use of {I/O} Nodes for Computation in a {MIMD} Multiprocessor}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {78--89}, earlier = {kotz:explore-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/kotz:explore.ps.Z}, keyword = {parallel I/O, multiprocessor file system, dfk, pario-bib}, abstract = {As parallel systems move into the production scientific-computing world, the emphasis will be on cost-effective solutions that provide high throughput for a mix of applications. Cost-effective solutions demand that a system make effective use of all of its resources. Many MIMD multiprocessors today, however, distinguish between ``compute'' and ``I/O'' nodes, the latter having attached disks and being dedicated to running the file-system server. This static division of responsibilities simplifies system management but does not necessarily lead to the best performance in workloads that need a different balance of computation and I/O. \par Of course, computational processes sharing a node with a file-system service may receive less CPU time, network bandwidth, and memory bandwidth than they would on a computation-only node. In this paper we begin to examine this issue experimentally. We found that high-performance I/O does not necessarily require substantial CPU time, leaving plenty of time for application computation. There were some complex file-system requests, however, which left little CPU time available to the application. (The impact on network and memory bandwidth still needs to be determined.) For applications (or users) that cannot tolerate an occasional interruption, we recommend that they continue to use only compute nodes. For tolerant applications needing more cycles than those provided by the compute nodes, we recommend that they take full advantage of {\em both\/} compute and I/O nodes for computation, and that operating systems should make this possible.} } @InProceedings{krieger:hfs2, author = {Orran Krieger and Michael Stumm}, title = {{HFS}: A Performance-Oriented Flexible File System Based on Building-Block Compositions}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {95--108}, publisher = {ACM Press}, address = {Philadelphia}, earlier = {krieger:hfs}, later = {krieger:hfs3}, keyword = {parallel I/O, parallel file system, object-oriented, pario-bib}, abstract = {The Hurricane File System (HFS) is designed for (potentially large-scale) shared memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies and I/O interfaces. Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. 
For example, a file's structure can be optimized for concurrent random-access write-only operations by ten processes. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most existing parallel file systems support a single file structure and a small set of policies. \par We have implemented HFS as part of the Hurricane operating system running on the Hector shared memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. We also show that for a number of file access patterns HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.}, comment = {A published form of krieger:hfs and the thesis krieger:thesis. Their main point is that the file system is constructed from building-block objects. When you create a file you choose a few building blocks, for example, a replication block that mirrors the file, and some distribution blocks that distribute each replica across a set of disks. When you open the file you plug in some more building blocks, e.g., to do prefetching or to provide the kind of interface that you want to use. They point out that this flexibility is critical to be able to get good performance, because different file-access patterns need different structures and policies. They found that mapped files minimize copying costs and improve performance. They were able to obtain full disk bandwidth. Great paper.} } @InProceedings{kuo:efficient, author = {S. Kuo and M. Winslett and Y. Cho and J. Lee and Y. Chen}, title = {Efficient Input and Output for Scientific Simulations}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {33--44}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Kuo.ps}, keyword = {scientific computing, simulation, parallel I/O, pario-bib}, abstract = {Large simulations which run for hundreds of hours on parallel computers often periodically generate snapshots of states, which are later post-processed to visualize the simulated physical phenomenon. For many applications, fast I/O during post-processing, which is dependent on an efficient organization of data on disk, is as important as minimizing computation-time I/O. In this paper we propose optimizations to support efficient parallel I/O for scientific simulations and subsequent visualizations. We present an ordering mechanism to linearize data on disk, a performance model to help to choose a proper stripe unit size, and a scheduling algorithm to minimize communication contention. Our experiments on an IBM SP show that the combination of these strategies provides a 20-25\% performance boost.} } @InProceedings{mache:spatial, author = {Jens Mache and Virginia Lo and Marilynn Livingston and Sharad Garg}, title = {The Impact of Spatial Layout of Jobs on Parallel {I/O} Performance}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {45--56}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Mache.ps}, keyword = {parallel I/O, pario-bib}, abstract = {Input/Output is a big obstacle to effective use of teraflops-scale computing systems. 
Motivated by earlier parallel I/O measurements on an Intel TFLOPS machine, we conduct studies to determine the sensitivity of parallel I/O performance on multi-programmed mesh-connected machines with respect to number of I/O nodes, number of compute nodes, network link bandwidth, I/O node bandwidth, spatial layout of jobs, and read or write demands of applications. \par Our extensive simulations and analytical modeling yield important insights into the limitations on parallel I/O performance due to network contention, and into the possible gains in parallel I/O performance that can be achieved by tuning the spatial layout of jobs. \par Applying these results, we devise a new processor allocation strategy that is sensitive to parallel I/O traffic and the resulting network contention. In performance evaluations driven by synthetic workloads and by a real workload trace captured at the San Diego Supercomputing Center, the new strategy improves the average response time of parallel I/O intensive jobs by up to a factor of 4.5.} } @InProceedings{madhyastha:classification, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Input/Output Access Pattern Classification Using Hidden {Markov} Models}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {57--67}, publisher = {ACM Press}, address = {San Jose, CA}, later = {madhyastha:thesis}, keyword = {workload characterization, file access pattern, parallel I/O, pario-bib}, abstract = {Input/output performance on current parallel file systems is sensitive to a good match of application access pattern to file system capabilities. Automatic input/output access classification can determine application access patterns at execution time, guiding adaptive file system policies. In this paper we examine a new method for access pattern classification that uses hidden Markov models, trained on access patterns from previous executions, to create a probabilistic model of input/output accesses. We compare this approach to a neural network classification framework, presenting performance results from parallel and sequential benchmarks and applications.}, comment = {The most interesting thing in this paper is the use of a Hidden Markov Model to understand the access pattern of an application to a file. After running the application on the file once, and simultaneously training their HMM, they use the result to tune the system for the next execution (cache size, cache partitioning, prefetching, Intel file mode, etc). They get much better performance in future runs. See also madhyastha:thesis, and related papers.} } @InProceedings{moore:detection, author = {Jason A. Moore and Philip J. Hatcher and Michael J. Quinn}, title = {Efficient Data-Parallel Files via Automatic Mode Detection}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {1--14}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, data parallelism, pario-bib}, abstract = {Parallel languages rarely specify parallel I/O constructs, and existing commercial systems provide the programmer with a low-level I/O interface. We present design principles for integrating I/O into languages and show how these principles are applied to a virtual-processor-oriented language. We illustrate how machine-independent modes are used to support both high performance and generality. 
We describe an automatic mode detection technique that saves the programmer from extra syntax and low-level file system details. We show how virtual processor file operations, typically small by themselves, are combined into efficient large-scale file system calls. Finally, we present a variety of benchmark results detailing design tradeoffs and the performance of various modes.}, comment = {Updated version of TR 95-80-9. See moore:stream. Interesting approach, where they permit a fairly normal fread and fwrite kind of interface, with each VP having its own stream. They choose their own format for the file, and switch between formats (and internal buffering) depending on the particulars of the fread and fwrite parameters. They seem to have good performance, and a familiar interface. They are left with a non-standard file format.} } @InProceedings{nieuwejaar:galley-perf, author = {Nils Nieuwejaar and David Kotz}, title = {Performance of the {Galley} Parallel File System}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {83--94}, publisher = {ACM Press}, address = {Philadelphia}, later = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:galley-perf.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. This interface conceals the parallelism within the file system, which increases the ease of programmability, but makes it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. Furthermore, most current parallel file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic parallel workloads. Initial experiments, reported in this paper, indicate that Galley is capable of providing high-performance I/O to applications that access data in patterns that have been observed to be common.}, comment = {See also nieuwejaar:galley.} } @InProceedings{prabhakar:browsing, author = {Sunil Prabhakar and Divyakant Agrawal and Amr {El Abbadi} and Ambuj Singh and Terence Smith}, title = {Browsing and Placement of Multiresolution Images on Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {102--113}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {multimedia, parallel I/O, pario-bib}, abstract = {With rapid advances in computer and communication technologies, there is an increasing demand to build and maintain large image repositories. In order to reduce the demands on I/O and network resources, multiresolution representations are being proposed for the storage organization of images. Image decomposition techniques such as {\em wavelets} can be used to provide these multiresolution images. The original image is represented by several coefficients, one of them with visual similarity to the original image, but at a lower resolution. 
These visually similar coefficients can be thought of as {\em thumbnails} or {\em icons} of the original image. This paper addresses the problem of storing these multiresolution coefficients on disks so that thumbnail browsing as well as image reconstruction can be performed efficiently. Several strategies are evaluated to store the image coefficients on parallel disks. These strategies can be classified into two broad classes depending on whether the access pattern of the images is used in the placement. Disk simulation is used to evaluate the performance of these strategies. Simulation results are validated with results from experiments with real disks and are found to be in good agreement. The results indicate that significant performance improvements can be achieved with as few as four disks by placing image coefficients based upon browsing access patterns.}, comment = {They use simulation to study several different placement policies for the thumbnail and varying-resolution versions of images on a disk array.} } @InProceedings{schwabe:layouts, author = {Eric J. Schwabe and Ian M. Sutherland and Bruce K. Holmer}, title = {Evaluating Approximately Balanced Parity-Declustered Data Layouts for Disk Arrays}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {41--54}, publisher = {ACM Press}, address = {Philadelphia}, later = {schwabe:jlayouts}, keyword = {parallel I/O, disk array, parity, RAID, pario-bib}, abstract = {Parity declustering has been used to reduce the time required to reconstruct a failed disk in a disk array. Most existing work on parity declustering uses BIBD-based data layouts, which distribute the workload of reconstructing a failed disk over the remaining disks of the array with perfect balance. For certain array sizes, however, there is no known BIBD-based layout. In this paper, we evaluate data layouts that are approximately balanced --- that is, that distribute the reconstruction workload over the disks of the array with only approximate balance. Approximately balanced layouts are considerably easier to construct than perfectly balanced layouts. We consider three methods for generating approximately balanced layouts: randomization, simulated annealing, and perturbing a BIBD-based layout whose size is near the desired size. We compare the performance of these approximately balanced layouts with that of perfectly balanced layouts using a disk array simulator. We conclude that, on uniform workloads, approximately balanced data layouts have performance nearly identical to that of perfectly balanced layouts. Approximately balanced layouts therefore provide the reconstruction performance benefits of perfectly balanced layouts for arrays where perfectly balanced layouts are either not known, or do not exist.} } @InProceedings{soloviev:prefetching, author = {Valery V. Soloviev}, title = {Prefetching in Segmented Disk Cache for Multi-Disk Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {69--82}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, prefetching, disk cache, disk array, pario-bib}, abstract = {This paper investigates the performance of a multi-disk storage system equipped with a segmented disk cache processing a workload of multiple relational scans. Prefetching is a popular method of improving the performance of scans. 
Many modern disks have a multisegment cache which can be used for prefetching. We observe that, exploiting declustering as a data placement method, prefetching in a segmented cache causes a load imbalance among several disks. A single disk becomes a bottleneck, degrading performance of the entire system. A variation in disk queue length is a primary factor of the imbalance. Using a precise simulation model, we investigate several approaches to achieving better balancing. Our metrics are a scan response time for the closed-end system and an ability to sustain a workload without saturating for the open-end system. We arrive at two main conclusions: (1) Prefetching in main memory is inexpensive and effective for balancing and can supplement or substitute prefetching in disk cache. (2) Disk-level prefetching provides about the same performance as main memory prefetching if request queues are managed in the disk controllers rather than in the host. Checking the disk cache before queuing requests provides not only better request response time but also drastically improves balancing. A single cache performs better than a segmented cache for this method.}, comment = {An interesting paper about disk-controller cache management in database workloads. Actually, the workloads are sequential scans of partitioned files, which could occur in many kinds of workloads. The declustering pattern (partitioning) is a little unusual for most scientific parallel I/O veterans, who are used to striping. And the cache-management algorithms seem a bit strange, particularly the fact that the cache appears to be used only for explicit prefetch requests. Turns out that it is best to put the prefetching and disk queueing in the same place, either on the controller or in main memory, to avoid load imbalance that arises from randomness in the workload, which is accentuated into a big bottleneck and a convoy effect.} } @InProceedings{thakur:mpi-io-implement, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {On Implementing {MPI-IO} Portably and with High Performance}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {23--32}, publisher = {ACM Press}, address = {Atlanta, GA}, earlier = {thakur:mpi-io-implement-tr}, URL = {http://www.mcs.anl.gov/~thakur/papers/mpio-impl.ps.gz}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {We discuss the issues involved in implementing MPI-IO portably on multiple machines and file systems and also achieving high performance. One way to implement MPI-IO portably is to implement it on top of the basic Unix I/O functions ({\tt open}, {\tt lseek}, {\tt read}, {\tt write}, and {\tt close}), which are themselves portable. We argue that this approach has limitations in both functionality and performance. We instead advocate an implementation approach that combines a large portion of portable code and a small portion of code that is optimized separately for different machines and file systems. We have used such an approach to develop a high-performance, portable MPI-IO implementation, called ROMIO. \par In addition to basic I/O functionality, we consider the issues of supporting other MPI-IO features, such as 64-bit file sizes, noncontiguous accesses, collective I/O, asynchronous I/O, consistency and atomicity semantics, user-supplied hints, shared file pointers, portable data representation, and file preallocation.
We describe how we implemented each of these features on various machines and file systems. The machines we consider are the HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, SGI Origin2000, and networks of workstations; and the file systems we consider are HP HFS, IBM PIOFS, Intel PFS, NEC SFS, SGI XFS, NFS, and any general Unix file system (UFS). \par We also present our thoughts on how a file system can be designed to better support MPI-IO. We provide a list of features desired from a file system that would help in implementing MPI-IO correctly and with high performance.} } @InProceedings{toledo:solar, author = {Sivan Toledo and Fred G. Gustavson}, title = {The Design and Implementation of {SOLAR}, a Portable Library for Scalable Out-of-Core Linear Algebra Computations}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {28--40}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, out-of-core, linear algebra, pario-bib}, abstract = {SOLAR is a portable high-performance library for out-of-core dense matrix computations. It combines portability with high performance by using existing high-performance in-core subroutine libraries and by using an optimized matrix input-output library. SOLAR works on parallel computers, workstations, and personal computers. It supports in-core computations on both shared-memory and distributed-memory machines, and its matrix input-output library supports both conventional I/O interfaces and parallel I/O interfaces. This paper discusses the overall design of SOLAR, its interfaces, and the design of several important subroutines. Experimental results show that SOLAR can factor on a single workstation an out-of-core positive-definite symmetric matrix at a rate exceeding 215 Mflops, and an out-of-core general matrix at a rate exceeding 195 Mflops. Less than 16\% of the running time is spent on I/O in these computations. These results indicate that SOLAR's portability does not compromise its performance. We expect that the combination of portability, modularity, and the use of a high-level I/O interface will make the library an important platform for research on out-of-core algorithms and on parallel I/O.}, comment = {Sounds great. Library package that supports LAPACK-like functionality on in-core and out-of-core matrices. Good performance. Good portability (IBM workstation, IBM SP-2, and OS/2 laptop). They separate the matrix algorithms from the underlying I/O routines in an interesting way (read and write submatrices), leaving just enough information to allow the I/O system to do some higher-level optimizations.} } @InProceedings{weissman:smart, author = {Jon B. Weissman}, title = {Smart File Objects: A Remote File Access Paradigm}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {89--97}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Weissman.ps}, keyword = {object, parallel I/O, pario-bib}, abstract = {This paper describes a new scheme for remote file access called Smart File Objects (SFO). The SFO is an object-oriented application-specific file access paradigm designed to attack the bottleneck imposed by high latency, low bandwidth networks such as wide-area and wireless networks. 
The SFO uses application and network information to adaptively prefetch needed data in parallel with the execution of the application. The SFO can offer additional advantages such as non-blocking I/O, bulk I/O, improved file access APIs, and increased reliability. We describe the SFO concept, a prototype implementation in the Mentat system, and the results obtained with a distributed gene sequence application running across the Internet and vBNS. The results show the potential of the SFO approach to improve application performance.} } @InProceedings{wisniewski:in-place, author = {Leonard F. Wisniewski}, title = {Structured Permuting in Place on Parallel Disk Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {128--139}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, parallel I/O algorithm, permutation, out-of-core, pario-bib}, abstract = {The ability to perform permutations of large data sets in place reduces the amount of necessary available disk storage. The simplest way to perform a permutation often is to read the records of a data set from a source portion of data storage, permute them in memory, and write them to a separate target portion of the same size. It can be quite expensive, however, to provide disk storage that is twice the size of very large data sets. Permuting in place reduces the expense by using only a small amount of extra disk storage beyond the size of the data set. \par \newcommand{\ceil}[1]{\lceil #1\rceil} \newcommand{\rank}[1]{\mathop{\rm rank}\nolimits #1} This paper features in-place algorithms for commonly used structured permutations. We have developed an asymptotically optimal algorithm for performing BMMC (bit-matrix-multiply/complement) permutations in place that requires at most $\frac{2N}{BD}\left( 2\ceil{\frac{\rank{\gamma}}{\lg (M/B)}} + \frac{7}{2}\right)$ parallel disk accesses, as long as $M \geq 2BD$, where $N$ is the number of records in the data set, $M$ is the number of records that can fit in memory, $D$ is the number of disks, $B$ is the number of records in a block, and $\gamma$ is the lower left $\lg (N/B) \times \lg B$ submatrix of the characteristic matrix for the permutation. This algorithm uses $N+M$ records of disk storage and requires only a constant factor more parallel disk accesses and insignificant additional computation than a previously published asymptotically optimal algorithm that uses $2N$ records of disk storage. \par We also give algorithms to perform mesh and torus permutations on a $d$-dimensional mesh. The in-place algorithm for mesh permutations requires at most $3\ceil{N/BD}$ parallel I/Os and the in-place algorithm for torus permutations uses at most $4dN/BD$ parallel I/Os. The algorithms for mesh and torus permutations require no extra disk space as long as the memory size $M$ is at least $3BD$. The torus algorithm improves upon the previous best algorithm in terms of both time and space.} } @InProceedings{zhou:threads, author = {Yuanyuan Zhou and Limin Wang and Douglas W. 
Clark and Kai Li}, title = {Thread Scheduling for Out-of-Core Applications with Memory Server on Multicomputers}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {57--67}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Zhou.ps}, keyword = {threads, scheduling, memory, out-of-core application, parallel I/O, pario-bib}, abstract = {Out-of-core applications perform poorly in paged virtual memory (VM) systems because demand paging involves slow disk I/O accesses. Much research has been done on reducing the I/O overhead in such applications by either reducing the number of I/Os or lowering the cost of each I/O operation. In this paper, we investigate a method that combines fine-grained threading with a memory server model to improve the performance of out-of-core applications on multicomputers. The memory server model decreases the average cost of I/O operations by paging to remote memory, while the fine-grained thread scheduling reduces the number of I/O accesses by improving the data locality of applications. We have evaluated this method on an Intel Paragon with 7 applications. Our results show that the memory server system performs better than the VM disk paging by a factor of 5 for sequential applications and a factor of 1.5 to 2.2 for parallel applications. The fine-grained threading alone improves the VM disk paging performance by a factor of 10 and 1.2 to 3 respectively for sequential and parallel applications. Overall, the combination of these two techniques outperforms the VM disk paging by more than a factor of 12 for sequential applications and a factor of 3 to 6 for parallel applications.} }
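% The zhou:threads entry above combines fine-grained threading with a memory
% server that pages to remote memory before falling back to disk. A minimal,
% self-contained C sketch of that fault path (my illustration, not code from
% the paper; memserver_fetch and disk_read are hypothetical stubs):
%
%   #include <stdlib.h>
%
%   /* Stub: try to fetch the page from a remote memory server; 0 on hit. */
%   static int memserver_fetch(unsigned long page, void *frame) {
%       (void)page; (void)frame;
%       return -1; /* pretend the remote memory pool missed */
%   }
%
%   /* Stub: read the page from the local paging disk (the slow path). */
%   static void disk_read(unsigned long page, void *frame) {
%       (void)page; (void)frame;
%   }
%
%   /* Resolve a page fault: a remote-memory hit avoids disk I/O entirely,
%      which is the source of the speedups the abstract reports over plain
%      VM disk paging. */
%   void *resolve_fault(unsigned long page, size_t page_size) {
%       void *frame = malloc(page_size);
%       if (frame != NULL && memserver_fetch(page, frame) != 0)
%           disk_read(page, frame); /* miss: fall back to VM disk paging */
%       return frame;
%   }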