This file contains a BibTeX bibliography for the papers that appear in
IOPADS '96, the Fourth Workshop on Input/Output in Parallel and
Distributed Systems. Please leave this header on the file; bibtex won't
mind. You can find more about this conference at
http://www.cs.dartmouth.edu/iopads96

@string{iopads96 = "Fourth Workshop on Input/Output in Parallel and Distributed Systems"}

@InProceedings{moore:detection,
  author = "Jason A. Moore and Phil Hatcher and Michael J. Quinn",
  title = "Efficient Data-Parallel Files via Automatic Mode Detection",
  pages = "1--14",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, data parallelism",
  abstract = {Parallel languages rarely specify parallel I/O constructs, and existing commercial systems provide the programmer with a low-level I/O interface. We present design principles for integrating I/O into languages and show how these principles are applied to a virtual-processor-oriented language. We illustrate how machine-independent modes are used to support both high performance and generality. We describe an automatic mode detection technique that saves the programmer from extra syntax and low-level file system details. We show how virtual processor file operations, typically small by themselves, are combined into efficient large-scale file system calls. Finally, we present a variety of benchmark results detailing design tradeoffs and the performance of various modes.}
}

@InProceedings{acharya:tuning,
  author = "Anurag Acharya and Mustafa Uysal and Robert Bennett and Assaf Mendelson and Michael Beynon and Jeffrey K. Hollingsworth and Joel Saltz and Alan Sussman",
  title = "Tuning the Performance of {I/O} Intensive Parallel Applications",
  pages = "15--27",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, filesystem workload, parallel application",
  abstract = {Getting good I/O performance from parallel programs is a critical problem for many application domains. In this paper, we report our experience tuning the I/O performance of four application programs from the areas of satellite-data processing and linear algebra. After tuning, three of the four applications achieve application-level I/O rates of over 100 MB/s on 16 processors. The total volume of I/O required by the programs ranged from about 75 MB to over 200 GB. We report the lessons learned in achieving high I/O performance from these applications, including the need for code restructuring, local disks on every node and knowledge of future I/O requests. We also report our experience on achieving high performance on peer-to-peer configurations.
    Finally, we comment on the necessity of complex I/O interfaces like collective I/O and strided requests to achieve high performance.}
}

@InProceedings{toledo:solar,
  author = "Sivan Toledo and Fred G. Gustavson",
  title = "The Design and Implementation of {SOLAR}, a Portable Library for Scalable Out-of-Core Linear Algebra Computations",
  pages = "28--40",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, out-of-core, linear algebra",
  abstract = {SOLAR is a portable high-performance library for out-of-core dense matrix computations. It combines portability with high performance by using existing high-performance in-core subroutine libraries and by using an optimized matrix input-output library. SOLAR works on parallel computers, workstations, and personal computers. It supports in-core computations on both shared-memory and distributed-memory machines, and its matrix input-output library supports both conventional I/O interfaces and parallel I/O interfaces. This paper discusses the overall design of SOLAR, its interfaces, and the design of several important subroutines. Experimental results show that SOLAR can factor on a single workstation an out-of-core positive-definite symmetric matrix at a rate exceeding 215 Mflops, and an out-of-core general matrix at a rate exceeding 195 Mflops. Less than 16\% of the running time is spent on I/O in these computations. These results indicate that SOLAR's portability does not compromise its performance. We expect that the combination of portability, modularity, and the use of a high-level I/O interface will make the library an important platform for research on out-of-core algorithms and on parallel I/O.}
}

@InProceedings{schwabe:layouts,
  author = "Eric J. Schwabe and Ian M. Sutherland and Bruce K. Holmer",
  title = "Evaluating Approximately Balanced Parity-Declustered Data Layouts for Disk Arrays",
  pages = "41--54",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, disk array, parity, RAID",
  abstract = {Parity declustering has been used to reduce the time required to reconstruct a failed disk in a disk array. Most existing work on parity declustering uses BIBD-based data layouts, which distribute the workload of reconstructing a failed disk over the remaining disks of the array with perfect balance. For certain array sizes, however, there is no known BIBD-based layout. In this paper, we evaluate data layouts that are approximately balanced --- that is, that distribute the reconstruction workload over the disks of the array with only approximate balance. Approximately balanced layouts are considerably easier to construct than perfectly balanced layouts. We consider three methods for generating approximately balanced layouts: randomization, simulated annealing, and perturbing a BIBD-based layout whose size is near the desired size. We compare the performance of these approximately balanced layouts with that of perfectly balanced layouts using a disk array simulator. We conclude that, on uniform workloads, approximately balanced data layouts have performance nearly identical to that of perfectly balanced layouts.
    Approximately balanced layouts therefore provide the reconstruction performance benefits of perfectly balanced layouts for arrays where perfectly balanced layouts are either not known, or do not exist.}
}

@InProceedings{purakayastha:enwrich,
  author = "Apratim Purakayastha and Carla Ellis and David Kotz",
  title = "{ENWRICH}: A Compute-Processor Write Caching Scheme for Parallel File Systems",
  pages = "55--68",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel file system, file caching",
  abstract = {Many parallel scientific applications need high-performance I/O\@. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is {\em collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed {\em disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.}
}

@InProceedings{soloviev:prefetching,
  author = "Valery V. Soloviev",
  title = "Prefetching in Segmented Disk Cache for Multi-Disk Systems",
  pages = "69--82",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, prefetching, disk cache, disk array",
  abstract = {This paper investigates the performance of a multi-disk storage system equipped with a segmented disk cache processing a workload of multiple relational scans. Prefetching is a popular method of improving the performance of scans. Many modern disks have a multisegment cache which can be used for prefetching. We observe that, exploiting declustering as a data placement method, prefetching in a segmented cache causes a load imbalance among several disks. A single disk becomes a bottleneck, degrading performance of the entire system. A variation in disk queue length is a primary factor of the imbalance. Using a precise simulation model, we investigate several approaches to achieving better balancing.
    Our metrics are a scan response time for the closed-end system and an ability to sustain a workload without saturating for the open-end system. We arrive at two main conclusions: (1) Prefetching in main memory is inexpensive and effective for balancing and can supplement or substitute prefetching in disk cache. (2) Disk-level prefetching provides about the same performance as main memory prefetching if request queues are managed in the disk controllers rather than in the host. Checking the disk cache before queuing requests provides not only better request response time but also drastically improves balancing. A single cache performs better than a segmented cache for this method.}
}

@InProceedings{nieuwejaar:galley,
  author = "Nils Nieuwejaar and David Kotz",
  title = "Performance of the {Galley} Parallel File System",
  pages = "83--94",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel file system",
  abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. This interface conceals the parallelism within the file system, which increases the ease of programmability, but makes it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. Furthermore, most current parallel file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic parallel workloads. Initial experiments, reported in this paper, indicate that Galley is capable of providing high-performance I/O to applications that access data in patterns that have been observed to be common.}
}

@InProceedings{krieger:hfs,
  author = "Orran Krieger and Michael Stumm",
  title = "{HFS}: A Performance-Oriented Flexible File System Based on Building-Block Compositions",
  pages = "95--108",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel file system, object-oriented",
  abstract = {The Hurricane File System (HFS) is designed for (potentially large-scale) shared memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies and I/O interfaces. Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. For example, a file's structure can be optimized for concurrent random-access write-only operations by ten processes. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most existing parallel file systems support a single file structure and a small set of policies. \par We have implemented HFS as part of the Hurricane operating system running on the Hector shared memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead.
    We also show that for a number of file access patterns HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.}
}

@InProceedings{chen:panda,
  author = "Y. Chen and M. Winslett and K. E. Seamons and S. Kuo and Y. Cho and M. Subramaniam",
  title = "Scalable Message Passing in {Panda}",
  pages = "109--121",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel file system",
  abstract = {To provide high performance for applications with a wide variety of i/o requirements and to support many different parallel platforms, the design of a parallel i/o system must provide for efficient utilization of available bandwidth both for disk traffic and for message passing. In this paper we discuss the message-passing scalability of the server-directed i/o architecture of Panda, a library for synchronized i/o of multidimensional arrays on parallel platforms. We show how to improve i/o performance in situations where message-passing is a bottleneck, by combining the server-directed i/o strategy for highly efficient use of available disk bandwidth with new mechanisms to minimize internal communication and computation overhead in Panda. We present experimental results that show that with these improvements, Panda will provide high i/o performance for a wider range of applications, such as applications running with slow interconnects, applications performing i/o operations on large numbers of arrays, or applications that require drastic data rearrangements as data are moved between memory and disk (e.g., array transposition). We also argue that in the future, the improved approach to message-passing will allow Panda to support applications that are not closely synchronized or that run in heterogeneous environments.}
}

@InProceedings{armen:disk-model,
  author = "Chris Armen",
  title = "Bounds on the Separation of Two Parallel Disk Models",
  pages = "122--127",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, theory, parallel I/O algorithm",
  abstract = {The single-disk, D-head model of parallel I/O was introduced by Aggarwal and Vitter to analyze algorithms for problem instances that are too large to fit in primary memory. Subsequently Vitter and Shriver proposed a more realistic model in which the disk space is partitioned into D disks, with a single head per disk. To date, each problem for which there is a known optimal algorithm for both models has the same asymptotic bounds on both models. Therefore, it has been unknown whether the models are equivalent or whether the single-disk model is strictly more powerful. \par In this paper we provide evidence that the single-disk model is strictly more powerful. We prove a lower bound on any general simulation of the single-disk model on the multi-disk model and establish randomized and deterministic upper bounds. Let $N$ be the problem size and let $T$ be the number of parallel I/Os required by a program on the single-disk model. Then any simulation of this program on the multi-disk model will require $\Omega\left(T \frac{\log(N/D)}{\log \log(N/D)}\right)$ parallel I/Os. This lower bound holds even if replication is allowed in the multi-disk model. We also show an $O\left(\frac{\log D}{\log \log D}\right)$ randomized upper bound and an $O\left(\log D (\log \log D)^2\right)$ deterministic upper bound.
    These results exploit an interesting analogy between the disk models and the PRAM and DCM models of parallel computation.}
}

@InProceedings{wisniewski:in-place,
  author = "Leonard F. Wisniewski",
  title = "Structured Permuting in Place on Parallel Disk Systems",
  pages = "128--139",
  booktitle = iopads96,
  year = 1996,
  address = "Philadelphia",
  month = may,
  keyword = "parallel I/O, parallel I/O algorithm, permutation, out-of-core",
  abstract = {The ability to perform permutations of large data sets in place reduces the amount of necessary available disk storage. The simplest way to perform a permutation often is to read the records of a data set from a source portion of data storage, permute them in memory, and write them to a separate target portion of the same size. It can be quite expensive, however, to provide disk storage that is twice the size of very large data sets. Permuting in place reduces the expense by using only a small amount of extra disk storage beyond the size of the data set. \par \newcommand{\ceil}[1]{\lceil #1\rceil} \newcommand{\rank}[1]{\mathop{\rm rank}\nolimits #1} This paper features in-place algorithms for commonly used structured permutations. We have developed an asymptotically optimal algorithm for performing BMMC (bit-matrix-multiply/complement) permutations in place that requires at most $\frac{2N}{BD}\left( 2\ceil{\frac{\rank{\gamma}}{\lg (M/B)}} + \frac{7}{2}\right)$ parallel disk accesses, as long as $M \geq 2BD$, where $N$ is the number of records in the data set, $M$ is the number of records that can fit in memory, $D$ is the number of disks, $B$ is the number of records in a block, and $\gamma$ is the lower left $\lg (N/B) \times \lg B$ submatrix of the characteristic matrix for the permutation. This algorithm uses $N+M$ records of disk storage and requires only a constant factor more parallel disk accesses and insignificant additional computation than a previously published asymptotically optimal algorithm that uses $2N$ records of disk storage. \par We also give algorithms to perform mesh and torus permutations on a $d$-dimensional mesh. The in-place algorithm for mesh permutations requires at most $3\ceil{N/BD}$ parallel I/Os and the in-place algorithm for torus permutations uses at most $4dN/BD$ parallel I/Os. The algorithms for mesh and torus permutations require no extra disk space as long as the memory size $M$ is at least $3BD$. The torus algorithm improves upon the previous best algorithm in terms of both time and space.}
}
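
What follows is a minimal LaTeX usage sketch, left as trailing comment text
that bibtex ignores (just like the header above). The filename iopads96.bib
and the choice of the plain bibliography style are illustrative assumptions,
not something this file prescribes; the cite keys are the ones defined above.

  \documentclass{article}
  \begin{document}
  Automatic mode detection for data-parallel files is presented by Moore,
  Hatcher, and Quinn~\cite{moore:detection}, and the Galley parallel file
  system is evaluated by Nieuwejaar and Kotz~\cite{nieuwejaar:galley}.
  \bibliographystyle{plain}  % assumption: any standard style works
  \bibliography{iopads96}    % assumption: this file is saved as iopads96.bib
  \end{document}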