%% BibTeX bibliography for the contents of the book
%%   Input/Output in Parallel and Distributed Computer Systems
%%   Ravi Jain, John Werth, and James C. Browne, editors
%%   Kluwer Academic Publishers, 1996
%%
%% All @InCollection entries cross-reference the @Book entry `iopads-book',
%% which therefore appears LAST in this file (classic BibTeX requires the
%% crossref parent to follow all of its children, and to carry `booktitle').

@InCollection{jain:pario-intro,
  author   = {Jain, Ravi and Werth, John and Browne, James C.},
  title    = {{I/O} in Parallel and Distributed Systems: An Introduction},
  chapter  = 1,
  pages    = {3--30},
  crossref = {iopads-book},
  keyword  = {parallel I/O, out-of-core, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book.},
  abstract = {We sketch the reasons for the I/O bottleneck in parallel and
    distributed systems, pointing out that it can be viewed as a special
    case of a general bottleneck that arises at all levels of the memory
    hierarchy. We argue that because of its severity, the I/O bottleneck
    deserves systematic attention at all levels of system design. We then
    present a survey of the issues raised by the I/O bottleneck in five key
    areas of parallel and distributed systems: applications, algorithms,
    compilers, operating systems and architecture. Finally, we address some
    of the trends we observe emerging in new paradigms of parallel and
    distributed computing: the convergence of networking and I/O, I/O for
    massively distributed ``global information systems'' such as the World
    Wide Web, and I/O for mobile computing and wireless communications.
    These considerations suggest exciting new research directions in I/O
    for parallel and distributed systems in the years to come.},
}

@InCollection{shriver:models-algs,
  author   = {Shriver, Elizabeth and Nodine, Mark},
  title    = {An Introduction to Parallel {I/O} Models and Algorithms},
  chapter  = 2,
  pages    = {31--68},
  crossref = {iopads-book},
  keyword  = {parallel I/O algorithms, out-of-core, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book.},
  abstract = {Problems whose data are too large to fit into main memory are
    called {\it out-of-core} problems. Out-of-core parallel-I/O algorithms
    can handle much larger problems than in-memory variants and have much
    better performance than single-device variants. However, they are not
    commonly used---partly because the understanding of them is not
    widespread. Yet such algorithms ought to be growing in importance
    because they address the needs of users with ever-growing problem sizes
    and ever-increasing performance needs. \par This paper addresses this
    lack of understanding by presenting an introduction to the
    data-transfer models on which most of the out-of-core parallel-I/O
    algorithms are based, with particular emphasis on the Parallel Disk
    Model. Sample algorithms are discussed to demonstrate the paradigms
    (algorithmic techniques) used with these models. \par Our aim is to
    provide insight into both the paradigms and the particular algorithms
    described, thereby also providing a background for understanding a
    range of related solutions. It is hoped that this background would
    enable the appropriate selection of existing algorithms and the
    development of new ones for current and future out-of-core problems.},
}

@InCollection{bordawekar:compiling,
  author   = {Bordawekar, Rajesh and Choudhary, Alok},
  title    = {Issues in Compiling {I/O} Intensive Problems},
  chapter  = 3,
  pages    = {69--96},
  crossref = {iopads-book},
  keyword  = {parallel I/O, compiler, out-of-core, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book.},
  abstract = {None.},
}

@InCollection{kotz:pioarch,
  author   = {Kotz, David},
  title    = {Introduction to Multiprocessor {I/O} Architecture},
  chapter  = 4,
  pages    = {97--124},
  crossref = {iopads-book},
  keyword  = {parallel I/O architecture, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book.},
  abstract = {The computational performance of multiprocessors continues to
    improve by leaps and bounds, fueled in part by rapid improvements in
    processor and interconnection technology. I/O performance thus becomes
    ever more critical, to avoid becoming the bottleneck of system
    performance. In this paper we provide an introduction to I/O
    architectural issues in multiprocessors, with a focus on disk
    subsystems. While we discuss examples from actual architectures and
    provide pointers to interesting research in the literature, we do not
    attempt to provide a comprehensive survey. We concentrate on a study of
    the architectural design issues, and the effects of different design
    alternatives.},
}

@InCollection{corbett:mpi-overview-book,
  author   = {Corbett, Peter and Feitelson, Dror and Fineberg, Sam and
              Hsu, Yarsun and Nitzberg, Bill and Prost, Jean-Pierre and
              Snir, Marc and Traversat, Bernard and Wong, Parkson},
  title    = {Overview of the {MPI-IO} Parallel {I/O} Interface},
  chapter  = 5,
  pages    = {127--146},
  crossref = {iopads-book},
  keyword  = {parallel I/O, file system interface, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    corbett:mpi-overview.},
  abstract = {Thanks to MPI, writing portable message passing parallel
    programs is almost a reality. One of the remaining problems is file
    I/O. Although parallel file systems support similar interfaces, the
    lack of a standard makes developing a truly portable program
    impossible. It is not feasible to develop large scientific applications
    from scratch for each generation of parallel machine, and, in the
    scientific world, a program is not considered truly portable unless it
    not only compiles, but also runs efficiently. \par The MPI-IO interface
    is being proposed as an extension to the MPI standard to fill this
    need. MPI-IO supports a high-level interface to describe the
    partitioning of file data among processes, a collective interface
    describing complete transfers of global data structures between process
    memories and files, asynchronous I/O operations, allowing computation
    to be overlapped with I/O, and optimization of physical file layout on
    storage devices (disks).},
}

@InCollection{thakur:out-of-core-book,
  author   = {Thakur, Rajeev and Choudhary, Alok},
  title    = {Runtime Support for Out-of-Core Parallel Programs},
  chapter  = 6,
  pages    = {147--165},
  crossref = {iopads-book},
  keyword  = {parallel I/O, out-of-core, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    thakur:out-of-core.},
  abstract = {In parallel programs with large out-of-core arrays stored in
    files, it is necessary to read/write smaller sections of the arrays
    from/to files. We describe a runtime method for accessing sections of
    out-of-core arrays efficiently. This method, called the {\em extended
    two-phase method}, uses collective I/O in which processors cooperate to
    read/write out-of-core data in an efficient manner. The I/O workload is
    divided among processors dynamically, depending on the access requests.
    Performance results on the Intel Touchstone Delta show that the
    extended two-phase method performs considerably better than a direct
    method for different access patterns, array sizes, and number of
    processors. We have used the extended two-phase method in the PASSION
    runtime library for parallel I/O.},
}

@InCollection{baylor:workload-book,
  author   = {Baylor, Sandra Johnson and Wu, C. Eric},
  title    = {Parallel {I/O} Workload Characteristics Using {Vesta}},
  chapter  = 7,
  pages    = {167--185},
  crossref = {iopads-book},
  keyword  = {parallel I/O, file access pattern, workload characterization,
    file system workload, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    baylor:workload.},
  abstract = {To develop optimal parallel I/O subsystems, one must have a
    thorough understanding of the workload characteristics of parallel I/O
    and its exploitation of the associated parallel file system. Presented
    are the results of a study conducted to analyze the parallel I/O
    workloads of several applications on a parallel processor using the
    Vesta parallel file system. Traces of the applications are obtained to
    collect system events, communication events, and parallel I/O events.
    The traces are then analyzed to determine workload characteristics. The
    results show I/O request rates on the order of hundreds of requests per
    second, a large majority of requests are for small amounts of data
    (less than 1500 bytes), a few requests are for large amounts of data
    (on the order of megabytes), significant file sharing among processes
    within a job, and strong temporal, traditional spatial, and
    interprocess spatial locality.},
}

@InCollection{kalns:video-book,
  author   = {Kalns, Edgar T. and Hsu, Yarsun},
  title    = {Video on Demand Using the {Vesta} Parallel File System},
  chapter  = 8,
  pages    = {187--204},
  crossref = {iopads-book},
  keyword  = {parallel I/O, parallel file system, video on demand,
    multimedia, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    kalns:video.},
  abstract = {Video on Demand (VoD) servers are expected to serve hundreds
    of customers with as many, or more, movie videos. Such an environment
    requires large storage capacity and real-time, high-bandwidth
    transmission capabilities. Massive striping of videos across disk
    arrays is a viable means to store large amounts of video data and,
    through parallelism of file access, achieve the needed bandwidth. The
    Vesta Parallel File System facilitates parallel access from an
    application to files distributed across a set of I/O processors, each
    with a set of attached disks. Given Vesta's parallel file access
    capabilities, this paper examines a number of issues pertaining to the
    implementation of VoD services on top of Vesta. We develop a prototype
    VoD experimentation environment on an IBM SP-1 and analyze Vesta's
    performance in video data retrieval for real-time playback.
    Specifically, we explore the impact of concurrent video streams
    competing for I/O node resources, cache effects, and video striping
    across multiple I/O nodes.},
}

@InCollection{nieuwejaar:strided2-book,
  author   = {Nieuwejaar, Nils and Kotz, David},
  title    = {Low-level Interfaces for High-level Parallel {I/O}},
  chapter  = 9,
  pages    = {205--223},
  crossref = {iopads-book},
  keyword  = {parallel I/O, multiprocessor file system interface, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    nieuwejaar:strided2.},
  abstract = {As the I/O needs of parallel scientific applications increase,
    file systems for multiprocessors are being designed to provide
    applications with parallel access to multiple disks. Many parallel file
    systems present applications with a conventional Unix-like interface
    that allows the application to access multiple disks transparently. By
    tracing all the activity of a parallel file system in a production,
    scientific computing environment, we show that many applications
    exhibit highly regular, but non-consecutive I/O access patterns. Since
    the conventional interface does not provide an efficient method of
    describing these patterns, we present three extensions to the interface
    that support {\em strided}, {\em nested-strided}, and
    {\em nested-batched} I/O requests. We show how these extensions can be
    used to express common access patterns.},
}

@InCollection{moyer:scalable-book,
  author   = {Moyer, Steven A. and Sunderam, V.~S.},
  title    = {Scalable Concurrency Control for Parallel File Systems},
  chapter  = 10,
  pages    = {225--243},
  crossref = {iopads-book},
  keyword  = {parallel I/O, parallel file system, concurrency control,
    synchronization, transaction, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    moyer:scalable.},
  abstract = {Parallel file systems employ data declustering to increase
    \mbox{I/O} throughput. As a result, a single read or write operation
    can generate concurrent data accesses on multiple storage devices.
    Unless a concurrency control mechanism is employed, familiar file
    access semantics are likely to be violated. This paper details the
    transaction-based concurrency control mechanism implemented in the
    PIOUS parallel file system. Performance results are presented
    demonstrating that sequential consistency semantics can be provided
    without loss of system scalability.},
}

@InCollection{durand:scheduling-book,
  author   = {Durand, Dannie and Jain, Ravi and Tseytlin, David},
  title    = {Improving the Performance of Parallel {I/O} Using Distributed
              Scheduling Algorithms},
  chapter  = 11,
  pages    = {245--269},
  crossref = {iopads-book},
  keyword  = {parallel I/O, distributed scheduling algorithm, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    durand:scheduling.},
  abstract = {The cost of data transfers, and in particular of I/O
    operations, is a growing problem in parallel computing. This
    performance bottleneck is especially severe for data-intensive
    applications such as multimedia information systems, databases, and
    Grand Challenge problems. A promising approach to alleviating this
    bottleneck is to schedule parallel I/O operations explicitly. \par
    Although centralized algorithms for batch scheduling of parallel I/O
    operations have previously been developed, they are not appropriate for
    all applications and architectures. We develop a class of decentralized
    algorithms for scheduling parallel I/O operations, where the objective
    is to reduce the time required to complete a given set of transfers.
    These algorithms, based on edge-coloring and matching of bipartite
    graphs, rely upon simple heuristics to obtain shorter schedules. We
    present simulation results indicating that the best of our algorithms
    can produce schedules whose length (or makespan) is within 2--20\% of
    the optimal schedule, a substantial improvement on previous
    decentralized algorithms. We discuss theoretical and experimental work
    in progress and possible extensions.},
}

@InCollection{sinclair:instability-book,
  author   = {Sinclair, J.~B. and Tang, J. and Varman, P.~J.},
  title    = {Placement-Related Problems in Shared Disk {I/O}},
  chapter  = 12,
  pages    = {271--289},
  crossref = {iopads-book},
  keyword  = {parallel I/O, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    sinclair:instability.},
  abstract = {In a shared-disk parallel I/O system, several processes may be
    accessing the disks concurrently. An important example is concurrent
    external merging arising in database management systems with multiple
    independent sort queries. Such a system may exhibit instability, with
    one of the processes racing ahead of the others and monopolizing I/O
    resources. This race can lead to serialization of the processes and
    poor disk utilization, even when the static load on the disks is
    balanced. The phenomenon can be avoided by proper layout of data on the
    disks, as well as through other I/O management strategies. This has
    implications for both data placement in multiple disk systems and task
    partitioning for parallel processing.},
}

@InCollection{baylor:vulcan-perf-book,
  author   = {Baylor, Sandra Johnson and Benveniste, Caroline and Hsu, Yarsun},
  title    = {Performance Evaluation of a Massively Parallel {I/O} Subsystem},
  chapter  = 13,
  pages    = {293--311},
  crossref = {iopads-book},
  keyword  = {parallel I/O architecture, performance evaluation, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    baylor:vulcan-perf.},
  abstract = {Presented are the trace-driven simulation results of a study
    conducted to evaluate the performance of the internal parallel I/O
    subsystem of the Vulcan massively parallel processor (MPP)
    architecture. The system sizes evaluated vary from 16 to 512 nodes. The
    results show that a compute node to I/O node ratio of four is the most
    cost effective for all system sizes, suggesting high scalability. Also,
    processor-to-processor communication effects are negligible for small
    message sizes and the greater the fraction of I/O reads, the better the
    I/O performance. Worst-case I/O node placement is within 13\% of more
    efficient placement strategies. Introducing parallelism into the
    internal I/O subsystem improves I/O performance significantly.},
}

@InCollection{vanderleest:contention-book,
  author   = {VanderLeest, Steven H. and Iyer, Ravishankar K.},
  title    = {Heterogeneous {I/O} Contention in a Single-bus Multiprocessor},
  chapter  = 14,
  pages    = {313--331},
  crossref = {iopads-book},
  keyword  = {parallel I/O, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    vanderleest:contention.},
  abstract = {None.},
}

@InCollection{schloss:hcsa-book,
  author   = {Schloss, Gerhard A. and Vernick, Michael},
  title    = {{HCSA}: A Hybrid Client-Server Architecture},
  chapter  = 15,
  pages    = {333--351},
  crossref = {iopads-book},
  keyword  = {parallel I/O architecture, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    schloss:hcsa.},
  abstract = {The {\em HCSA} (Hybrid Client-Server Architecture), a flexible
    system layout that combines the advantages of the traditional
    Client-Server Architecture (CSA) with those of the Shared Disk
    Architecture (SDA), is introduced. In {\em HCSA}, the traditional
    CSA-style I/O subsystem is modified to give the clients network access
    to both the server and the server's set of disks. Hence, the {\em HCSA}
    is more fault-tolerant than the CSA since there are two paths between
    any client and the shared data. Moreover, a simulation study
    demonstrates that the {\em HCSA} is able to support a larger number of
    clients than the CSA or SDA under similar system workloads. Finally,
    the {\em HCSA} can run applications in either a CSA mode, an SDA mode,
    or a combination of the two, thus offering backward compatibility with
    a large number of existing applications.},
}

@InCollection{yokota:nets-book,
  author   = {Yokota, Haruo and Mimatsu, Yasuyuki},
  title    = {A Scalable Disk System with Data Reconstruction Functions},
  chapter  = 16,
  pages    = {353--372},
  crossref = {iopads-book},
  keyword  = {parallel I/O architecture, disk array, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    yokota:nets.},
  abstract = {Scalable disk systems are required to implement well-balanced
    computer systems. We have proposed DR-nets, Data-Reconstruction
    networks, to construct the scalable parallel disk systems with high
    reliability. Each node of a DR-net has disks, and is connected by links
    to form an interconnection network. To realize the high reliability,
    nodes in a sub-network of the interconnection network organize a group
    of parity calculation proposed for RAIDs. Inter-node communication for
    calculating parity keeps the locality of data transfer, and it inhibits
    bottlenecks from occurring, even if the size of the network becomes
    very large. We have developed an experimental system using Transputers.
    In this chapter, we provide execution models for estimating the
    response time and throughput of DR-nets, and compare them to
    experimental results. We also discuss the reliability of the DR-nets
    and RAIDs.},
}

@InCollection{asthana:active-book,
  author   = {Asthana, Abhaya and Cravatts, Mark and Krzyzanowski, Paul},
  title    = {An Experimental Memory-based {I/O} Subsystem},
  chapter  = 17,
  pages    = {373--390},
  crossref = {iopads-book},
  keyword  = {parallel I/O architecture, pario bib},
  comment  = {Part of a whole book on parallel I/O; see iopads-book and
    asthana:active.},
  abstract = {We describe an I/O subsystem based on an active memory named
    SWIM (Structured Wafer-based Intelligent Memory) designed for efficient
    storage and manipulation of data structures. The key architectural idea
    in SWIM is to associate some processing logic with each memory chip
    that allows it to perform data manipulation operations locally and to
    communicate with a disk or a communication line through a backend port.
    The processing logic is specially designed to perform operations such
    as pointer dereferencing, memory indirection, searching and bounds
    checking efficiently. The I/O subsystem is built using an
    interconnected ensemble of such memory logic pairs. A complex
    processing task can now be distributed between a large number of small
    memory processors each doing a sub-task, while still retaining a common
    locus of control in the host CPU for higher level administrative and
    provisioning functions. We argue that active memory based processing
    enables more powerful, scalable and robust designs for storage and
    communications subsystems, that can support emerging network services,
    multimedia workstations and wireless PCS systems. A complete parallel
    hardware and software system constructed using an array of SWIM
    elements has been operational for over a year. We present results from
    application of SWIM to three network functions: a national phone
    database server, a high performance IP router, and a call screening
    agent.},
}

%% Crossref parent: must stay AFTER all @InCollection children (classic
%% BibTeX), and must keep `booktitle' since `title' is not inherited as
%% `booktitle' by the children.
@Book{iopads-book,
  editor    = {Jain, Ravi and Werth, John and Browne, James C.},
  title     = {Input/Output in Parallel and Distributed Computer Systems},
  booktitle = {Input/Output in Parallel and Distributed Computer Systems},
  publisher = {Kluwer Academic Publishers},
  year      = 1996,
  keyword   = {parallel I/O, parallel I/O architecture, parallel I/O
    algorithm, multiprocessor file system, workload characterization,
    parallel file access pattern, pario bib},
  comment   = {A book containing papers from IOPADS '94 and IOPADS '95, plus
    several survey/tutorial papers. See the bib entries with cross-ref to
    iopads-book.},
}