% Workshop paper describing RIO, a remote I/O library that adopts the MPI-IO
% interface within the ADIO architecture (see the abstract and comment fields).
@InProceedings{foster:remote-io,
  author    = {Foster, Ian and Kohr, Jr., David and Krishnaiyer, Rakesh and Mogill, Jace},
  title     = {Remote {I/O}: Fast Access to Distant Storage},
  booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems},
  year      = {1997},
  month     = nov,
  pages     = {14--25},
  publisher = {ACM Press},
  address   = {San Jose, CA},
  doi       = {10.1145/266220.266222},
  url       = {http://doi.acm.org/10.1145/266220.266222},
  keywords  = {parallel I/O, distributed file system, pario-bib},
  abstract  = {As high-speed networks make it easier to use distributed resources, it becomes increasingly common that applications and their data are not colocated. Users have traditionally addressed this problem by manually staging data to and from remote computers. We argue instead for a remote I/O paradigm in which programs use familiar parallel I/O interfaces to access remote filesystems. In addition to simplifying remote execution, remote I/O can improve performance relative to staging by overlapping computation and data transfer or by reducing communication requirements. However, remote I/O also introduces new technical challenges in the areas of portability, performance, and integration with distributed computing systems. We propose techniques designed to address these challenges and describe a remote I/O library called RIO that we are developing to evaluate the effectiveness of these techniques. RIO addresses issues of portability by adopting the quasi-standard MPI-IO interface and by defining a RIO device and RIO server within the ADIO abstract I/O device architecture. It addresses performance issues by providing traditional I/O optimizations such as asynchronous operations and through implementation techniques such as buffering and message forwarding to offload communication overheads. 
Microbenchmarks and application experiments demonstrate that our techniques can improve turnaround time relative to staging.},
  comment   = {They want to support users that have datasets at different locations in the Internet, but need to access the data at supercomputer parallel machines. Rather than staging data in and out, they want to provide remote access. Issues: naming, dynamic loads, heterogeneity, security, fault-tolerance. All traffic goes through a 'forwarder node' that funnels all the traffic into the network. They use URLs for pathnames (e.g., "x-rio://..."). They find that non-blocking ops are important, as is collective I/O. They think that buffering will be important. Limited experiments.}
}