% Dickens and Thakur, JPDC 61(8), 2001: evaluation of collective I/O implementation techniques.
@Article{dickens:evaluation,
  author    = {Dickens, Phillip M. and Thakur, Rajeev},
  title     = {Evaluation of Collective {I/O} Implementations on Parallel Architectures},
  journal   = {Journal of Parallel and Distributed Computing},
  year      = {2001},
  month     = aug,
  volume    = {61},
  number    = {8},
  pages     = {1052--1076},
  publisher = {Academic Press},
  copyright = {Academic Press},
  doi       = {10.1006/jpdc.2000.1733},
  url       = {http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1733},
  keywords  = {parallel I/O, collective I/O, pario-bib, parallel architecture},
  abstract  = {In this paper, we evaluate the impact on performance of various
               implementation techniques for collective I/O operations, and we
               do so across four important parallel architectures. We show that
               a naive implementation of collective I/O does not result in
               significant performance gains for any of the architectures, but
               that an optimized implementation does provide excellent
               performance across all of the platforms under study.
               Furthermore, we demonstrate that there exists a single
               implementation strategy that provides the best performance for
               all four computational platforms. Next, we evaluate
               implementation techniques for thread-based collective I/O
               operations. We show that the most obvious implementation
               technique, which is to spawn a thread to execute the whole
               collective I/O operation in the background, frequently provides
               the worst performance, often performing much worse than just
               executing the collective I/O routine entirely in the foreground.
               To improve performance, we explore an alternate approach where
               part of the collective I/O operation is performed in the
               background, and part is performed in the foreground. We
               demonstrate that this implementation technique can provide
               significant performance gains, offering up to a 50\% improvement
               over implementations that do not attempt to overlap collective
               I/O and computation.}
}