% Conference paper (1999) by Dickens and Thakur on thread-based collective I/O.
% `abstract`, `keywords`, and `comment` are annotation fields that the standard
% styles ignore; `month` uses the predefined macro so the style picks its form.
@InProceedings{dickens:threads,
  author    = {Dickens, Phillip and Thakur, Rajeev},
  title     = {Improving Collective {I/O} Performance Using Threads},
  booktitle = {Proceedings of the Joint International Parallel Processing Symposium and {IEEE} Symposium on Parallel and Distributed Processing},
  year      = {1999},
  month     = apr,
  pages     = {38--45},
  url       = {http://www.mcs.anl.gov/~thakur/papers/ipps99-thread-coll.ps},
  keywords  = {parallel I/O, multithread programming, collective I/O, disk-directed I/O, two-phase I/O, pario-bib},
  abstract  = {Massively parallel computers are increasingly being used to solve large, I/O intensive applications in many different fields.
    For such applications, the I/O requirements quite often present a significant obstacle in the way of achieving good performance, and an important area of current research is the development of techniques by which these costs can be reduced.
    One such approach is {\it collective I/O}, where the processors cooperatively develop an I/O strategy that reduces the number, and increases the size, of I/O requests, making a much better use of the I/O subsystem.
    Collective I/O has been shown to significantly reduce the cost of performing I/O in many large, parallel applications, and for this reason serves as an important base upon which we can explore other mechanisms which can further reduce these costs.
    One promising approach is to use threads to perform the collective I/O {\it in the background} while the main thread continues with other computation in the foreground.
    \par In this paper, we explore the issues associated with implementing collective I/O in the background using threads.
    The most natural approach is to simply spawn off an I/O thread to perform the collective I/O in the background while the main thread continues with other computation.
    However, our research demonstrates that this approach is frequently the {\it worst} implementation option, often performing much more poorly than just executing collective I/O completely in the foreground.
    To improve the performance of thread-based collective I/O, we developed an alternate approach where {\it part} of the collective I/O operation is performed in the background, and part is performed in the foreground.
    We demonstrate that this new technique can significantly improve the performance of thread-based collective I/O, providing up to an 80\% improvement over sequential collective I/O (where there is no attempt to overlap computation with I/O).
    Also, we discuss one very important application of this research which is the implementation of the {\it split-collective} parallel I/O operations defined in MPI 2.0.},
  comment   = {They examine an implementation of collective I/O in MPI2 such that the collective I/O is done in the background, using a thread, while the computation continues.
    They found that the performance can be quite disappointing, because of the competition for the CPU between the computational thread and the background thread executing the redistribution phase of the I/O operation.
    They get better results by doing the redistribution in the foreground, making the computation wait, and then doing the I/O in the background thread while the computation continues.
    Results from four major parallel platforms, but only for write operations.}
}