@InProceedings{cormen:fft3, author = {Thomas H. Cormen and Jake Wegmann and David M. Nicol}, title = {Multiprocessor Out-of-Core {FFTs} with Distributed Memory and Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {68--78}, publisher = {ACM Press}, copyright = {ACM}, address = {San Jose, CA}, keywords = {out of core, parallel I/O, pario-bib}, abstract = {This paper extends an earlier out-of-core Fast Fourier Transform (FFT) method for a uniprocessor with the Parallel Disk Model (PDM) to use multiple processors. Four out-of-core multiprocessor methods are examined. Operationally, these methods differ in the size of "mini-butterfly" computed in memory and how the data are organized on the disks and in the distributed memory of the multiprocessor. The methods also perform differing amounts of I/O and communication. Two of them have the remarkable property that even though they are computing the FFT on a multiprocessor, all interprocessor communication occurs outside the mini-butterfly computations; communication that ordinarily occurs in a butterfly is folded into other data-movement operations. An analysis program shows that the two methods that use no butterfly communication usually use less communication overall than the other methods. The analysis program is fast enough that it can be invoked at run time to determine which of the four methods uses the least communication. One set of performance results on a small workstation cluster indicates that the methods without butterfly communication are approximately 9.5\% faster. Moreover, they are much easier to implement.}, comment = {They find a way to move the interprocessor communication involved in the out-of-core FFT into a single BMMC permutation between "super-levels", where each super-level involves log(M) stages of the FFT. This usually leads to less communication and to better overall performance. See also cormen:fft and cormen:fft2.} }