@Article{aggarwal:sorting,
  author = {Aggarwal, Alok and Vitter, Jeffrey Scott},
  title = {The Input/Output Complexity of Sorting and Related Problems},
  journal = {Communications of the ACM},
  year = {1988},
  month = sep,
  volume = {31},
  number = {9},
  pages = {1116--1127},
  keywords = {parallel I/O, sorting, pario-bib},
  abstract = {We provide tight upper and lower bounds, up to a constant factor,
  for the number of inputs and outputs~(I/Os) between internal memory and
  secondary storage required for five sorting-related problems: sorting, the
  fast Fourier transform (FFT), permutation networks, permuting, and matrix
  transposition. The bounds hold both in the worst case and in the average
  case, and in several situations the constant factors match. Secondary storage
  is modeled as a magnetic disk capable of transferring $P$~blocks each
  containing $B$~records in a single time unit; the records in each block must
  be input from or output to $B$~contiguous locations on the disk. We give two
  optimal algorithms for the problems, which are variants of merge sorting and
  distribution sorting. In particular we show for $P=1$ that the standard merge
  sorting algorithm is an optimal external sorting method, up to a constant
  factor in the number of~I/Os. Our sorting algorithms use the same number
  of~I/Os as does the permutation phase of key sorting, except when the
  internal memory size is extremely small, thus affirming the popular adage
  that key sorting is not faster. We also give a simpler and more direct
  derivation of Hong and Kung's lower bound for the FFT for the special case $B
  = P = O(1)$.},
  comment = {Good comments on typical external sorts, and how big they are.
  Focuses on parallelism at the disk. They give tight theoretical bounds on the
  number of I/Os required to do external sorting and other problems (FFTs,
  matrix transpose, etc.). If $x$ is the number of blocks in the file and $y$
  is the number of blocks that fit in memory, then the number of I/Os needed
  grows as $\Theta (x \log x / \log y)$. If parallel transfers of $p$ blocks
  are allowed, speedup linear in $p$ is obtained.}
}