@InProceedings{arge:sorting-strings, author = {Lars Arge and Paolo Ferragina and Roberto Grossi and Jeffrey Scott Vitter}, title = {On sorting strings in external memory}, booktitle = {Proceedings of the 29th Annual ACM Symposium on Theory of Computing}, year = {1997}, month = {May}, pages = {540--548}, publisher = {ACM Press}, address = {El Paso}, URL = {file://ftp.cs.duke.edu/pub/jsv/Papers/AFG97.stringsorting.ps.gz}, keywords = {out-of-core algorithm, sorting, parallel I/O, pario-bib}, abstract = {In this paper we address for the first time the I/O complexity of the problem of sorting strings in external memory, which is a fundamental component of many large-scale text applications. In the standard unit-cost RAM comparison model, the complexity of sorting K strings of total length N is theta(K log K + N). By analogy, in the external memory (or I/O) model, where the internal memory has size M and the block transfer size is B, it would be natural to guess that the I/O complexity of sorting strings is $\theta(K/B log_(M/B) (K/B) + N/B)$, but the known algorithms do not come even close to achieving this bound. Our results show, somewhat counterintuitively, that the I/O complexity of string sorting depends upon the length of the strings relative to the block size. We first consider a simple comparison I/O model, where one is not allowed to break the strings into their characters, and we show that the I/O complexity of string sorting in this model is $\theta(N_1/B log_(M/B) (N_1/B) + K_2 log_(M/B) K_2 + N/B)$, where $N_1$ is the total length of all strings shorter than B and $K_2$ is the number of strings longer than B. We then consider two more general I/O comparison models in which string breaking is allowed. We obtain improved algorithms and in several cases lower bounds that match their I/O bounds. Finally, we develop more practical algorithms without assuming the comparison model.}, comment = {Not parallel? But mentions some parallel disk stuff.} }