@InProceedings{arge:sorting, author = {Lars Arge and Paolo Ferragina and Roberto Grossi and Jeffrey Scott Vitter}, title = {Sequence sorting in secondary storage}, booktitle = {Proceedings of Compression and Complexity of Sequences}, year = {1998}, month = {June}, pages = {329--346}, publisher = {IEEE Computer Society Press}, address = {Salerno, Italy}, keywords = {out-of-core algorithm, sorting algorithm, pario-bib}, abstract = {We investigate the I/O complexity of the problem of sorting sequences (or strings of characters) in external memory, which is a fundamental component of many large-scale text applications. In the standard unit-cost RAM comparison model, the complexity of sorting K strings of total length N is Theta (K log/sub 2/ K+N). By analogy, in the external memory (or I/O) model, where the internal memory has size M and the block transfer size is B, it would be natural to guess that the I/O complexity of sorting sequences is Theta ((K/B)log/sub M/B/(K/B)+(N/B)), but the known algorithms do not come even close to achieving this bound. Our results show, somewhat counterintuitively, that the I/O complexity of string sorting depends upon the length of the strings relative to the block size. We first consider a simple comparison I/O model, where the strings are not allowed to be broken into their individual characters, and we show that the I/O complexity of string sorting in this model is Theta ((N/sub 1//B)log/sub M/B/(N/sub 1//B)+K/sub 2/+(N/B)), where N/sub 1/ is the total length of all strings shorter than B and K/sub 2/ is the number of strings longer than B. We then consider two more general I/O comparison models in which string breaking is allowed. We obtain improved algorithms and in several cases lower bounds that match their I/O bounds. Finally, we develop more practical algorithms outside the comparison model.}, comment = {This paper is really the same paper as arge:sorting-strings.} }