@InProceedings{soloviev:prefetching,
  author =    {Valery V. Soloviev},
  title =     {Prefetching in Segmented Disk Cache for Multi-Disk Systems},
  booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems},
  year =      {1996},
  month =     {May},
  pages =     {69--82},
  publisher = {ACM Press},
  address =   {Philadelphia},
  keywords =  {parallel I/O, prefetching, disk cache, disk array, pario-bib},
  abstract =  {This paper investigates the performance of a multi-disk storage system equipped with a segmented disk cache, under a workload of multiple relational scans. Prefetching is a popular method of improving the performance of scans. Many modern disks have a multisegment cache that can be used for prefetching. We observe that, when declustering is used as the data-placement method, prefetching in a segmented cache causes load imbalance among the disks: a single disk becomes a bottleneck, degrading the performance of the entire system. Variation in disk queue length is a primary cause of the imbalance. Using a precise simulation model, we investigate several approaches to achieving better balance. Our metrics are scan response time for the closed-end system and the ability to sustain a workload without saturating for the open-end system. We arrive at two main conclusions: (1) Prefetching in main memory is inexpensive, effective for balancing, and can supplement or substitute for prefetching in the disk cache. (2) Disk-level prefetching provides about the same performance as main-memory prefetching if request queues are managed in the disk controllers rather than in the host. Checking the disk cache before queuing requests not only gives better request response time but also drastically improves balancing. A single cache performs better than a segmented cache for this method.},
  comment =   {An interesting paper about disk-controller cache management in database workloads. Actually, the workloads are sequential scans of partitioned files, which could occur in many kinds of workloads. The declustering pattern (partitioning) is a little unusual for most scientific parallel-I/O veterans, who are used to striping. And the cache-management algorithms seem a bit strange, particularly the fact that the cache appears to be used only for explicit prefetch requests. It turns out that it is best to put the prefetching and the disk queueing in the same place, either on the controller or in main memory, to avoid the load imbalance that arises from randomness in the workload and is accentuated into a big bottleneck and a convoy effect.}
}