@InProceedings{asami:self,
  author    = {Satoshi Asami and Nisha Talagala and David A. Patterson},
  title     = {Designing a Self-Maintaining Storage System},
  booktitle = {Proceedings of the Sixteenth IEEE Symposium on Mass Storage Systems},
  year      = {1999},
  month     = mar,
  pages     = {222--233},
  publisher = {IEEE Computer Society Press},
  later     = {asami:bself},
  URL       = {http://storageconference.org/1999/1999/posters/22asami.pdf},
  keywords  = {parallel I/O, disk array, RAID, pario-bib},
  abstract  = {This paper shows the suitability of a self-maintaining approach to Tertiary Disk, a large-scale disk array system built from commodity components. Instead of incurring the cost of custom hardware, we attempt to solve various problems by design and software. We have built a cluster of storage nodes connected by switched Ethernet. Each storage node is a PC hosting a few dozen SCSI disks, running the FreeBSD operating system. The system is used as a web-based image server for the Zoom Project in cooperation with the Fine Arts Museums of San Francisco (http://www.thinker.org/). We are designing self-maintenance extension to the OS to run on this cluster to mitigate the system administrator's burden. There are several components required for building self-maintaining system. One is decoupling the time failure from the time of hardware replacement. This implies the system must have some amount of redundancy, and has no single point of failure. Our system is fully redundant, and everything is constructed to avoid a single point of failure. Another is correctly identifying failures and their dependencies. The paper also outlines several approaches to lower the human cost of system administration of such a system and making the system as autonomous as possible.}
}