@InProceedings{franke:filters,
  author    = {Franke, Ernest and Magee, Michael},
  title     = {Reducing Data Distribution Bottlenecks by Employing Data Visualization Filters},
  booktitle = {Proceedings of the Eighth IEEE International Symposium on High Performance Distributed Computing},
  year      = {1999},
  month     = aug,
  pages     = {255--262},
  publisher = {IEEE Computer Society Press},
  address   = {Redondo Beach, CA},
  url       = {http://computer.org/conferen/proceed/hpdc/0287/02870041abs.htm},
  keywords  = {distributed computing, filters, grid, input/output, parallel I/O, pario-bib, app-pario},
  abstract  = {Between 1994 and 1997, researchers at Southwest Research Institute (SwRI) investigated methods for distributing parallel computation and data visualization under the support of an internally funded Research Initiative Program entitled the Advanced Visualization Technology Project (AVTP). A hierarchical data cache architecture was developed to provide a flexible interface between the modeling or simulation computational processes and data visualization programs. Compared to conventional post facto data visualization approaches, this data cache structure provides many advantages including simultaneous data access by multiple visualization clients, comparison of experimental and simulated data, and visual analysis of computer simulation as computation proceeds. \par However, since the data cache was resident on a single workstation, this approach did not address the issue of scalability of methods for avoiding the data storage bottleneck by distributing the data across multiple networked workstations. Scalability through distributed database approaches is being investigated as part of the Applied Visualization using Advanced Network Technology Infrastructure (AVANTI) project. \par This paper describes a methodology currently under development that is intended to avoid bottlenecks that typically arise as the result of data consumers (e.g. visualization applications) that must access and process large amounts of data that has been generated and resides on other hosts, and which must pass through a central data cache prior to being used by the data consumer. The methodology is based on a fundamental paradigm that the end result (visualization) rendered by a data consumer can, in many cases, be produced using a reduced data set that has been distilled or filtered from the original data set. \par In the most basic case, the filtered data used as input to the data consumer may simply be a proper subset of massive data sets that have been distributed among hosts. For the general case, however, the filtered data may bear no resemblance to the original data since it is the result of processing the raw data set and distilling it to its visual ``essence'', i.e. the minimal data set that is absolutely required by the data consumer in order to perform the required rendering function. Data distribution bottlenecks for visualization applications are thereby reduced by avoiding the transfer of large amounts of raw data in favor of considerably distilled visual data. \par There are, of course, computational costs associated with this approach since raw data must be processed into its visual essence, but these computational costs may be distributed among multiple processors. It should be realized, however, that, in general, these computational costs would exist any way since, for the visualization to be performed, there must be a transformation between the raw data and the visualization primitives (e.g. line segments, polygon vertices, etc.) to be rendered. The main principal put forth by this paper is that if data distribution bottlenecks are to be minimized, the amount of raw data transferred should be reduced by employing data filtering processes that can be distributed among multiple hosts. \par The complete paper demonstrates, both analytically and experimentally, that this approach becomes increasingly effective (scalable) as the computational expense associated with the data filtering transformation rises.},
  comment   = {The goal of their work is to improve the performance of data visualization applications which use remote data generators (disk or running application) and data consumers (the visualization station) for visualization applications. They deal with network bottlenecks by using a distributed-redundant data cache to hold intermediate data between the data generator and the data consumer. They also reduce network traffic by applying data filters to the data at the distributed cache processors. The main argument is that since the data must be filtered before it is visualized, it makes more sense to perform the filter at the data cache so the computation can be distributed and to reduce the amount of data that needs to be transferred to the data consumer.},
}