@InProceedings{alverson:tera,
  author    = {Robert Alverson and David Callahan and Daniel Cummings and Brian Koblenz and Allan Porterfield and Burton Smith},
  title     = {The {Tera} Computer System},
  booktitle = {Proceedings of the 1990 ACM International Conference on Supercomputing},
  year      = {1990},
  pages     = {1--6},
  keywords  = {parallel architecture, MIMD, NUMA, pario-bib},
  comment   = {Interesting architecture. 3-d mesh of pipelined packet-switch nodes, e.g., 16x16x16 is 4096 nodes, with 256 procs, 512 memory units, 256 I/O cache units, and 256 I/O processors attached. 2816 remaining nodes are just switching nodes. Each processor is 64-bit custom chip with up to 128 simultaneous threads in execution. It alternates between ready threads, with a deep pipeline. Inter-instruction dependencies explicitly encoded by the compiler, stalling those threads until the appropriate time. Each thread has a complete set of registers! Memory units have 4-bit tags on each word, for full/empty and trap bits. Shared memory across the network: ``The Tera ISP-level architecture is UMA, even though the PMS-level architecture is NUMA. Put another way, the memory looks a single cycle away to the compiler writer.'' -- Burton Smith. See also tera:brochure.},
}