@inproceedings{alverson:tera,
  author    = {Alverson, Robert and
               Callahan, David and
               Cummings, Daniel and
               Koblenz, Brian and
               Porterfield, Allan and
               Smith, Burton},
  title     = {The {Tera} Computer System},
  booktitle = {Proceedings of the 1990 ACM International Conference on
               Supercomputing},
  year      = {1990},
  pages     = {1--6},
  keywords  = {parallel architecture, MIMD, NUMA, pario-bib},
  comment   = {Interesting architecture. 3-d mesh of pipelined packet-switch
               nodes, e.g., 16x16x16 is 4096 nodes, with 256 procs, 512 memory
               units, 256 I/O cache units, and 256 I/O processors attached.
               2816 remaining nodes are just switching nodes. Each processor
               is 64-bit custom chip with up to 128 simultaneous threads in
               execution. It alternates between ready threads, with a deep
               pipeline. Inter-instruction dependencies explicitly encoded by
               the compiler, stalling those threads until the appropriate
               time. Each thread has a complete set of registers! Memory units
               have 4-bit tags on each word, for full/empty and trap bits.
               Shared memory across the network: ``The Tera ISP-level
               architecture is UMA, even though the PMS-level architecture is
               NUMA. Put another way, the memory looks a single cycle away to
               the compiler writer.'' -- Burton Smith. See also
               tera:brochure.},
}