/*
 * CHARISMA Project Trace Format
 *
 * Nils Nieuwejaar and David Kotz
 * {nils,dfk}@cs.dartmouth.edu
 * Dartmouth College
 * Hanover, NH
 *
 * This is the generic header file for a multi-platform parallel file
 * system tracing effort.  This file defines the format of the records
 * stored in trace files.
 *
 * Since there is as yet no common model for parallel I/O, this file
 * cannot be expected to meet all the needs of every (or, for that
 * matter, any) parallel file system.  Therefore, this file should be
 * considered a starting point for a machine-dependent tracing format.
 * Bear in mind, however, that the closer to this generic format we
 * can get, the easier it will be to compare results across systems.
 *
 * The ideal solution would be to declare all machine-dependent constants
 * and records in a trace_xxxx.h file which would be included before this
 * file.  This file could then remain largely unchanged.
 *
 */

/* time.h, or something else that defines the typedef time_t,
 * should be included before this file.
 */

/* The trace file begins with two 32-bit integers that identify the
 * format of the file.
*/ #define MAGIC 0x01234567 /* First 4 bytes of any trace file */ #define FORMAT_VERSION 0x00000001 /* Second 4 bytes of any trace file */ /* Record Types */ #define TRACE_HEADER_RECORD 0 #define TRACE_APP_LOAD_RECORD 1 #define TRACE_APP_EXIT_RECORD 2 #define TRACE_CLIENT_JOIN_RECORD 3 #define TRACE_APP_CLIENT_LEAVE_RECORD 4 #define TRACE_APP_OPEN_RECORD 5 #define TRACE_CLIENT_OPEN_RECORD 6 #define TRACE_APP_CLOSE_RECORD 7 #define TRACE_CLIENT_CLOSE_RECORD 8 #define TRACE_RW_RECORD 9 #define TRACE_EXTEND_RECORD 10 #define TRACE_TRUNCATE_RECORD 11 #define TRACE_LINK_RECORD 12 #define TRACE_UNLINK_RECORD 13 #define TRACE_ASYNCH_END_RECORD 14 #define TRACE_FCNTL_RECORD 15 /* Used in file open records */ #define TRACE_OPEN_READ_MODE 0x01 #define TRACE_OPEN_RW_MODE 0x02 #define TRACE_OPEN_WRITE_MODE 0x04 #define TRACE_OPEN_CREATE_MODE 0x08 #define TRACE_OPEN_TRUNC_MODE 0x10 #define TRACE_OPEN_APPEND_MODE 0x20 /* TRACE_HEADER_RECORD: * This should be the first record in every trace file, immediately * following the magic number and format version number. The IP address * is used to differentiate between machines of the same type; when * necessary it could be left out for confidentiality reasons. * md_version determines which version of the machine-dependent * include file was used. release_data is free-format and should be * used to identify the software release version information for the OS, * libraries, compilers, etc. Machine-dependent records that give more * configuration information, perhaps details about each I/O device, may * follow. 
*/ typedef struct { u_short record_type; /* TRACE_HEADER_RECORD */ time_t start_date; /* Unix Standard */ long gmt_offset; /* Offset from GMT in seconds */ TimestampType timestamp; /* to correlate with start_date */ u_long SystemId; /* IP address */ byte system_type; /* which kind of system */ #define IPSC_TYPE 1 #define PARAGON_TYPE 2 #define CM5_TYPE 3 #define MASPAR_TYPE 4 #define HECTOR_TYPE 5 #define NCUBE_TYPE 6 #define SP2_TYPE 7 #define CRAY_T3D_TYPE 8 /* etc... */ int md_version; /* machine-dependent format version number */ /* MDFORMAT_VERSION */ /* system configuration info */ char release_data[80]; int num_processors; int memory_per_proc; /* in megabytes */ int num_disks; int num_IO_nodes; int memory_per_disk; /* in megabytes */ double timestamp_unit; /* in seconds, eg, 0.001 for msec timestamps */ } TraceHeaderRecord; /* TRACE_APP_LOAD_RECORD: * This is used for every new application that is started. The file * name and pathname for the application are included. Where appropriate, * the size of the executable loaded into each nodes can be noted (another * form of I/O). The application ID, perhaps a process ID, should be a * unique number that identifies this application system-wide. All clients * (processes, threads, tasks, etc) that are part of this application at * its startup time can be listed in the array at the end of the record. * Some systems may use num_clients=1 and the CLIENT_JOIN record. Client * IDs should be unique system-wide, not just application-wide. */ typedef struct { u_short record_type; /* TRACE_APP_LOAD_RECORD */ TimestampType timestamp; char name[TRACE_NAME_LENGTH]; char path[TRACE_PATH_LENGTH]; TraceUserId user_id; u_long initial_load_size; /* in bytes */ TraceAppId appl_id; u_short num_clients; TraceClientId clients[1]; /* actually [num_clients] */ } TraceAppLoadRecord; /* TRACE_APP_EXIT_RECORD * Here we assume that all clients of the application are done. 
*/ typedef struct { u_short record_type; /* TRACE_APP_EXIT_RECORD */ TimestampType timestamp; TraceAppId appl_id; } TraceAppExitRecord; /* TRACE_CLIENT_JOIN_RECORD * For systems with a "fork" model to create clients within an application, * each client registers its birth here. This provides a binding between * client-ids and applications. */ typedef struct { u_short record_type; /* TRACE_CLIENT_JOIN_RECORD */ TimestampType timestamp; TraceAppId appl_id; TraceClientId client_id; } TraceClientJoinRecord; /* TRACE_CLIENT_LEAVE_RECORD * In case a client needs to leave early. Probably rarely used. */ typedef struct { u_short record_type; /* TRACE_CLIENT_LEAVE_RECORD */ TimestampType timestamp; TraceAppId appl_id; TraceClientId client_id; } TraceClientLeaveRecord; /* TRACE_APP_OPEN_RECORD * Here, an application as a whole opens a file for all of its clients. * This is useful in data-parallel programming models. Same as the client * version except with a TraceAppId. */ typedef struct { u_short record_type; /* TRACE_APP_OPEN_RECORD */ TimestampType timestamp; TraceAppId appl_id; char name[TRACE_NAME_LENGTH]; char path[TRACE_PATH_LENGTH]; TraceFileId file_id; /* system-wide identifier */ TraceFileDesc file_desc; /* per-process identifier */ BigInt file_size; time_t creation_date; /* Unix Standard */ byte open_flags; } TraceAppOpenRecord; /* TRACE_APP_CLOSE_RECORD * An application as a whole closes a file. */ typedef struct { u_short record_type; /* TRACE_APP_CLOSE_RECORD */ TimestampType timestamp; TraceAppId appl_id; TraceFileDesc file_desc; BigInt file_size; } TraceAppCloseRecord; /* TRACE_CLIENT_OPEN_RECORD * A single client opens a file. The file name and directory path name * are included, along with the file ID, which should be a unique number * for the file (eg, tuple of device number and inode number). 
The file * descriptor identifies the particular open of the file; on unix-like * systems, all clients accessing a file have the same file ID, but may * have different file descriptors if they open the file separately, or * the same file descriptor if they inherited the file descriptor through * dup or fork. */ typedef struct { u_short record_type; /* TRACE_CLIENT_OPEN_RECORD */ TimestampType timestamp; TraceClientId client_id; char name[TRACE_NAME_LENGTH]; char path[TRACE_PATH_LENGTH]; TraceFileId file_id; /* system-wide identifier */ TraceFileDesc file_desc; /* per-process identifier */ BigInt file_size; time_t creation_date; /* Unix Standard */ byte open_flags; } TraceClientOpenRecord; /* TRACE_CLIENT_CLOSE_RECORD * A client closes a file. Note we record the ending file size. */ typedef struct { u_short record_type; /* TRACE_CLIENT_CLOSE_RECORD */ TimestampType timestamp; TraceClientId client_id; TraceFileDesc file_desc; BigInt file_size; } TraceClientCloseRecord; /* TRACE_RW_RECORD * Each read or write request is logged here. Operation_type * tells us what is going on, allowing 8 variants of this record. * For synchronous I/O, both the starting and ending time are recorded. * For asynchronous I/O, the start time is recorded here, along with * some kind of identifier that will allow us to pair it with the end * record. For application-wide (data-parallel) I/O, the application ID * is included, along with multiple (offset, size, clientID) tuples. * * Since this is likely to be the most common record - by far - * implementors should do whatever they can to reduce its size. For * example, on the iPSC/860 we removed the num_clients and appl_id * fields. On-the-fly compression could also be used. */ /* This is part of the read/write record types below. 
*/ typedef struct { BigInt offset; u_long memory_offset; BigInt IO_size; TraceClientId client_id; } TraceIORequest; typedef struct { u_short record_type; /* TRACE_RW_RECORD */ TimestampType starttime; TimestampType endtime; TraceFileDesc file_desc; byte operation_type; #define ASYNCHRONOUS 0x01 /* 0=sync 1=async */ #define WRITE 0x02 /* 0=read 1=write */ #define CLIENTREQUEST 0x04 /* 0=application 1=client */ TraceAsyncId async_id; /* if appropriate */ TraceAppId appl_id; /* if appropriate, or NO_APP */ u_short num_clients; /* = 1 if CLIENTREQUEST */ TraceIORequest requests[1]; /* actually [num_clients] */ } TraceRWRecord; /* TRACE_ASYNCH_END_RECORD * End of asynch I/O request for either client or application. * The calltime is the time that the "wait for I/O completion" * call was invoked (marking the end of overlapped computation) and the * endtime is the end of the actual I/O (or time of return from the * I/O-wait call, if that's all we can get). */ typedef struct { u_short record_type; /* TRACE_ASYNCH_END_RECORD */ TimestampType call_time; TimestampType end_time; TraceAsyncId async_id; /* fileId is implicit */ TraceClientId client_id; /* if appropriate, or NO_CLIENT */ TraceAppId appl_id; /* if appropriate, or NO_APP */ } TraceAsynchEndRecord; /* TRACE_TRUNC_RECORD * A file was explicitly truncated to a new file size. */ typedef struct { u_short record_type; /* TRACE_TRUNC_RECORD */ TimestampType timestamp; TraceClientId client_id; TraceFileId file_id; BigInt original_size; BigInt new_size; } TraceTruncRecord; /* TRACE_EXTEND_RECORD * A file was explicitly extended (eg, with a preallocate) to a new * file size. Implicit extensions are implicitly recorded in write records. */ typedef struct { u_short record_type; /* TRACE_EXTEND_RECORD */ TimestampType timestamp; TraceClientId client_id; TraceFileId file_id; BigInt original_size; BigInt new_size; } TraceExtendRecord; /* TRACE_LINK_RECORD * A (unix-like) link was created to a file. 
*/ typedef struct { u_short record_type; /* TRACE_LINK_RECORD */ TimestampType timestamp; TraceClientId client_id; TraceFileId file_id; u_short new_link_count; } TraceLinkRecord; /* TRACE_UNLINK_RECORD * A file was unlinked; if new_link_count is 0, the file was removed. * This can be used even on systems with no true link/unlink facility. * Useful for identifying temporary files. */ typedef struct { u_short record_type; /* TRACE_UNLINK_RECORD */ TimestampType timestamp; TraceClientId client_id; TraceFileId file_id; u_short new_link_count; } TraceUnlinkRecord; /* TRACE_FCNTL_RECORD * A record of a unix-like fcntl call. Codes used are probably * machine-specific. */ typedef struct { u_short record_type; /* TRACE_FCNTL_RECORD */ TimestampType timestamp; TraceClientId client_id; TraceFileId file_desc; int request; /* machine-specific contents */ int arg; /* machine- and request-specific contents */ } TraceFcntlRecord; /* COMP_RW_RECORD * An example of a trace record for an implementation doing on-the-fly * compression. This assumes that we are remembering 3 different * request sizes, strides in the file, and strides in memory. If one of * the fields in this request matches one of the 'remembered' values, the * index of that value is stored in the appropriate slot. If there is no * match, a 0 is stored and the actual value is stored in one of the * 'fields'. A simpler (and probably nearly as effective) approach * would be simply to use flags that indicate that we should use the same * value as the previous request. * * Since raw timestamps are likely to be 8 bytes, this also allows * compression of timestamps. If it is possible to express the time of * this operation with a 4-byte delta relative to the previous operation, * timestamp_delta should be set to 1, and the delta store in fields[0]. * Otherwise, the full 8-byte timestamp should be stored in fields[0-1]. 
*/
typedef struct {
    byte record_type;            /* COMP_RW_RECORD */
    byte client_id;

    /* First group of bit-fields: 6 + 2 = 8 bits.  Whether a group packs
     * into a single byte is up to the compiler.
     */
    byte file_desc       : 6;    /* Assumes a limit of 64 */
    byte memstride_index : 2;

    /* Second group of bit-fields: 2 + 2 + 3 + 1 = 8 bits. */
    byte size_index      : 2;
    byte stride_index    : 2;
    byte operation_type  : 3;    /* See RW_RECORD for these codes */
    byte timestamp_delta : 1;    /* 1 = fields[0] holds a 4-byte delta,
                                  * otherwise fields[0-1] hold the full
                                  * timestamp */

    unsigned long duration;      /* NOTE(review): units are not stated
                                  * here; presumably timestamp units -
                                  * confirm */
    unsigned long fields[];      /* timestamp data, then any values that
                                  * matched no remembered entry */
} CompRWRecord;