--- src/sys/sys/syslink_msg.h 2007/04/26 02:11:00 1.7 +++ src/sys/sys/syslink_msg.h 2007/05/27 20:35:43 1.8 @@ -35,7 +35,7 @@ */ /* * The syslink infrastructure implements an optimized RPC mechanism across a - * communications link. Endpoints, defined by a sysid, are typically + * communications link. Endpoints, defined by a session sysid, are typically * associated with system structures but do not have to be. * * This header file is primarily responsible for the formatting of message @@ -52,208 +52,245 @@ #include #endif -typedef u_int32_t sl_msgid_t; /* transaction sequencing */ -typedef u_int32_t sl_auxdata_t; /* auxillary data element */ -typedef u_int16_t sl_cmd_t; /* command or error */ -typedef u_int16_t sl_error_t; -typedef u_int16_t sl_itemid_t; /* item id */ +typedef int32_t sl_auxdata_t; /* auxiliary data element */ +typedef u_int32_t sl_rlabel_t; /* reply label routing id */ +typedef u_int16_t sl_proto_t; /* protocol control field */ +typedef u_int16_t sl_cmd_t; /* command/status id */ typedef u_int16_t sl_reclen_t; /* item length */ #define SL_ALIGN 8 /* 8-byte alignment */ #define SL_ALIGNMASK (SL_ALIGN - 1) /* - * The msgid is used to control transaction sequencing within a session, but - * also has a special meaning to the transport layer. A msgid of 0 indicates - * a PAD syslink message, used to pad FIFO buffers to prevent messages from - * being bisected by the end of the buffer. Since all structures are 8-byte - * aligned, 8-byte PAD messages are allowed. All other messages must be - * at least sizeof(syslink_msg). - * - * The reclen is the actual record length in bytes prior to alignment. - * The reclen must be aligned to obtain the actual size of a syslink_msg - * or syslink_item structure. Note that the reclen includes structural - * headers (i.e. it does not represent just the data payload, it represents - * the entire structure). 
- * - * Syslink messages allow special treatment for large data payloads, allowing - * the transport mechanism to separate the data payload into its own buffer - * or DMA area (for example, its own page), facilitating DMA and page-mapping - * operations at the end points while allowing the message to be maximally - * compressed during transport. This is typically handled by special casing - * a readv() or writev(). - * - * Sessions are identified with a session id. The session id is a rendezvous - * id that associates physical and logical routing information with a single - * sysid, allowing us to both avoid storing the source and target logical id - * in the syslink message AND ALSO providing a unique session id and validator - * which manages the abstracted 'connection' between two entities. This - * reduces bloat. - * - * The target physical address is deconstructed as the message hops across - * the mesh. All 0's, or all 0's remaining indicates a link layer message - * to be processed by the syslink route node itself. All 1's indicates - * a broadcast message. Broadcast messages also require special attention. - * Sending a message to a target address of 0 basically sends it to the - * nearest route node as a link layer message. - * - * The source physical address normally starts out as 0 and is constructed - * as the message hops across the mesh. The target can use the constructed - * source address to respond to the originator of the message (as it must - * if it has no knowledge about the session). A target with knowledge - * of the session id has the option of forging its own return path. - * - * Checksums are the responsibility of higher layers but message checking - * elements can be negotiated or required as part of the syslink message's - * structured data. + * SYSLINK_ELM - structured data element. + * + * syslink_msg's have zero or more syslink_elm's arranged as an array. 
+ * Each syslink_elm may represent opaque data or recursively structured + * data. + * + * SE_CMD field - identify RPC command (at the top level) or RPC data element + * in deeper recursions. + * + * Please note that while bits have individual meanings, command switches + * should universally compare all 16 bits against the command. This + * guarantees that commands will not be misinterpreted (e.g. reply vs + * command, or data which has not been endian converted). + * + * SE_CMDF_REPLY - is usually set in the top level syslink_elm embedded + * in syslink message replies as a safety in order to prevent a reply + * from being misinterpreted as a command. + * + * SE_CMDF_STRUCTURED - indicates that the payload is an array of + * structured syslink_elm's, otherwise the payload is considered to + * be opaque. + * + * SE_CMDF_GLOBAL indicates that the command is globally defined by the + * syslink standard and is not protocol-specific. Note that PADs + * are not global commands. + * + * SE_CMDF_UNTRANSLATED indicates that the syslink_elm structure had + * to be translated into host endian format but any directly or + * indirectly represented opaque data has not been. This bit is used + * by the protocol layer to properly endian-translate protocol-specific + * opaque data. + * + * SE_AUX field - auxiliary data field (signed 32 bit integer) + * + * This field contains protocol and command/element specific data. + * This typically contains an error code in replies (at least in + * sm_head). 
+ */ +struct syslink_elm { + sl_cmd_t se_cmd; /* syslink element command/status id */ + sl_reclen_t se_bytes; /* unaligned record size */ + sl_auxdata_t se_aux; /* auxiliary data always present */ + /* extended by data */ +}; + +#define SE_CMDF_REPLY 0x8000 /* safety feature */ +#define SE_CMDF_STRUCTURED 0x4000 /* payload is structured */ +#define SE_CMDF_GLOBAL 0x2000 /* non-proto-specific global cmd */ +#define SE_CMDF_UNTRANSLATED 0x1000 /* needs endian translation */ +#define SE_CMD_MASK 0x0FFF /* se_cmd bits below the SE_CMDF_ flags */ + +#define SE_CMD_PAD 0x0000 /* always reserved to mean PAD */ + +/* + * SYSLINK_MSG - Syslink transactional command or response + * + * This structure represents a syslink transactional command or response + * between two end-points identified by the session id. Either end may + * initiate a command independent of the other. A transaction consists of + * the sending of a command and the reception of a response. + * + * Multiple transactions in each direction (and both directions at once) + * may occur in parallel. The command/reply transaction space in one + * direction is independent of the command/reply transaction space in the + * other direction. + * + * SM_PROTO rppppppx-ppppppx + * + * r 0 = Command, 1 = Reply + * + * x Used to detect endian reversal. The protocol id is OR'd + * with 0x0100 on transmission. If we find bit 0 set to 1 on + * reception, endian translation must occur. + * + * - Reserved, must be 0 + * + * p12 Encoded protocol number. Protocol 0 indicates PAD (r must + * be 0 as well). Protocols 0-63 are reserved and may only be + * used when officially recognized by the DragonFly project. + * 64-4095 are user defined. + * + * SM_BYTES bbbbbbbbbbbbbbbb + * + * b16 This is the size of the whole message, including headers + * but not including out-of-band DMA. All messages must + * be 8-byte aligned. Unlike syslink_elm structures, sm_bytes + * must be properly aligned. 
+ * + * SM_RLABEL llllllllllllllllllllllllllllllll + * + * l32 This is a 32 bit reply label routing id. The format of + * this field is defined by the transport layer. The field + * is typically assigned in the command message as it passes + * through the transport layer and is retained verbatim in + * the reply message. + * + * The most typical use of this field is as an aid to direct + * messages in a multi-threaded environment. For example, + * a kernel talking to a filesystem over a syslink might + * identify the thread originating the command in this field + * in order to allow the reply to be routed directly back to + * that thread. + * + * The field can also be used in crossbar switching meshes + * to identify both the originator and the target, but it + * should be noted that the verbatim requirement means the + * mesh must pick out the proper field based on the 'r'eply + * bit in sm_proto. + * + * SM_MSGID m64 + * + * m64 This 64 bit message id combined with the mesh id and the + * 'r'eply bit (and also the direction of the message when + * operating over a full-duplex syslink) uniquely identifies + * a syslink message. + * + * The message id is typically set to the address of the + * syslink message or control structure used by the originator, + * or obtained from a 64 bit counter. This way the originator + * can guarantee uniqueness without actually having to track + * message id allocations. + * + * SM_SESSID s64 + * + * s64 This is a 64 bit session id key whose primary purpose is to + * validate a link and prevent improperly routed or stale + * messages from having an adverse effect on the cluster. The + * field is typically left 0 for intra-host links. + * + * SM_HEAD (structure) + * + * All syslink messages other than PAD messages must contain at least + * one whole syslink_elm. Elements are arranged in an array until + * the syslink message space is exhausted. Each element may represent + * opaque data or recursively structured data. 
Structured data consists + * of an array of 0 or more elements embedded in the parent element. + * + * + * ENDIAN TRANSLATION - endian translation occurs when a message is received + * with bit 0 set in sm_proto, indicating that the native endian mode of + * the sender is different from the native endian mode of the receiver. + * Endian translation is NOT specific to little or big endian formatting + * but instead occurs only when the two sides have different native endian + * modes. All fields are interpreted structurally. Only little and big + * endian formats are supported (i.e. simple byte reversal). + * + * Translation consists of reversing the byte ordering for each structural + * field. Any syslink_elm structures are recursively translated as well, + * but opaque data contained within is not. The SE_CMDF_UNTRANSLATED bit + * in each translated syslink_elm structure is flipped. + * + * Syslink routers and switches may or may not translate a syslink_msg (but + * they must still properly interpret the syslink_msg header as the + * message passes through). It is possible for a message to be translated + * multiple times while it transits the network so it is important when + * translation occurs that the SE_CMDF_UNTRANSLATED bit in the syslink_elm + * structures gets flipped rather than simply set. 
*/ struct syslink_msg { - sl_msgid_t sh_msgid; /* message transaction control */ - sl_reclen_t sh_payloadoff; /* offset of payload as a DMA aid */ - sl_reclen_t sh_bytes; /* unaligned size of message */ + sl_proto_t sm_proto; /* protocol id, endian, reply bit */ + sl_reclen_t sm_bytes; /* 8-byte aligned size of message */ + sl_rlabel_t sm_rlabel; /* reply label routing id */ /* minimum syslink_msg size is 8 bytes (special PAD) */ - sysid_t sh_sessid; /* session id */ - sysid_t sh_srcphysid; /* transit routing */ - sysid_t sh_dstphysid; /* transit routing */ - /* 8-byte aligned structure */ - /* followed by structured data */ + sysid_t sm_msgid; /* message id */ + sysid_t sm_sessid; /* session id */ + struct syslink_elm sm_head; /* structured data */ }; /* - * MSGID handling. This controls message transactions and PAD. Terminal - * nodes, such as filesystems, are state driven entities whos syslink - * message transactions are directly supported by the local on-machine route - * nodes they connect to. The route nodes use various fields in the header, - * particularly sm_msgid, sm_sessid, and sm_payloadoff, to optimally present - * syslink messages to the terminal node. In particular, a route node may - * present the payload for a syslink message or the message itself through - * some out-of-band means, such as by mapping it into memory. - * - * These route nodes also handle timeout and retry processing, providing - * appropriate response messages to terminal nodes if the target never replies - * to a transaction or some other exceptional condition occurs. The route - * node does not handle RETRY and other exceptional conditions itself.. - * that is, the route node is not responsible for storing the message, only - * routing it. The route node only tracks the related session(s). - * - * A route node only directly supports terminal nodes directly connected to - * it. 
Intermediate route nodes ignore the MSGID (other then the all 0's PAD - * case) and do not track indirect sessions. For example, a piece of - * hardware doing syslink message routing does not have to mess with - * any of this. - * - * A session id establishes a session between two entities. One terminal node - * is considered to be the originator of the session, the other terminal node - * is the target. However, once established, EITHER ENTITY may initiate - * a transaction (or both simulataniously). SH_MSGID_CMD_ORIGINATOR is used - * in all messages and replies related to a transaction initiated by the - * session originator, and SH_MSGID_CMD_TARGET is used in all messages and - * replies related to a transaction initiated by the session target. - * Establishment of new sessions uses SH_MSGID_CMD_FORGE. - * - * Parallel transactions are supported by using different transaction ids - * amoungst the parallel transactions. Once a transaction id is used, it - * may not be reused until after the timeout period is exceeded. With 23 - * transaction id bits we have 8 million transaction ids, supporting around - * 26000 transactions per second with a 5 minute timeout. Note that - * multiple sessions may be established between any two entities, giving us - * essentially an unlimited number of transactions per second. - * - * ENDIANESS - syslink messages may be transported with any endianess. This - * includes all fields including the syslink header and syslink element - * header fields. If upon reception SH_MSGID_ENDIAN_NORM is set in the msgid - * both end-points will have the same endianess and no translation is - * required. If SH_MSGID_ENDIAN_REV is set then the two end-points have - * different endianess and translation is required. Only little endian and - * bit endian transport is supported (that is, a simple reversal of bytes for - * each field). - * - * Intermediate route nodes (i.e. 
those not tracking the session) may NOT - * translate the endianess of the message in any fashion. The management - * node that talks to the actual resource is responsible for doing the - * endian translations for all the above fields... everything except the - * syslink_elm payload, which is described later. + * Minimum sizes for syslink pads and syslink messages. Pads can be as + * small as 8 bytes and are 8-byte aligned. Syslink messages can be as + * small as sizeof(struct syslink_msg) and are 8-byte aligned. */ -#define SL_MIN_MESSAGE_SIZE offsetof(struct syslink_msg, sm_sessid) +#define SL_MIN_PAD_SIZE offsetof(struct syslink_msg, sm_msgid) +#define SL_MIN_MSG_SIZE sizeof(struct syslink_msg) +#define SL_MIN_ELM_SIZE sizeof(struct syslink_elm) #define SL_MSG_ALIGN(bytes) (((bytes) + 7) & ~7) -#define SH_MSGID_CMD_MASK 0xF0000000 -#define SH_MSGID_CMD_HEARTBEAT 0x60000000 /* seed heartbeat broadcast */ -#define SH_MSGID_CMD_TIMESYNC 0x50000000 /* timesync broadcast */ -#define SH_MSGID_CMD_ALLOCATE 0x40000000 /* allocate session id space */ -#define SH_MSGID_CMD_ORIGINATOR 0x30000000 /* origin initiated trans */ -#define SH_MSGID_CMD_TARGET 0x20000000 /* target initiated trans */ -#define SH_MSGID_CMD_ESTABLISH 0x10000000 /* establish session */ -#define SH_MSGID_CMD_PAD 0x00000000 - -#define SH_MSGID_REPLY 0x08000000 -#define SH_MSGID_ENDIAN_NORM 0x01000000 -#define SH_MSGID_ENDIAN_REV 0x00000001 -#define SM_MSGID_TRANS_MASK 0x00FFFFFE /* 23 bits */ - -/* - * A syslink message is broken up into three pieces: (1) The headers, (2) The - * message elements, and (3) DMA payload. - * - * A non-PAD syslink message contains a single top-level message element. - * Unlike recursive message elements which can be iterated, the top level - * element is never iterated. There is always only one. The top level - * element's aux field represents the RPC protocol id for the command. 
- * - * A PAD syslink message contains no message elements. The entire syslink - * message is considered pad based on the header. - * - * A structured syslink message element may be specified by setting - * SE_CMDF_STRUCTURED. The data payload for a structured message element - * is a sequence of ZERO or MORE message elements until the payload size is - * reached. Each message element may be opaque or structured. Fully - * recursive message elements are supported in this manner. - * - * A syslink message element with SE_CMDF_MASTERPAYLOAD set is associated - * with the master payload for the syslink message as a whole. This field - * is only interpreted by terminal nodes and does not have to be used this - * way, but its a good idea to for debugging purposes. - * - * Syslink message elements are always 8-byte aligned. In order to - * guarentee an 8-byte alignment for our extended data, a 32 bit auxillary - * field is always included as part of the official syslink_elm structure - * definition. This field is actually part of the element command's data - * and its use, if any, depends on the element command. - * - * Syslink message elements do not have to be validated by intermediate - * route nodes but must ALWAYS be validated by the route node that connects - * to the terminal node intended to receive the syslink message. - * - * Only the header fields of a syslink_elm are translated for endianess - * by the management node. If the management node does have to do an - * endian conversion it will also set SE_CMDF_UNTRANSLATED in se_cmd (all - * of them, recursively, since it has to validate and translate the entire - * hierarchy anyway) and the rpc mechanism will be responsible for doing - * the conversion and clearing the flag. The seu_proto field IS always - * translated, which means that when used as aux data it must be referenced - * as a 32 bit field. 
- * - * As a fringe benefit, since the RPC command is the entire se_cmd field, - * flags and all, an untranslated element will wind up with an unrecognized - * command code and be reported as an error rather then being mis-executed. +/* + * sm_proto field rppppppx-PPPPPPx encoded + * ----ppppppPPPPPP decoded + * + * Note: SMPROTO_ defines are in encoded form (reserved bit 7 stays 0) */ -struct syslink_elm { - sl_cmd_t se_cmd; - sl_reclen_t se_bytes; - union { - sl_auxdata_t seu_aux; /* aux data */ - sl_auxdata_t seu_proto; /* protocol field */ - } u; - /* extended by data */ -}; +#define SM_PROTO_REPLY 0x8000 +#define SM_PROTO_ENDIAN_NORM 0x0100 +#define SM_PROTO_ENDIAN_REV 0x0001 +#define SM_PROTO_ENCODE(n) ((((n) << 1) & 0x007E) | (((n) << 3) & 0x7E00) \ + | SM_PROTO_ENDIAN_NORM) +#define SM_PROTO_DECODE(n) ((((n) >> 1) & 0x003F) | \ + (((n) >> 3) & 0x0FC0)) -#define SE_CMDF_STRUCTURED 0x8000 /* structured, else opaque */ -#define SE_CMDF_RESERVED4000 0x4000 -#define SE_CMDF_MASTERPAYLOAD 0x2000 /* DMA payload association */ -#define SE_CMDF_UNTRANSLATED 0x1000 /* needs endian translation */ +/* + * Reserved protocol encodings 0-63 + */ +#define SMPROTO_PAD SM_PROTO_ENCODE(0x0000) + +/* + * high level protocol encodings + */ +#define SMPROTO_BSDVFS SM_PROTO_ENCODE(0x0040) + +/* + * Syslink messages may contain recursive components. The recursion depth + * allowed is limited to SL_MAXDEPTH. + * + * Syslink messages, NON-inclusive of any DMA buffers, are limited to + * SL_MAXSIZE bytes. DMA buffer limitations are not defined here but + * the expectation is that they can be fairly large. 
+ */ +#define SL_MAXDEPTH 10 +#define SL_MAXSIZE 4096 -#define SE_CMD_PAD 0x0000 /* CMD 0 is always PAD */ +/* + * slmsgalloc() sizes, also used to size the message unions below + */ +#define SLMSG_SMALL 256 +#define SLMSG_BIG SL_MAXSIZE + + +union syslink_small_msg { + struct syslink_msg msg; + char buf[SLMSG_SMALL]; /* makes the union at least SLMSG_SMALL bytes */ +}; + +union syslink_big_msg { + struct syslink_msg msg; + char buf[SLMSG_BIG]; /* makes the union at least SLMSG_BIG bytes */ +}; typedef struct syslink_msg *syslink_msg_t; typedef struct syslink_elm *syslink_elm_t;