--- src/sys/kern/kern_syslink.c 2007/04/26 02:10:59 1.10 +++ src/sys/kern/kern_syslink.c 2007/05/27 20:35:38 1.11 @@ -34,24 +34,23 @@ * $DragonFly$ */ /* - * This module implements the syslink() system call and protocol which - * is used to glue clusters together as well as to interface userland - * devices and filesystems to the kernel. - * - * We implement the management node concept in this module. A management - * node is basically a router node with additional features that take much - * of the protocol burden away from connecting terminal nodes. + * This module implements the core syslink() system call and provides + * glue for kernel syslink frontends and backends, creating an intra-host + * communications infrastructure and DMA transport abstraction. */ #include #include #include +#include #include #include #include #include #include #include +#include +#include #include #include #include @@ -60,231 +59,200 @@ #include #include #include +#include #include #include #include #include +#include #include "opt_syslink.h" /* - * Red-Black trees organizing the syslink 'router' nodes and connections - * to router nodes. 
- */ -struct slrouter; -struct sldata; - -RB_HEAD(slrouter_rb_tree, slrouter); -RB_HEAD(sldata_rb_tree, sldata); -RB_PROTOTYPE2(slrouter_rb_tree, slrouter, rbnode, - rb_slrouter_compare, sysid_t); -RB_PROTOTYPE2(sldata_rb_tree, sldata, rbnode, - rb_sldata_compare, int); - -/* - * Fifo used to buffer broadcast packets - */ -struct slbuf { - char *buf; - int bufsize; /* must be a power of 2 */ - int bufmask; /* (bufsize - 1) */ - int rindex; /* tail-chasing FIFO indices */ - int windex; -}; - -/* - * Syslink Router abstraction - */ -struct slrouter { - RB_ENTRY(slrouter) rbnode; /* list of routers */ - struct sldata_rb_tree sldata_rb_root; /* connections to router */ - sysid_t sysid; /* logical sysid of router */ - int flags; /* flags passed on create */ - int bits; /* accomodate connections */ - int count; /* number of connections */ - int refs; - alist_t bitmap; - struct slbuf bbuf; /* broadcast buffer */ - char label[SYSLINK_LABEL_SIZE]; -}; - -/* * Syslink Connection abstraction */ -struct sldata { - RB_ENTRY(sldata) rbnode; - struct slrouter *router; /* organizing router */ - struct file *xfp; /* external file pointer */ - struct lock rlock; /* synchronizing lock */ - struct lock wlock; /* synchronizing lock */ - struct thread *rthread; /* helper thread */ - struct thread *wthread; /* helper thread */ - struct sockbuf sior; /* accumulate incoming mbufs */ - struct sockbuf siow; /* accumulate outgoing mbufs */ - struct sockaddr sa; /* used w/SLIF_SUBNET mode */ - int bindex; /* broadcast index */ - int flags; /* connection flags */ - int linkid; - int bits; +struct slcommon { + struct spinlock spin; int refs; - char label[SYSLINK_LABEL_SIZE]; }; -#define SYSLINK_BBUFSIZE (32*1024) -#define SYSLINK_SIOBUFSIZE (128*1024) - -static int rb_slrouter_compare(struct slrouter *r1, struct slrouter *r2); -static int rb_sldata_compare(struct sldata *d1, struct sldata *d2); - -static int syslink_destroy(struct slrouter *slrouter); -static int syslink_add(struct slrouter 
*slrouter, - struct syslink_info *info, int *result); -static int syslink_rem(struct slrouter *slrouter, struct sldata *sldata, - struct syslink_info *info); - -static int syslink_read(struct file *fp, struct uio *uio, - struct ucred *cred, int flags); -static int syslink_write(struct file *fp, struct uio *uio, - struct ucred *cred, int flags); -static int syslink_close(struct file *fp); -static int syslink_stat(struct file *fp, struct stat *sb, struct ucred *cred); -static int syslink_shutdown(struct file *fp, int how); -static int syslink_ioctl(struct file *fp, u_long cmd, caddr_t data, - struct ucred *cred); -static int syslink_poll(struct file *fp, int events, struct ucred *cred); -static int syslink_kqfilter(struct file *fp, struct knote *kn); - -static void syslink_rthread_so(void *arg); -static void syslink_rthread_fp(void *arg); -static void syslink_wthread_so(void *arg); -static void syslink_wthread_fp(void *arg); -static int syslink_getsubnet(struct sockaddr *sa); -static struct mbuf *syslink_parse_stream(struct sockbuf *sio); -static void syslink_route(struct slrouter *slrouter, int linkid, struct mbuf *m); -static void slbuf_alloc(struct slbuf *buf, int bytes); -static void slbuf_free(struct slbuf *buf); -static void sldata_rels(struct sldata *sldata); -static void slrouter_rels(struct slrouter *slrouter); -static int process_syslink_msg(struct sldata *sldata, struct syslink_msg *head); -static int syslink_validate(struct syslink_msg *head, int bytes); - -RB_GENERATE2(slrouter_rb_tree, slrouter, rbnode, - rb_slrouter_compare, sysid_t, sysid); -RB_GENERATE2(sldata_rb_tree, sldata, rbnode, - rb_sldata_compare, int, linkid); - -static struct fileops syslinkops = { - .fo_read = syslink_read, - .fo_write = syslink_write, - .fo_ioctl = syslink_ioctl, - .fo_poll = syslink_poll, - .fo_kqfilter = syslink_kqfilter, - .fo_stat = syslink_stat, - .fo_close = syslink_close, - .fo_shutdown = syslink_shutdown +struct sldesc { + struct slmsgq inq; + struct slmsg_rb_tree 
reply_rb_root; /* replies to requests */ + struct spinlock spin; + struct sldesc *peer; /* peer syslink, if any */ + struct file *xfp; /* external file pointer */ + struct slcommon *common; + int flags; + int rwaiters; /* number of threads waiting */ + int wblocked; /* blocked waiting for us to drain */ + size_t cmdbytes; /* unreplied commands pending */ + size_t repbytes; /* undrained replies pending */ + int (*backend_wblocked)(struct sldesc *, int, sl_proto_t); + int (*backend_write)(struct sldesc *, struct slmsg *); + void (*backend_reply)(struct sldesc *,struct slmsg *,struct slmsg *); + void (*backend_dispose)(struct sldesc *, struct slmsg *); }; -MALLOC_DEFINE(M_SYSLINK, "syslink", "syslink manager"); +#define SLF_RSHUTDOWN 0x0001 +#define SLF_WSHUTDOWN 0x0002 -static int syslink_enabled; -SYSCTL_INT(_kern, OID_AUTO, syslink_enabled, - CTLFLAG_RW, &syslink_enabled, 0, "Enable SYSLINK"); +static int syslink_cmd_new(struct syslink_info_new *info, int *result); +static struct sldesc *allocsldesc(struct slcommon *common); +static void setsldescfp(struct sldesc *sl, struct file *fp); +static void shutdownsldesc(struct sldesc *sl, int how); +static void shutdownsldesc2(struct sldesc *sl, int how); +static void sldrop(struct sldesc *sl); +static int syslink_validate_msg(struct syslink_msg *msg, int bytes); +static int syslink_validate_elm(struct syslink_elm *elm, sl_reclen_t bytes, + int swapit, int depth); + +static int backend_wblocked_user(struct sldesc *sl, int nbio, sl_proto_t proto); +static int backend_write_user(struct sldesc *sl, struct slmsg *slmsg); +static void backend_reply_user(struct sldesc *sl, struct slmsg *slcmd, + struct slmsg *slrep); +static void backend_dispose_user(struct sldesc *sl, struct slmsg *slmsg); + +static int backend_wblocked_kern(struct sldesc *sl, int nbio, sl_proto_t proto); +static int backend_write_kern(struct sldesc *sl, struct slmsg *slmsg); +static void backend_reply_kern(struct sldesc *sl, struct slmsg *slcmd, + struct 
slmsg *slrep); +static void backend_dispose_kern(struct sldesc *sl, struct slmsg *slmsg); /* - * Support declarations and compare function for our RB trees + * Objcache memory backend + * + * All three object caches return slmsg structures but each is optimized + * for syslink message buffers of varying sizes. We use the slightly + * more complex ctor/dtor API in order to provide ready-to-go slmsg's. */ -static struct slrouter_rb_tree slrouter_rb_root; -static int -rb_slrouter_compare(struct slrouter *r1, struct slrouter *r2) -{ - if (r1->sysid < r2->sysid) - return(-1); - if (r1->sysid > r2->sysid) - return(1); - return(0); -} +struct objcache *sl_objcache_big; +struct objcache *sl_objcache_small; +struct objcache *sl_objcache_none; -static int -rb_sldata_compare(struct sldata *d1, struct sldata *d2) -{ - if (d1->linkid < d2->linkid) - return(-1); - if (d1->linkid > d2->linkid) - return(1); - return(0); -} +MALLOC_DEFINE(M_SYSLINK, "syslink", "syslink manager"); -/* - * Compare and callback functions for first-sysid and first-linkid searches. 
- */ -static int -syslink_cmd_locate_cmp(struct slrouter *slrouter, void *data) +static boolean_t slmsg_ctor(void *data, void *private, int ocflags); +static void slmsg_dtor(void *data, void *private); + +static +void +syslinkinit(void *dummy __unused) { - struct syslink_info *info = data; + size_t n = sizeof(struct slmsg); - if (slrouter->sysid < info->sysid) - return(-1); - if (slrouter->sysid > info->sysid) - return(1); - return(0); + sl_objcache_none = objcache_create_mbacked(M_SYSLINK, n, 0, 64, + slmsg_ctor, slmsg_dtor, + &sl_objcache_none); + sl_objcache_small= objcache_create_mbacked(M_SYSLINK, n, 0, 64, + slmsg_ctor, slmsg_dtor, + &sl_objcache_small); + sl_objcache_big = objcache_create_mbacked(M_SYSLINK, n, 0, 16, + slmsg_ctor, slmsg_dtor, + &sl_objcache_big); } -static int -syslink_cmd_locate_callback(struct slrouter *slrouter, void *data) +static +boolean_t +slmsg_ctor(void *data, void *private, int ocflags) { - struct syslink_info *info = data; + struct slmsg *slmsg = data; - info->flags = slrouter->flags; /* also clears SLIF_ERROR */ - bcopy(slrouter->label, info->label, SYSLINK_LABEL_SIZE); + bzero(slmsg, sizeof(*slmsg)); - return(-1); + slmsg->oc = *(struct objcache **)private; + if (slmsg->oc == sl_objcache_none) { + slmsg->maxsize = 0; + } else if (slmsg->oc == sl_objcache_small) { + slmsg->maxsize = SLMSG_SMALL; + } else if (slmsg->oc == sl_objcache_big) { + slmsg->maxsize = SLMSG_BIG; + } else { + panic("slmsg_ctor: bad objcache?\n"); + } + if (slmsg->maxsize) { + slmsg->msg = kmalloc(slmsg->maxsize, + M_SYSLINK, M_WAITOK|M_ZERO); + } + return(TRUE); } -static int -syslink_cmd_find_cmp(struct sldata *sldata, void *data) +static +void +slmsg_dtor(void *data, void *private) { - struct syslink_info *info = data; + struct slmsg *slmsg = data; - if (sldata->linkid < info->linkid) - return(-1); - if (sldata->linkid > info->linkid) - return(1); - return(0); + if (slmsg->maxsize && slmsg->msg) { + kfree(slmsg->msg, M_SYSLINK); + slmsg->msg = NULL; + } + 
slmsg->oc = NULL; } -static int -syslink_cmd_find_callback(struct sldata *sldata, void *data) -{ - struct syslink_info *info = data; +SYSINIT(syslink, SI_BOOT2_MACHDEP, SI_ORDER_ANY, syslinkinit, NULL) - info->linkid = sldata->linkid; - info->flags = sldata->flags; /* also clears SLIF_ERROR */ - bcopy(sldata->label, info->label, SYSLINK_LABEL_SIZE); +static int rb_slmsg_compare(struct slmsg *msg1, struct slmsg *msg2); +RB_GENERATE2(slmsg_rb_tree, slmsg, rbnode, rb_slmsg_compare, + sysid_t, msg->sm_msgid); - return(-1); -} +/* + * Sysctl elements + */ +static int syslink_enabled; +SYSCTL_NODE(_kern, OID_AUTO, syslink, CTLFLAG_RW, 0, "Pipe operation"); +SYSCTL_INT(_kern_syslink, OID_AUTO, enabled, + CTLFLAG_RW, &syslink_enabled, 0, "Enable SYSLINK"); +static size_t syslink_bufsize = 65536; +SYSCTL_UINT(_kern_syslink, OID_AUTO, bufsize, + CTLFLAG_RW, &syslink_bufsize, 0, "Maximum buffer size"); /* - * Primary system call interface - associate a full-duplex stream - * (typically a pipe or a connected socket) with a sysid namespace, - * or create a direct link. + * Fileops API - typically used to glue a userland frontend with a + * kernel backend. 
+ */ + +static int slfileop_read(struct file *fp, struct uio *uio, + struct ucred *cred, int flags); +static int slfileop_write(struct file *fp, struct uio *uio, + struct ucred *cred, int flags); +static int slfileop_close(struct file *fp); +static int slfileop_stat(struct file *fp, struct stat *sb, struct ucred *cred); +static int slfileop_shutdown(struct file *fp, int how); +static int slfileop_ioctl(struct file *fp, u_long cmd, caddr_t data, + struct ucred *cred); +static int slfileop_poll(struct file *fp, int events, struct ucred *cred); +static int slfileop_kqfilter(struct file *fp, struct knote *kn); + +static struct fileops syslinkops = { + .fo_read = slfileop_read, + .fo_write = slfileop_write, + .fo_ioctl = slfileop_ioctl, + .fo_poll = slfileop_poll, + .fo_kqfilter = slfileop_kqfilter, + .fo_stat = slfileop_stat, + .fo_close = slfileop_close, + .fo_shutdown = slfileop_shutdown +}; + +/************************************************************************ + * PRIMARY SYSTEM CALL INTERFACE * + ************************************************************************ * * syslink(int cmd, struct syslink_info *info, size_t bytes) */ int sys_syslink(struct syslink_args *uap) { - struct syslink_info info; - struct slrouter *slrouter = NULL; - struct sldata *sldata = NULL; + union syslink_info_all info; int error; - int n; /* * System call is under construction and disabled by default. - * Superuser access is also required. + * Superuser access is also required for now, but eventually + * will not be needed. */ if (syslink_enabled == 0) return (EAUTH); @@ -307,1059 +275,970 @@ sys_syslink(struct syslink_args *uap) if (error) return (error); - if (info.label[sizeof(info.label)-1] != 0) - return (EINVAL); - /* - * Process command + * Process the command */ switch(uap->cmd) { - case SYSLINK_CMD_CREATE: - /* - * Create a new syslink router node. Set refs to prevent the - * router node from being destroyed. 
One ref is our temporary - * reference while the other is the SLIF_DESTROYED-interlocked - * reference. - */ - if (info.bits < 2 || info.bits > SYSLINK_ROUTER_MAXBITS) - return (EINVAL); - slrouter = kmalloc(sizeof(struct slrouter), M_SYSLINK, - M_WAITOK|M_ZERO); - if (slrouter_rb_tree_RB_LOOKUP(&slrouter_rb_root, info.sysid)) { - kfree(slrouter, M_SYSLINK); - slrouter = NULL; - return (EINVAL); - } - slrouter->sysid = info.sysid; - slrouter->refs = 2; - slrouter->bits = info.bits; - slrouter->flags = info.flags & SLIF_USERFLAGS; - slrouter->bitmap = alist_create(1 << info.bits, M_SYSLINK); - slbuf_alloc(&slrouter->bbuf, SYSLINK_BBUFSIZE); - RB_INIT(&slrouter->sldata_rb_root); - RB_INSERT(slrouter_rb_tree, &slrouter_rb_root, slrouter); - break; - case SYSLINK_CMD_DESTROY: - /* - * Destroy a syslink router node. The physical node is - * not freed until our temporary reference is removed. - */ - slrouter = slrouter_rb_tree_RB_LOOKUP(&slrouter_rb_root, - info.sysid); - if (slrouter) { - ++slrouter->refs; - if ((slrouter->flags & SLIF_DESTROYED) == 0) { - slrouter->flags |= SLIF_DESTROYED; - /* SLIF_DESTROYED interlock */ - slrouter_rels(slrouter); - error = syslink_destroy(slrouter); - /* still holding our private interlock */ - } - } - break; - case SYSLINK_CMD_LOCATE: - /* - * Locate the first syslink router node >= info.sysid - */ - info.flags |= SLIF_ERROR; - n = slrouter_rb_tree_RB_SCAN( - &slrouter_rb_root, - syslink_cmd_locate_cmp, syslink_cmd_locate_callback, - &info); - if (info.flags & SLIF_ERROR) - error = ENOENT; - break; - case SYSLINK_CMD_ADD: - slrouter = slrouter_rb_tree_RB_LOOKUP(&slrouter_rb_root, info.sysid); - if (info.bits && - (info.bits < 2 || info.bits > SYSLINK_ROUTER_MAXBITS)) { - error = EINVAL; - } else if (slrouter && (slrouter->flags & SLIF_DESTROYED)) { - /* - * Someone is trying to destroy this route node, - * no new adds please! 
- */ - error = EIO; - } else if (slrouter) { - ++slrouter->refs; - error = syslink_add(slrouter, &info, - &uap->sysmsg_result); - } else { - error = EINVAL; - } - break; - case SYSLINK_CMD_REM: - slrouter = slrouter_rb_tree_RB_LOOKUP(&slrouter_rb_root, - info.sysid); - if (slrouter) { - ++slrouter->refs; - sldata = sldata_rb_tree_RB_LOOKUP(&slrouter->sldata_rb_root, info.linkid); - if (sldata) { - ++sldata->refs; - error = syslink_rem(slrouter, sldata, &info); - } else { - error = ENOENT; - } - } else { - error = EINVAL; - } - break; - case SYSLINK_CMD_FIND: - slrouter = slrouter_rb_tree_RB_LOOKUP(&slrouter_rb_root, info.sysid); - info.flags |= SLIF_ERROR; - if (slrouter) { - ++slrouter->refs; - n = sldata_rb_tree_RB_SCAN( - &slrouter->sldata_rb_root, - syslink_cmd_find_cmp, syslink_cmd_find_callback, - &info); - if (info.flags & SLIF_ERROR) - error = ENOENT; - } else { - error = EINVAL; - } + case SYSLINK_CMD_NEW: + error = syslink_cmd_new(&info.cmd_new, &uap->sysmsg_result); break; default: error = EINVAL; break; } - - /* - * Cleanup - */ - if (sldata) - sldata_rels(sldata); - if (slrouter) - slrouter_rels(slrouter); + if (error == 0 && info.head.wbflag) + copyout(&info, uap->info, uap->bytes); return (error); } +/* + * Create a linked pair of descriptors, like a pipe. 
+ */ static int -syslink_destroy_callback(struct sldata *sldata, void *data __unused) +syslink_cmd_new(struct syslink_info_new *info, int *result) { - ++sldata->refs; - if ((sldata->flags & SLIF_RQUIT) == 0) { - sldata->flags |= SLIF_RQUIT; - wakeup(&sldata->rthread); - } - if ((sldata->flags & SLIF_WQUIT) == 0) { - sldata->flags |= SLIF_WQUIT; - wakeup(&sldata->wthread); - } - sldata_rels(sldata); + struct proc *p = curproc; + struct file *fp1; + struct file *fp2; + struct sldesc *sl; + struct sldesc *slpeer; + int error; + int fd1, fd2; + + error = falloc(p, &fp1, &fd1); + if (error) + return(error); + error = falloc(p, &fp2, &fd2); + if (error) { + fsetfd(p, NULL, fd1); + fdrop(fp1); + return(error); + } + slpeer = allocsldesc(NULL); + slpeer->backend_wblocked = backend_wblocked_user; + slpeer->backend_write = backend_write_user; + slpeer->backend_reply = backend_reply_user; + slpeer->backend_dispose = backend_dispose_user; + sl = allocsldesc(slpeer->common); + sl->peer = slpeer; + sl->backend_wblocked = backend_wblocked_user; + sl->backend_write = backend_write_user; + sl->backend_reply = backend_reply_user; + sl->backend_dispose = backend_dispose_user; + slpeer->peer = sl; + + setsldescfp(sl, fp1); + setsldescfp(slpeer, fp2); + + fsetfd(p, fp1, fd1); + fdrop(fp1); + fsetfd(p, fp2, fd2); + fdrop(fp2); + + info->head.wbflag = 1; /* write back */ + info->fds[0] = fd1; + info->fds[1] = fd2; + return(0); } -/* - * Shutdown all the connections going into this syslink. +/************************************************************************ + * LOW LEVEL SLDESC SUPPORT * + ************************************************************************ * - * Try to wait for completion, but return after 1 second - * regardless. 
*/ + static -int -syslink_destroy(struct slrouter *slrouter) +struct sldesc * +allocsldesc(struct slcommon *common) { - int retries = 10; + struct sldesc *sl; - while (!RB_EMPTY(&slrouter->sldata_rb_root) && retries) { - RB_SCAN(sldata_rb_tree, &slrouter->sldata_rb_root, NULL, - syslink_destroy_callback, slrouter); - --retries; - tsleep(&retries, 0, "syslnk", hz / 10); - } - if (RB_EMPTY(&slrouter->sldata_rb_root)) - return(0); - else - return(EINPROGRESS); + sl = kmalloc(sizeof(struct sldesc), M_SYSLINK, M_WAITOK|M_ZERO); + if (common == NULL) + common = kmalloc(sizeof(*common), M_SYSLINK, M_WAITOK|M_ZERO); + TAILQ_INIT(&sl->inq); /* incoming requests */ + RB_INIT(&sl->reply_rb_root); /* match incoming replies */ + spin_init(&sl->spin); + sl->common = common; + ++common->refs; + return(sl); +} + +static +void +setsldescfp(struct sldesc *sl, struct file *fp) +{ + sl->xfp = fp; + fp->f_type = DTYPE_SYSLINK; + fp->f_flag = FREAD | FWRITE; + fp->f_ops = &syslinkops; + fp->f_data = sl; } +/* + * Red-black tree compare function + */ static int -syslink_add(struct slrouter *slrouter, struct syslink_info *info, - int *result) +rb_slmsg_compare(struct slmsg *msg1, struct slmsg *msg2) { - struct sldata *sldata; - struct file *fp; - int maxphys; - int numphys; - int linkid; - int error; + if (msg1->msg->sm_msgid < msg2->msg->sm_msgid) + return(-1); + if (msg1->msg->sm_msgid == msg2->msg->sm_msgid) + return(0); + return(1); +} + +static +void +shutdownsldesc(struct sldesc *sl, int how) +{ + struct slmsg *slmsg; + int rhow; - error = 0; - maxphys = 1 << slrouter->bits; - numphys = info->bits ? (1 << info->bits) : 1; + shutdownsldesc2(sl, how); /* - * Create a connection to the route node and allocate a physical ID. - * Physical ID 0 is reserved for the route node itself, and an all-1's - * ID is reserved as a broadcast address. 
+ * Return unread and unreplied messages */ - sldata = kmalloc(sizeof(struct sldata), M_SYSLINK, M_WAITOK|M_ZERO); - - linkid = alist_alloc(slrouter->bitmap, numphys); - if (linkid == ALIST_BLOCK_NONE) { - kfree(sldata, M_SYSLINK); - return (ENOSPC); + spin_lock_wr(&sl->spin); + while ((slmsg = TAILQ_FIRST(&sl->inq)) != NULL) { + TAILQ_REMOVE(&sl->inq, slmsg, tqnode); + spin_unlock_wr(&sl->spin); + if (slmsg->msg->sm_proto & SM_PROTO_REPLY) { + sl->repbytes -= slmsg->maxsize; + slmsg->flags &= ~SLMSGF_ONINQ; + sl->peer->backend_dispose(sl->peer, slmsg); + } + /* leave ONINQ set for commands, it will cleared below */ + spin_lock_wr(&sl->spin); + } + while ((slmsg = RB_ROOT(&sl->reply_rb_root)) != NULL) { + RB_REMOVE(slmsg_rb_tree, &sl->reply_rb_root, slmsg); + sl->cmdbytes -= slmsg->maxsize; + spin_unlock_wr(&sl->spin); + slmsg->flags &= ~SLMSGF_ONINQ; + sl->peer->backend_reply(sl->peer, slmsg, NULL); + spin_lock_wr(&sl->spin); } + spin_unlock_wr(&sl->spin); /* - * Insert the node, initializing enough fields to prevent things from - * being ripped out from under us before we have a chance to complete - * the system call. + * Call shutdown on the peer with the opposite flags */ - sldata->linkid = linkid; - sldata->refs = 1; - ++slrouter->count; - if (sldata_rb_tree_RB_LOOKUP(&slrouter->sldata_rb_root, linkid)) - panic("syslink_add: free linkid wasn't free!"); - RB_INSERT(sldata_rb_tree, &slrouter->sldata_rb_root, sldata); + rhow = 0; + if (how & SHUT_RD) + rhow |= SHUT_WR; + if (how & SHUT_WR) + rhow |= SHUT_RD; + shutdownsldesc2(sl->peer, rhow); +} + +static +void +shutdownsldesc2(struct sldesc *sl, int how) +{ + spin_lock_wr(&sl->spin); + if (how & SHUT_RD) + sl->flags |= SLF_RSHUTDOWN; + if (how & SHUT_WR) + sl->flags |= SLF_WSHUTDOWN; + spin_unlock_wr(&sl->spin); /* - * Complete initialization of the physical route node. Setting - * sldata->router activates the node. 
+ * Handle signaling on the user side */ - sbinit(&sldata->sior, SYSLINK_SIOBUFSIZE); - sbinit(&sldata->siow, SYSLINK_SIOBUFSIZE); - sldata->bindex = slrouter->bbuf.windex; - sldata->flags = info->flags & SLIF_USERFLAGS; - lockinit(&sldata->rlock, "slread", 0, 0); - lockinit(&sldata->wlock, "slwrite", 0, 0); - bcopy(&info->u.sa, &sldata->sa, sizeof(sldata->sa)); - - if (info->fd < 0) { - /* - * We create a direct syslink descriptor. No helper threads - * are needed. - */ - error = falloc(curproc, &fp, &info->fd); - if (error == 0) { - fp->f_type = DTYPE_SYSLINK; - fp->f_flag = FREAD | FWRITE; - fp->f_ops = &syslinkops; - fp->f_data = sldata; - /* one ref: the fp descriptor */ - sldata->refs += 1; - sldata->flags |= SLIF_WQUIT | SLIF_WDONE; - sldata->flags |= SLIF_RQUIT | SLIF_RDONE; - fsetfd(curproc, fp, info->fd); - fdrop(fp); - *result = info->fd; - } - } else { - sldata->xfp = holdfp(curproc->p_fd, info->fd, -1); - if (sldata->xfp != NULL) { - /* two refs: reader thread and writer thread */ - sldata->refs += 2; - if (sldata->xfp->f_type == DTYPE_SOCKET) { - lwkt_create(syslink_rthread_so, sldata, - &sldata->rthread, NULL, - 0, -1, "syslink_r"); - lwkt_create(syslink_wthread_so, sldata, - &sldata->wthread, NULL, - 0, -1, "syslink_w"); - } else { - lwkt_create(syslink_rthread_fp, sldata, - &sldata->rthread, NULL, - 0, -1, "syslink_r"); - lwkt_create(syslink_wthread_fp, sldata, - &sldata->wthread, NULL, - 0, -1, "syslink_w"); - } - } else { - error = EBADF; + if (how & SHUT_RD) { + if (sl->rwaiters) + wakeup(&sl->rwaiters); + } + if (how & SHUT_WR) { + if (sl->wblocked) { + sl->wblocked = 0; /* race ok */ + wakeup(&sl->wblocked); } } - sldata->router = slrouter; - sldata_rels(sldata); - return(error); } static -int -syslink_rem(struct slrouter *slrouter, struct sldata *sldata, - struct syslink_info *info) +void +sldrop(struct sldesc *sl) { - int error = EINPROGRESS; + struct sldesc *slpeer; - if ((sldata->flags & SLIF_RQUIT) == 0) { - sldata->flags |= SLIF_RQUIT; - 
wakeup(&sldata->rthread); - error = 0; - } - if ((sldata->flags & SLIF_WQUIT) == 0) { - sldata->flags |= SLIF_WQUIT; - wakeup(&sldata->wthread); - error = 0; + spin_lock_wr(&sl->common->spin); + if (--sl->common->refs == 0) { + spin_unlock_wr(&sl->common->spin); + if ((slpeer = sl->peer) != NULL) { + sl->peer = NULL; + slpeer->peer = NULL; + slpeer->common = NULL; + KKASSERT(slpeer->xfp == NULL); + KKASSERT(TAILQ_EMPTY(&slpeer->inq)); + KKASSERT(RB_EMPTY(&slpeer->reply_rb_root)); + kfree(slpeer, M_SYSLINK); + } + KKASSERT(sl->xfp == NULL); + KKASSERT(TAILQ_EMPTY(&sl->inq)); + KKASSERT(RB_EMPTY(&sl->reply_rb_root)); + kfree(sl->common, M_SYSLINK); + sl->common = NULL; + kfree(sl, M_SYSLINK); + } else { + spin_unlock_wr(&sl->common->spin); } - return(error); } -/* - * Read syslink messages from an external socket and route them. +/************************************************************************ + * FILEOPS API * + ************************************************************************ + * + * Implement userland fileops. + * + * MPSAFE ops */ static -void -syslink_rthread_so(void *arg) +int +slfileop_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags) { - struct sldata *sldata = arg; - struct socket *so; - struct sockaddr *sa; - struct mbuf *m; - int soflags; - int linkid; + struct sldesc *sl = fp->f_data; /* fp refed on call */ + struct slmsg *slmsg; + struct iovec *iov0; int error; - int needsa; - - so = (void *)sldata->xfp->f_data; - sa = NULL; + int nbio; /* - * Calculate whether we need to get the peer address or not. - * We need to obtain the peer address for packet-mode sockets - * representing subnets (rather then single connections). + * Kinda messy. 
Figure out the non-blocking state */ - needsa = (sldata->bits && (sldata->flags & SLIF_PACKET)); + if (flags & O_FBLOCKING) + nbio = 0; + else if (flags & O_FNONBLOCKING) + nbio = 1; + else if (fp->f_flag & O_NONBLOCK) + nbio = 1; + else + nbio = 0; - while ((sldata->flags & SLIF_RQUIT) == 0) { - /* - * Read some data. This is easy if the data is packetized, - * otherwise we can still obtain an mbuf chain but we have - * to parse out the syslink messages. - */ - soflags = 0; - error = so_pru_soreceive(so, - (needsa ? &sa : NULL), - NULL, &sldata->sior, - NULL, &soflags); + /* + * Validate the uio + */ + if (uio->uio_iovcnt < 1) { + error = 0; + goto done2; + } + iov0 = &uio->uio_iov[0]; - /* - * The target is responsible for adjusting the src address - * field in the syslink_msg. We may need subnet information - * from the sockaddr to accomplish this. - * - * For streams representing subnets the originator is - * responsible for tagging its subnet bits in the src - * address but we have to renormalize - */ - linkid = sldata->linkid; - if (sldata->flags & SLIF_PACKET) { - if (sldata->bits) { - linkid += syslink_getsubnet(sa) & - ((1 << sldata->bits) - 1); - } - if ((m = sldata->sior.sb_mb) != NULL) { - sbinit(&sldata->sior, SYSLINK_SIOBUFSIZE); - syslink_route(sldata->router, linkid, m); - } - } else { - while ((m = syslink_parse_stream(&sldata->sior)) != NULL) { - syslink_route(sldata->router, linkid, m); - } + /* + * Get a message, blocking if necessary. + */ + spin_lock_wr(&sl->spin); + while ((slmsg = TAILQ_FIRST(&sl->inq)) == NULL) { + if (sl->flags & SLF_RSHUTDOWN) { + error = 0; + goto done1; } - - - - /* - * - */ - if ((sldata->flags & SLIF_SUBNET) && sldata->bits && sa) { - linkid += syslink_getsubnet(sa) & - ((1 << sldata->bits) - 1); - FREE(sa, M_SONAME); - } - if (error) - break; - - /* - * Note: Incoming syslink messages must have their headers - * adjusted to reflect the origination address. This will - * be handled by syslink_route. 
- */ - if (sldata->flags & SLIF_PACKET) { - /* - * Packetized data can just be directly routed. - */ - if ((m = sldata->sior.sb_mb) != NULL) { - sbinit(&sldata->sior, SYSLINK_SIOBUFSIZE); - syslink_route(sldata->router, linkid, m); - } - } else { - /* - * Stream data has to be parsed out. - */ - while ((m = syslink_parse_stream(&sldata->sior)) != NULL) { - syslink_route(sldata->router, linkid, m); - } + if (nbio) { + error = EAGAIN; + goto done1; } + ++sl->rwaiters; + error = msleep(&sl->rwaiters, &sl->spin, PCATCH, "slrmsg", 0); + --sl->rwaiters; + if (error) + goto done1; } /* - * Mark us as done and deref sldata. Tell the writer to terminate as - * well. + * We have a message. If there isn't enough space, return + * ENOSPC without dequeueing it. */ - sldata->flags |= SLIF_RDONE; - sbflush(&sldata->sior); - sbflush(&sldata->siow); - if ((sldata->flags & SLIF_WDONE) == 0) { - sldata->flags |= SLIF_WQUIT; - wakeup(&sldata->wthread); - } - wakeup(&sldata->rthread); - wakeup(&sldata->wthread); - sldata_rels(sldata); -} - -/* - * Read syslink messages from an external descriptor and route them. Used - * when no socket interface is available. - */ -static -void -syslink_rthread_fp(void *arg) -{ - struct sldata *sldata = arg; + if (slmsg->msgsize > iov0->iov_len) { + error = ENOSPC; + goto done1; + } -#if 0 /* - * Loop until told otherwise + * Dequeue the message. Adjust repbytes immediately. cmdbytes + * are adjusted when the command is replied to, not here. */ - while ((sldata->flags & SLIF_RQUIT) == 0) { - error = fp_read(slink->xfp, - slbuf->buf + - (slbuf->windex & slbuf->bufmask - ), - count, &count, 0, UIO_SYSSPACE); - } -#endif + TAILQ_REMOVE(&sl->inq, slmsg, tqnode); + if (slmsg->msg->sm_proto & SM_PROTO_REPLY) + sl->repbytes -= slmsg->maxsize; + spin_unlock_wr(&sl->spin); /* - * Mark us as done and deref sldata. Tell the writer to terminate as - * well. + * Load the message data into the user buffer and clean up. We + * may have to wakeup blocked writers. 
*/ - sldata->flags |= SLIF_RDONE; - sbflush(&sldata->sior); - sbflush(&sldata->siow); - if ((sldata->flags & SLIF_WDONE) == 0) { - sldata->flags |= SLIF_WQUIT; - wakeup(&sldata->wthread); + if ((error = uiomove((void *)slmsg->msg, slmsg->msgsize, uio)) == 0) { + /* yip yip */ } - wakeup(&sldata->rthread); - wakeup(&sldata->wthread); - sldata_rels(sldata); -} - -static -struct mbuf * -syslink_parse_stream(struct sockbuf *sio) -{ - return(NULL); -} - -static -void -syslink_route(struct slrouter *slrouter, int linkid, struct mbuf *m) -{ - m_freem(m); -} - -#if 0 - - - int count; - int used; - int error; + if (slmsg->msg->sm_proto & SM_PROTO_REPLY) { /* - * Calculate contiguous space available to read and read as - * much as possible. - * - * If the entire buffer is used there's probably a format - * error of some sort and we terminate the link. + * Dispose of any received reply after we've copied it + * to userland. We don't need the slmsg any more. */ - used = slbuf->windex - slbuf->rindex; - error = 0; - + slmsg->flags &= ~SLMSGF_ONINQ; + sl->peer->backend_dispose(sl->peer, slmsg); + if (sl->wblocked && sl->repbytes < syslink_bufsize) { + sl->wblocked = 0; /* MP race ok here */ + wakeup(&sl->wblocked); + } + } else if (error) { /* - * Read some data, terminate the link if an error occurs or - * if EOF is encountered. xfp can be NULL, indicating that - * the data was injected by other means. + * Reply to a command that we failed to copy to userspace. 
*/ - if (sldata->xfp) { - count = slbuf->bufsize - - (slbuf->windex & slbuf->bufmask); - if (count > slbuf->bufsize - used) - count = slbuf->bufsize - used; - if (count == 0) - break; - error = fp_read(sldata->xfp, - slbuf->buf + - (slbuf->windex & slbuf->bufmask), - count, &count, 0, UIO_SYSSPACE); - if (error) - break; - if (count == 0) - break; - slbuf->windex += count; - used += count; - } else { - tsleep(slbuf, 0, "fiford", 0); + spin_lock_wr(&sl->spin); + RB_REMOVE(slmsg_rb_tree, &sl->reply_rb_root, slmsg); + sl->cmdbytes -= slmsg->maxsize; + spin_unlock_wr(&sl->spin); + slmsg->flags &= ~SLMSGF_ONINQ; + sl->peer->backend_reply(sl->peer, slmsg, NULL); + if (sl->wblocked && sl->cmdbytes < syslink_bufsize) { + sl->wblocked = 0; /* MP race ok here */ + wakeup(&sl->wblocked); } - + } else { /* - * Process as many syslink messages as we can. The record - * length must be at least a minimal PAD record (8 bytes). + * Leave the command in the RB tree but clear ONINQ now + * that we have returned it to userland so userland can + * reply to it. */ - while (slbuf->windex - slbuf->rindex >= min_msg_size) { - int aligned_reclen; - - head = (void *)(slbuf->buf + - (slbuf->rindex & slbuf->bufmask)); - if (head->sm_bytes < min_msg_size) { - error = EINVAL; - break; - } - aligned_reclen = SLMSG_ALIGN(head->sm_bytes); - - /* - * Disallow wraps - */ - if ((slbuf->rindex & slbuf->bufmask) > - ((slbuf->rindex + aligned_reclen) & slbuf->bufmask) - ) { - error = EINVAL; - break; - } - - /* - * Insufficient data read - */ - if (slbuf->windex - slbuf->rindex < aligned_reclen) - break; - - /* - * Process non-pad messages. Non-pad messages have - * to be at least the size of the syslink_msg - * structure. - * - * A PAD message's sm_cmd field contains 0. 
- */ - if (head->sm_cmd) { - if (head->sm_bytes < sizeof(*head)) { - error = EINVAL; - break; - } - error = process_syslink_msg(sldata, head); - if (error) - break; - } - cpu_sfence(); - slbuf->rindex += aligned_reclen; - } - if (error) - break; + slmsg->flags &= ~SLMSGF_ONINQ; } - + return(error); +done1: + spin_unlock_wr(&sl->spin); +done2: + return(error); } -#endif - /* - * This thread takes outgoing syslink messages queued to wbuf and writes them - * to the descriptor. PAD is stripped. PAD is also added as required to - * conform to the outgoing descriptor's buffering requirements. + * Userland writes syslink message */ static -void -syslink_wthread_so(void *arg) +int +slfileop_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags) { - struct sldata *sldata = arg; - struct slrouter *slrouter; - struct syslink_msg *head; - struct sockaddr *sa; - struct socket *so; - struct iovec aiov; - struct uio auio; + struct sldesc *sl = fp->f_data; + struct slmsg *slmsg; + struct slmsg *slcmd; + struct syslink_msg sltmp; + struct iovec *iov0; + sl_proto_t proto; + int nbio; int error; - int avail; - int bytes; - -#if 0 - so = (void *)sldata->xfp->f_data; - slrouter = sldata->router; - - while ((sldata->flags & SLIF_WQUIT) == 0) { - /* - * Deal with any broadcast data sitting in the route node's - * broadcast buffer. If we have fallen too far behind the - * data may no longer be valid. - * - * avail -- available data in broadcast buffer and - * bytes -- available contiguous data in broadcast buffer - */ - if (slrouter->bbuf.rindex - sldata->bindex > 0) - sldata->bindex = slrouter->bbuf.rindex; - if ((avail = slrouter->bbuf.windex - sldata->bindex) > 0) { - bytes = slrouter->bbuf.bufsize - - (sldata->bindex & slrouter->bbuf.bufmask); - if (bytes > avail) - bytes = avail; - head = (void *)(slrouter->bbuf.buf + - (sldata->bindex & slrouter->bbuf.bufmask)); - /* - * Break into packets if necessary, else just write - * it all in one fell swoop. 
- */ - aiov.iov_base = (void *)head; - aiov.iov_len = bytes; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = 0; - auio.uio_resid = bytes; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_td = curthread; - if (sldata->flags & SLIF_PACKET) { - if (head->sm_bytes < SL_MIN_MESSAGE_SIZE) { - kprintf("syslink_msg too small, terminating\n"); - break; - } - if (head->sm_bytes > bytes) { - kprintf("syslink_msg not FIFO aligned, terminating\n"); - break; - } - bytes = SLMSG_ALIGN(head->sm_bytes); - so_pru_sosend(so, sa, &auio, NULL, NULL, 0, curthread); - } else { - so_pru_sosend(so, sa, &auio, NULL, NULL, 0, curthread); - } - continue; - } - - /* - * Deal with mbuf records waiting to be output - */ - if (sldata->siow.sb_mb != NULL) { - - } - - /* - * Block waiting for something to do. - */ - tsleep(&sldata->wthread, 0, "wait", 0); - } + /* + * Kinda messy. Figure out the non-blocking state + */ + if (flags & O_FBLOCKING) + nbio = 0; + else if (flags & O_FNONBLOCKING) + nbio = 1; + else if (fp->f_flag & O_NONBLOCK) + nbio = 1; + else + nbio = 0; + /* + * Validate the uio + */ + if (uio->uio_iovcnt < 1) { error = 0; - for (;;) { - int aligned_reclen; - int used; - int count; + goto done2; + } + iov0 = &uio->uio_iov[0]; + if (iov0->iov_len > SLMSG_BIG) { + error = EFBIG; + goto done2; + } - used = slbuf->windex - slbuf->rindex; - if (used < SL_MIN_MESSAGE_SIZE) - break; + /* + * Handle the buffer-full case. slpeer cmdbytes is managed + * by the backend function, not us so if the callback just + * directly implements the message and never adjusts cmdbytes, + * we will never sleep here. + */ + if (sl->flags & SLF_WSHUTDOWN) { + error = EPIPE; + goto done2; + } - head = (void *)(slbuf->buf + - (slbuf->rindex & slbuf->bufmask)); - if (head->sm_bytes < SL_MIN_MESSAGE_SIZE) { - error = EINVAL; - break; - } - aligned_reclen = SLMSG_ALIGN(head->sm_bytes); + /* + * Only commands can block the pipe, not replies. 
Otherwise a + * deadlock is possible. + */ + error = copyin(iov0->iov_base, &sltmp, sizeof(sltmp)); + if (error) + goto done2; + if ((proto = sltmp.sm_proto) & SM_PROTO_ENDIAN_REV) + proto = bswap16(proto); + error = sl->peer->backend_wblocked(sl->peer, nbio, proto); + if (error) + goto done2; - /* - * Disallow wraps - */ - if ((slbuf->rindex & slbuf->bufmask) > - ((slbuf->rindex + aligned_reclen) & slbuf->bufmask) - ) { - error = EINVAL; - break; - } + /* + * Allocate a slmsg and load the message. Note that the bytes + * returned to userland only reflects the primary syslink message + * and does not include any DMA buffers. + */ + if (iov0->iov_len <= SLMSG_SMALL) + slmsg = objcache_get(sl_objcache_small, M_WAITOK); + else + slmsg = objcache_get(sl_objcache_big, M_WAITOK); + slmsg->msgsize = iov0->iov_len; - /* - * Insufficient data read - */ - if (used < aligned_reclen) - break; + error = uiomove((void *)slmsg->msg, iov0->iov_len, uio); + if (error) + goto done1; + error = syslink_validate_msg(slmsg->msg, slmsg->msgsize); + if (error) + goto done1; - /* - * Write it out whether it is PAD or not. - * XXX re-PAD for output here. - */ - error = fp_write(sldata->xfp, head, - aligned_reclen, - &count, - UIO_SYSSPACE); - if (error && error != ENOBUFS) - break; - if (count != aligned_reclen) { - error = EIO; - break; - } - slbuf->rindex += aligned_reclen; + /* + * Replies have to be matched up against received commands. 
+ */ + if (slmsg->msg->sm_proto & SM_PROTO_REPLY) { + spin_lock_wr(&sl->spin); + slcmd = slmsg_rb_tree_RB_LOOKUP(&sl->reply_rb_root, + slmsg->msg->sm_msgid); + if (slcmd == NULL || (slcmd->flags & SLMSGF_ONINQ)) { + error = ENOENT; + spin_unlock_wr(&sl->spin); + goto done1; } - if (error) - break; - tsleep(slbuf, 0, "fifowt", 0); + RB_REMOVE(slmsg_rb_tree, &sl->reply_rb_root, slcmd); + sl->cmdbytes -= slcmd->maxsize; + spin_unlock_wr(&sl->spin); + sl->peer->backend_reply(sl->peer, slcmd, slmsg); + if (sl->wblocked && sl->cmdbytes < syslink_bufsize) { + sl->wblocked = 0; /* MP race ok here */ + wakeup(&sl->wblocked); + } + /* error is 0 */ + } else { + error = sl->peer->backend_write(sl->peer, slmsg); } -#endif - sldata->flags |= SLIF_WDONE; - sldata_rels(sldata); +done1: + if (error) + objcache_put(slmsg->oc, slmsg); +done2: + return(error); } +/* + * Close a syslink descriptor. + * + * Disassociate the syslink from the file descriptor and disconnect from + * any peer. + */ static -void -syslink_wthread_fp(void *arg) +int +slfileop_close(struct file *fp) { - struct sldata *sldata = arg; + struct sldesc *sl; - sldata->flags |= SLIF_WDONE; - sldata_rels(sldata); + /* + * Disassociate the file pointer. Take ownership of the ref on the + * sldesc. + */ + sl = fp->f_data; + fp->f_data = NULL; + fp->f_ops = &badfileops; + sl->xfp = NULL; + + /* + * Shutdown both directions. The other side will not issue API + * calls to us after we've shutdown both directions. 
+ */ + shutdownsldesc(sl, SHUT_RD|SHUT_WR); + + /* + * Cleanup + */ + KKASSERT(sl->cmdbytes == 0); + KKASSERT(sl->repbytes == 0); + sldrop(sl); + return(0); } static -void -slbuf_alloc(struct slbuf *slbuf, int bytes) +int +slfileop_stat (struct file *fp, struct stat *sb, struct ucred *cred) { - bzero(slbuf, sizeof(*slbuf)); - slbuf->buf = kmalloc(bytes, M_SYSLINK, M_WAITOK); - slbuf->bufsize = bytes; - slbuf->bufmask = bytes - 1; + return(EINVAL); } static -void -slbuf_free(struct slbuf *slbuf) +int +slfileop_shutdown (struct file *fp, int how) { - kfree(slbuf->buf, M_SYSLINK); - slbuf->buf = NULL; + shutdownsldesc((struct sldesc *)fp->f_data, how); + return(0); } static -void -sldata_rels(struct sldata *sldata) +int +slfileop_ioctl (struct file *fp, u_long cmd, caddr_t data, struct ucred *cred) { - struct slrouter *slrouter; - - if (--sldata->refs == 0) { - slrouter = sldata->router; - KKASSERT(slrouter != NULL); - ++slrouter->refs; - RB_REMOVE(sldata_rb_tree, - &sldata->router->sldata_rb_root, sldata); - sldata->router = NULL; - kfree(sldata, M_SYSLINK); - slrouter_rels(slrouter); - } + return(EINVAL); } static -void -slrouter_rels(struct slrouter *slrouter) +int +slfileop_poll (struct file *fp, int events, struct ucred *cred) { - if (--slrouter->refs == 0 && RB_EMPTY(&slrouter->sldata_rb_root)) { - KKASSERT(slrouter->flags & SLIF_DESTROYED); - RB_REMOVE(slrouter_rb_tree, &slrouter_rb_root, slrouter); - alist_destroy(slrouter->bitmap, M_SYSLINK); - slrouter->bitmap = NULL; - slbuf_free(&slrouter->bbuf); - kfree(slrouter, M_SYSLINK); - } + return(0); } -/* - * A switched ethernet socket connected to a syslink router node may - * represent an entire subnet. We need to generate a subnet id from - * the originating IP address which the caller can then incorporate into - * the base linkid assigned to the connection to form the actual linkid - * originating the message. 
- */ static int -syslink_getsubnet(struct sockaddr *sa) +slfileop_kqfilter(struct file *fp, struct knote *kn) { - struct in_addr *i4; - struct in6_addr *i6; - int linkid; - - switch(sa->sa_family) { - case AF_INET: - i4 = &((struct sockaddr_in *)sa)->sin_addr; - linkid = (int)ntohl(i4->s_addr); - break; - case AF_INET6: - i6 = &((struct sockaddr_in6 *)sa)->sin6_addr; - linkid = (int)ntohl(i6->s6_addr32[0]); /* XXX */ - break; - default: - linkid = 0; - break; - } - return(linkid); + return(0); } -/* - * fileops for an established syslink when the kernel is asked to create a - * descriptor (verses one being handed to it). No threads are created in - * this case. - */ - -/* - * Transfer zero or more messages from the kernel to userland. Only complete - * messages are returned. If the uio has insufficient space then EMSGSIZE - * is returned. The kernel feeds messages to wbuf so we use wlock (structures - * are relative to the kernel). +/************************************************************************ + * MESSAGE VALIDATION * + ************************************************************************ + * + * Validate that the syslink message. Check that all headers and elements + * conform. Correct the endian if necessary. + * + * NOTE: If reverse endian needs to be corrected, SE_CMDF_UNTRANSLATED + * is recursively flipped on all syslink_elm's in the message. As the + * message traverses the mesh, multiple flips may occur. It is + * up to the RPC protocol layer to correct opaque data payloads and + * SE_CMDF_UNTRANSLATED prevents the protocol layer from misinterpreting + * a command or reply element which has not been endian-corrected. 
*/ static int -syslink_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags) +syslink_validate_msg(struct syslink_msg *msg, int bytes) { - struct sldata *sldata = fp->f_data; -#if 0 - struct syslink_msg *head; - int bytes; - int contig; -#endif + int aligned_reclen; + int swapit; int error; - int nbio; - if (flags & O_FBLOCKING) - nbio = 0; - else if (flags & O_FNONBLOCKING) - nbio = 1; - else if (fp->f_flag & O_NONBLOCK) - nbio = 1; - else - nbio = 0; - - lockmgr(&sldata->wlock, LK_EXCLUSIVE | LK_RETRY); - error = 0; - -#if 0 /* - * Calculate the number of bytes we can transfer in one shot. Transfers - * do not wrap the FIFO. + * The raw message must be properly-aligned. */ - contig = slbuf->bufsize - (slbuf->rindex & slbuf->bufmask); - for (;;) { - bytes = slbuf->windex - slbuf->rindex; - if (bytes) - break; - if (sldata->flags & SLIF_RDONE) { - error = EIO; - break; + if (bytes & SL_ALIGNMASK) + return (EINVAL); + + while (bytes) { + /* + * The message must at least contain the msgid, bytes, and + * protoid. + */ + if (bytes < SL_MIN_PAD_SIZE) + return (EINVAL); + + /* + * Fix the endian if it is reversed. + */ + if (msg->sm_proto & SM_PROTO_ENDIAN_REV) { + msg->sm_msgid = bswap64(msg->sm_msgid); + msg->sm_sessid = bswap64(msg->sm_sessid); + msg->sm_bytes = bswap16(msg->sm_bytes); + msg->sm_proto = bswap16(msg->sm_proto); + msg->sm_rlabel = bswap32(msg->sm_rlabel); + if (msg->sm_proto & SM_PROTO_ENDIAN_REV) + return (EINVAL); + swapit = 1; + } else { + swapit = 0; } - if (nbio) { - error = EAGAIN; - goto done; + + /* + * Validate the contents. For PADs, the entire payload is + * ignored and the minimum message size can be as small as + * 8 bytes. 
+ */ + if (msg->sm_proto == SMPROTO_PAD) { + if (msg->sm_bytes < SL_MIN_PAD_SIZE || + msg->sm_bytes > bytes) { + return (EINVAL); + } + /* ignore the entire payload, it can be garbage */ + } else { + if (msg->sm_bytes < SL_MIN_MSG_SIZE || + msg->sm_bytes > bytes) { + return (EINVAL); + } + error = syslink_validate_elm( + &msg->sm_head, + msg->sm_bytes - + offsetof(struct syslink_msg, + sm_head), + swapit, SL_MAXDEPTH); + if (error) + return (error); } - tsleep(slbuf, 0, "fiford", 0); + + /* + * The aligned payload size must be used to locate the + * next syslink_msg in the buffer. + */ + aligned_reclen = SL_MSG_ALIGN(msg->sm_bytes); + bytes -= aligned_reclen; + msg = (void *)((char *)msg + aligned_reclen); } - if (bytes > contig) - bytes = contig; + return(0); +} + +static +int +syslink_validate_elm(struct syslink_elm *elm, sl_reclen_t bytes, + int swapit, int depth) +{ + int aligned_reclen; /* - * The uio must be able to accomodate the transfer. + * If the buffer isn't big enough to fit the header, stop now! */ - if (uio->uio_resid < bytes) { - error = ENOSPC; - goto done; + if (bytes < SL_MIN_ELM_SIZE) + return (EINVAL); + /* + * All syslink_elm headers are recursively endian-adjusted. Opaque + * data payloads are not. + */ + if (swapit) { + elm->se_cmd = bswap16(elm->se_cmd) ^ SE_CMDF_UNTRANSLATED; + elm->se_bytes = bswap16(elm->se_bytes); + elm->se_aux = bswap32(elm->se_aux); } /* - * Copy the data to userland and update rindex. + * Check element size requirements. */ - head = (void *)(slbuf->buf + (slbuf->rindex & slbuf->bufmask)); - error = uiomove((caddr_t)head, bytes, uio); - if (error == 0) - slbuf->rindex += bytes; + if (elm->se_bytes < SL_MIN_ELM_SIZE || elm->se_bytes > bytes) + return (EINVAL); /* - * Cleanup + * Recursively check structured payloads. A structured payload may + * contain as few as 0 recursive elements. 
*/ -done: -#endif - lockmgr(&sldata->wlock, LK_RELEASE); - return (error); + if (elm->se_cmd & SE_CMDF_STRUCTURED) { + if (depth == 0) + return (EINVAL); + bytes -= SL_MIN_ELM_SIZE; + ++elm; + while (bytes > 0) { + if (syslink_validate_elm(elm, bytes, swapit, depth - 1)) + return (EINVAL); + aligned_reclen = SL_MSG_ALIGN(elm->se_bytes); + elm = (void *)((char *)elm + aligned_reclen); + bytes -= aligned_reclen; + } + } + return(0); } +/************************************************************************ + * BACKEND FUNCTIONS - USER DESCRIPTOR * + ************************************************************************ + * + * Peer backend links are primarily used when userland creates a pair + * of linked descriptors. + */ + /* - * Transfer zero or more messages from userland to the kernel. Only complete - * messages may be written. The kernel processes from rbuf so that is where - * we have to copy the messages. + * Do any required blocking / nbio handling for attempts to write to + * a sldesc associated with a user descriptor. */ static int -syslink_write (struct file *fp, struct uio *uio, struct ucred *cred, int flags) +backend_wblocked_user(struct sldesc *sl, int nbio, sl_proto_t proto) { - struct sldata *sldata = fp->f_data; -#if 0 - struct slbuf *slbuf = &sldata->rbuf; - struct syslink_msg *head; - int bytes; - int contig; -#endif - int nbio; - int error; - - if (flags & O_FBLOCKING) - nbio = 0; - else if (flags & O_FNONBLOCKING) - nbio = 1; - else if (fp->f_flag & O_NONBLOCK) - nbio = 1; - else - nbio = 0; - - lockmgr(&sldata->rlock, LK_EXCLUSIVE | LK_RETRY); - error = 0; - -#if 0 - /* - * Calculate the maximum number of contiguous bytes that may be - * available. Caller is required to not wrap our FIFO. - */ - contig = slbuf->bufsize - (slbuf->windex & slbuf->bufmask); - if (uio->uio_resid > contig) { - error = ENOSPC; - goto done; - } + int error = 0; + int *bytesp = (proto & SM_PROTO_REPLY) ? 
&sl->repbytes : &sl->cmdbytes; /* - * Truncate based on actual unused space available in the FIFO. If - * the uio does not fit, block and loop. + * Block until sufficient data is drained by the target. It is + * ok to have a MP race against cmdbytes. */ - for (;;) { - bytes = slbuf->bufsize - (slbuf->windex - slbuf->rindex); - if (bytes > contig) - bytes = contig; - if (uio->uio_resid <= bytes) - break; - if (sldata->flags & SLIF_RDONE) { - error = EIO; - goto done; - } - if (nbio) { - error = EAGAIN; - goto done; + if (*bytesp >= syslink_bufsize) { + spin_lock_wr(&sl->spin); + while (*bytesp >= syslink_bufsize) { + if (sl->flags & SLF_WSHUTDOWN) { + error = EPIPE; + break; + } + if (nbio) { + error = EAGAIN; + break; + } + ++sl->wblocked; + error = msleep(&sl->wblocked, &sl->spin, + PCATCH, "slwmsg", 0); + if (error) + break; } - tsleep(slbuf, 0, "fifowr", 0); - } - bytes = uio->uio_resid; - head = (void *)(slbuf->buf + (slbuf->windex & slbuf->bufmask)); - error = uiomove((caddr_t)head, bytes, uio); - if (error == 0) - error = syslink_validate(head, bytes); - if (error == 0) { - slbuf->windex += bytes; - wakeup(slbuf); + spin_unlock_wr(&sl->spin); } -done: -#endif - lockmgr(&sldata->rlock, LK_RELEASE); - return(error); + return (error); } +/* + * Unconditionally write a syslink message to the sldesc associated with + * a user descriptor. Command messages are also placed in a red-black + * tree so their DMA tag (if any) can be accessed and so they can be + * linked to any reply message. 
+ */ static int -syslink_close (struct file *fp) +backend_write_user(struct sldesc *sl, struct slmsg *slmsg) { - struct sldata *sldata; + int error; - sldata = fp->f_data; - if ((sldata->flags & SLIF_RQUIT) == 0) { - sldata->flags |= SLIF_RQUIT; - wakeup(&sldata->rthread); - } - if ((sldata->flags & SLIF_WQUIT) == 0) { - sldata->flags |= SLIF_WQUIT; - wakeup(&sldata->wthread); + spin_lock_wr(&sl->spin); + if (sl->flags & SLF_RSHUTDOWN) { + /* + * Not accepting new messages + */ + error = EPIPE; + } else if (slmsg->msg->sm_proto & SM_PROTO_REPLY) { + /* + * Write a reply + */ + TAILQ_INSERT_TAIL(&sl->inq, slmsg, tqnode); + sl->repbytes += slmsg->maxsize; + slmsg->flags |= SLMSGF_ONINQ; + error = 0; + } else if (RB_INSERT(slmsg_rb_tree, &sl->reply_rb_root, slmsg)) { + /* + * Write a command, but there was a msgid collision when + * we tried to insert it into the RB tree. + */ + error = EEXIST; + } else { + /* + * Write a command, successful insertion into the RB tree. + */ + TAILQ_INSERT_TAIL(&sl->inq, slmsg, tqnode); + sl->cmdbytes += slmsg->maxsize; + slmsg->flags |= SLMSGF_ONINQ; + error = 0; } - fp->f_data = NULL; - sldata_rels(sldata); - return(0); + spin_unlock_wr(&sl->spin); + if (sl->rwaiters) + wakeup(&sl->rwaiters); + return(error); } +/* + * Our peer is replying a command we previously sent it back to us, along + * with the reply message (if not NULL). We just queue the reply to + * userland and free of the command. 
+ */ static -int -syslink_stat (struct file *fp, struct stat *sb, struct ucred *cred) +void +backend_reply_user(struct sldesc *sl, struct slmsg *slcmd, struct slmsg *slrep) { - return(EINVAL); + int error; + + objcache_put(slcmd->oc, slcmd); + if (slrep) { + spin_lock_wr(&sl->spin); + if ((sl->flags & SLF_RSHUTDOWN) == 0) { + TAILQ_INSERT_TAIL(&sl->inq, slrep, tqnode); + sl->repbytes += slrep->maxsize; + error = 0; + } else { + error = EPIPE; + } + spin_unlock_wr(&sl->spin); + if (error) + sl->peer->backend_dispose(sl->peer, slrep); + else if (sl->rwaiters) + wakeup(&sl->rwaiters); + } } static -int -syslink_shutdown (struct file *fp, int how) +void +backend_dispose_user(struct sldesc *sl, struct slmsg *slmsg) { - return(EINVAL); + objcache_put(slmsg->oc, slmsg); } -static +/************************************************************************ + * KERNEL DRIVER OR FILESYSTEM API * + ************************************************************************ + * + */ + +/* + * Create a user<->kernel link, returning the user descriptor in *fdp + * and the kernel descriptor in *kslp. 0 is returned on success, and an + * error code is returned on failure. 
+ */ int -syslink_ioctl (struct file *fp, u_long cmd, caddr_t data, struct ucred *cred) +syslink_ukbackend(int *fdp, struct sldesc **kslp) { - return(EINVAL); + struct proc *p = curproc; + struct file *fp; + struct sldesc *usl; + struct sldesc *ksl; + int error; + int fd; + + *fdp = -1; + *kslp = NULL; + + error = falloc(p, &fp, &fd); + if (error) + return(error); + usl = allocsldesc(NULL); + usl->backend_wblocked = backend_wblocked_user; + usl->backend_write = backend_write_user; + usl->backend_reply = backend_reply_user; + usl->backend_dispose = backend_dispose_user; + + ksl = allocsldesc(usl->common); + ksl->peer = usl; + ksl->backend_wblocked = backend_wblocked_kern; + ksl->backend_write = backend_write_kern; + ksl->backend_reply = backend_reply_kern; + ksl->backend_dispose = backend_dispose_kern; + + usl->peer = ksl; + + setsldescfp(usl, fp); + fsetfd(p, fp, fd); + fdrop(fp); + + *fdp = fd; + *kslp = ksl; + return(0); } -static +/* + * Assign a unique message id, issue a syslink message to userland, + * and wait for a reply. + */ int -syslink_poll (struct file *fp, int events, struct ucred *cred) +syslink_kdomsg(struct sldesc *ksl, struct syslink_msg *msg, + struct bio *bio, int flags) { - return(0); + struct slmsg slmsg; + int error; + + /* + * Finish initializing slmsg and post it to the red-black tree for + * reply matching. If the message id is already in use we return + * EEXIST, giving the originator the chance to roll a new msgid. + */ + bzero(&slmsg, sizeof(slmsg)); + slmsg.msg = msg; + slmsg.msgsize = msg->sm_bytes; + slmsg.bio = bio; + if ((error = syslink_validate_msg(slmsg.msg, slmsg.msgsize)) != 0) + return (error); + msg->sm_msgid = allocsysid(); + + /* + * Issue the request and wait for a matching reply or failure, + * then remove the message from the matching tree and return. 
+ */ + error = ksl->peer->backend_write(ksl->peer, &slmsg); + spin_lock_wr(&ksl->spin); + if (error == 0) { + while (slmsg.rep == NULL) { + error = msleep(&slmsg, &ksl->spin, flags, "kwtmsg", 0); + /* XXX ignore error for now */ + } + if (slmsg.rep == (struct slmsg *)-1) { + error = EIO; + } else { + error = slmsg.rep->msg->sm_head.se_aux; + kprintf("reply with error %d\n", error); + ksl->peer->backend_dispose(ksl->peer, slmsg.rep); + } + } + spin_unlock_wr(&ksl->spin); + return(error); } -static -int -syslink_kqfilter(struct file *fp, struct knote *kn) +void +syslink_kshutdown(struct sldesc *ksl, int how) { - return(0); + shutdownsldesc(ksl, how); +} + +void +syslink_kclose(struct sldesc *ksl) +{ + shutdownsldesc(ksl, SHUT_RD|SHUT_WR); + sldrop(ksl); } +/************************************************************************ + * BACKEND FUNCTIONS FOR KERNEL API * + ************************************************************************ + * + * These are the backend functions for a sldesc associated with a kernel + * API. + */ + /* - * This routine is called from a route node's reader thread to process a - * syslink message once it has been completely read and its size validated. + * Our peer wants to write a syslink message to us and is asking us to + * block if our input queue is full. We don't implement command reception + * so don't block right now. */ static int -process_syslink_msg(struct sldata *sldata, struct syslink_msg *head) +backend_wblocked_kern(struct sldesc *ksl, int nbio, sl_proto_t proto) { - kprintf("process syslink msg %08x\n", head->sm_cmd); + /* never blocks */ return(0); } /* - * Validate that the syslink message header(s) are correctly sized. + * Our peer is writing a request to the kernel. At the moment we do not + * accept commands. 
*/ static int -syslink_validate(struct syslink_msg *head, int bytes) +backend_write_kern(struct sldesc *ksl, struct slmsg *slmsg) { - const int min_msg_size = SL_MIN_MESSAGE_SIZE; - int aligned_reclen; - - while (bytes) { - /* - * Message size and alignment - */ - if (bytes < min_msg_size) - return (EINVAL); - if (bytes & SL_ALIGNMASK) - return (EINVAL); - if (head->sm_cmd && bytes < sizeof(struct syslink_msg)) - return (EINVAL); + return(EOPNOTSUPP); +} - /* - * Buffer must contain entire record - */ - aligned_reclen = SLMSG_ALIGN(head->sm_bytes); - if (bytes < aligned_reclen) - return (EINVAL); - bytes -= aligned_reclen; - head = (void *)((char *)head + aligned_reclen); +/* + * Our peer wants to reply to a syslink message we sent it earlier. The + * original command (that we passed to our peer), and the peer's reply + * is specified. If the peer has failed slrep will be NULL. + */ +static +void +backend_reply_kern(struct sldesc *ksl, struct slmsg *slcmd, struct slmsg *slrep) +{ + spin_lock_wr(&ksl->spin); + if (slrep == NULL) { + slcmd->rep = (struct slmsg *)-1; + } else { + slcmd->rep = slrep; } - return(0); + spin_unlock_wr(&ksl->spin); + wakeup(slcmd); +} + +/* + * Any reply messages we sent to our peer are returned to us for disposal. + * Since we do not currently accept commands from our peer, there will not + * be any replies returned to the peer to dispose of. + */ +static +void +backend_dispose_kern(struct sldesc *ksl, struct slmsg *slmsg) +{ + panic("backend_dispose_kern: kernel can't accept commands so it " + "certainly did not reply to one!"); }