File: [DragonFly] src/sys/kern/sys_pipe.c
Revision 1.19
Sun May 2 07:57:45 2004 UTC by dillon
Branches: MAIN
CVS tags: HEAD
We must pmap_qremove() pages that we previously pmap_qenter()'d before
we can safely call kmem_free().  This corrects a serious corruption issue
that occurred when using PIPE algorithms other than the default.

The default SFBUF algorithm was not affected by this bug.
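
A minimal sketch (kernel context; "pages" and "npages" are hypothetical
stand-ins for the XIO page array) of the teardown ordering this revision
enforces, mirroring the calls in pipe_clone_write_buffer() and pipeclose()
below:

	vm_offset_t kva;

	/* map the sender's wired pages into a kva window */
	kva = kmem_alloc_pageable(kernel_map, XIO_INTERNAL_SIZE);
	pmap_qenter(kva, pages, npages);

	/* ... the reader copies data out of the window ... */

	/* unmap BEFORE freeing the window, or stale mappings remain */
	pmap_qremove(kva, XIO_INTERNAL_PAGES);
	kmem_free(kernel_map, kva, XIO_INTERNAL_SIZE);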

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.60.2.13 2002/08/05 15:05:15 des Exp $
 * $DragonFly: src/sys/kern/sys_pipe.c,v 1.19 2004/05/02 07:57:45 dillon Exp $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation: a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
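
/*
 * Editor's illustration (hypothetical userland test program, not part of
 * this file and excluded from the build): a blocking write of at least
 * PIPE_MINDIRECT bytes is the candidate for the direct path described
 * above; whether it actually engages also depends on the kern.pipe
 * sysctls sampled at pipe creation.
 */
#if 0
#include <string.h>
#include <unistd.h>

int
main(void)
{
        int fds[2];
        static char big[256 * 1024];    /* assumed >= PIPE_MINDIRECT */
        char sink[64 * 1024];
        ssize_t n;

        if (pipe(fds) != 0)
                return (1);
        if (fork() == 0) {              /* child: drain the pipe */
                close(fds[1]);
                while ((n = read(fds[0], sink, sizeof(sink))) > 0)
                        ;
                _exit(0);
        }
        close(fds[0]);
        memset(big, 'x', sizeof(big));
        (void)write(fds[1], big, sizeof(big)); /* direct-write candidate */
        close(fds[1]);
        return (0);
}
#endif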

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>
#include <sys/globaldata.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

#include <sys/file2.h>

#include <machine/cpufunc.h>

/*
 * interfaces to the outside world
 */
static int pipe_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct thread *td);
static int pipe_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct thread *td);
static int pipe_close (struct file *fp, struct thread *td);
static int pipe_poll (struct file *fp, int events, struct ucred *cred,
		struct thread *td);
static int pipe_kqfilter (struct file *fp, struct knote *kn);
static int pipe_stat (struct file *fp, struct stat *sb, struct thread *td);
static int pipe_ioctl (struct file *fp, u_long cmd, caddr_t data, struct thread *td);

static struct fileops pipeops = {
	NULL,	/* port */
	0,	/* autoq */
	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
	pipe_stat, pipe_close
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

MALLOC_DEFINE(M_PIPE, "pipe", "pipe structures");

/*
 * Default pipe buffer size(s); this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general.
 */
#define LIMITPIPEKVA (16*1024*1024)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
#define PIPEQ_MAX_CACHE 16      /* per-cpu pipe structure cache */

static int pipe_maxbig = LIMITBIGPIPES;
static int pipe_maxcache = PIPEQ_MAX_CACHE;
static int pipe_nbig;
static int pipe_bcache_alloc;
static int pipe_bkmem_alloc;
static int pipe_dwrite_enable = 1;	/* 0:copy, 1:kmem/sfbuf 2:force */
static int pipe_dwrite_sfbuf = 1;	/* 0:kmem_map 1:sfbufs 2:sfbufs_dmap */
					/* 3:sfbuf_dmap w/ forced invlpg */

SYSCTL_NODE(_kern, OID_AUTO, pipe, CTLFLAG_RW, 0, "Pipe operation");
SYSCTL_INT(_kern_pipe, OID_AUTO, nbig,
        CTLFLAG_RD, &pipe_nbig, 0, "number of big pipes allocated");
SYSCTL_INT(_kern_pipe, OID_AUTO, maxcache,
        CTLFLAG_RW, &pipe_maxcache, 0, "max pipes cached per-cpu");
SYSCTL_INT(_kern_pipe, OID_AUTO, maxbig,
        CTLFLAG_RW, &pipe_maxbig, 0, "max number of big pipes");
SYSCTL_INT(_kern_pipe, OID_AUTO, dwrite_enable,
        CTLFLAG_RW, &pipe_dwrite_enable, 0, "1:enable/2:force direct writes");
SYSCTL_INT(_kern_pipe, OID_AUTO, dwrite_sfbuf,
        CTLFLAG_RW, &pipe_dwrite_sfbuf, 0, "(if dwrite_enable) 0:kmem 1:sfbuf 2:sfbuf_dmap 3:sfbuf_dmap_forceinvlpg");
#if !defined(NO_PIPE_SYSCTL_STATS)
SYSCTL_INT(_kern_pipe, OID_AUTO, bcache_alloc,
        CTLFLAG_RW, &pipe_bcache_alloc, 0, "pipe buffer from pcpu cache");
SYSCTL_INT(_kern_pipe, OID_AUTO, bkmem_alloc,
        CTLFLAG_RW, &pipe_bkmem_alloc, 0, "pipe buffer from kmem");
#endif
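
/*
 * Editor's note: the sysctls above are runtime-tunable; a hypothetical
 * shell session might look like:
 *
 *	sysctl kern.pipe.dwrite_enable=2	(force direct writes)
 *	sysctl kern.pipe.dwrite_sfbuf=1		(default SFBUF mode)
 *
 * The values are sampled when a pipe is created (see pipe() below), so a
 * change affects only pipes created afterwards.
 */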

static void pipeclose (struct pipe *cpipe);
static void pipe_free_kmem (struct pipe *cpipe);
static int pipe_create (struct pipe **cpipep);
static __inline int pipelock (struct pipe *cpipe, int catch);
static __inline void pipeunlock (struct pipe *cpipe);
static __inline void pipeselwakeup (struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer (struct pipe *wpipe, struct uio *uio);
static int pipe_direct_write (struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer (struct pipe *wpipe);
#endif
static int pipespace (struct pipe *cpipe, int size);

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 *
 * pipe_args(int dummy)
 */

/* ARGSUSED */
int
pipe(struct pipe_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd1, fd2, error;

	KKASSERT(p);
	fdp = p->p_fd;

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	/*
	 * Select the direct-map features to use for this pipe.  Since the
	 * sysctls can change on the fly we record the settings when the
	 * pipe is created.
	 *
	 * Generally speaking the system will default to what we consider
	 * to be the best-balanced and most stable option.  Right now this
	 * is SFBUF1.  Modes 2 and 3 are considered experimental at the
	 * moment.
	 */
	wpipe->pipe_feature = PIPE_COPY;
	if (pipe_dwrite_enable) {
		switch(pipe_dwrite_sfbuf) {
		case 0:
			wpipe->pipe_feature = PIPE_KMEM;
			break;
		case 1:
			wpipe->pipe_feature = PIPE_SFBUF1;
			break;
		case 2:
		case 3:
			wpipe->pipe_feature = PIPE_SFBUF2;
			break;
		}
	}
	rpipe->pipe_feature = wpipe->pipe_feature;

	error = falloc(p, &rf, &fd1);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	fhold(rf);
	uap->sysmsg_fds[0] = fd1;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd2);
	if (error) {
		if (fdp->fd_ofiles[fd1] == rf) {
			fdp->fd_ofiles[fd1] = NULL;
			fdrop(rf, td);
		}
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	uap->sysmsg_fds[1] = fd2;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	fdrop(rf, td);

	return (0);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails
 * it will retain the old buffer and return ENOMEM.
 */
static int
pipespace(struct pipe *cpipe, int size)
{
	struct vm_object *object;
	caddr_t buffer;
	int npages, error;

	npages = round_page(size) / PAGE_SIZE;
	object = cpipe->pipe_buffer.object;

	/*
	 * [re]create the object if necessary and reserve space for it
	 * in the kernel_map.  The object and memory are pageable.  On
	 * success, free the old resources before assigning the new
	 * ones.
	 */
	if (object == NULL || object->size != npages) {
		object = vm_object_allocate(OBJT_DEFAULT, npages);
		buffer = (caddr_t) vm_map_min(kernel_map);

		error = vm_map_find(kernel_map, object, 0,
			(vm_offset_t *) &buffer, size, 1,
			VM_PROT_ALL, VM_PROT_ALL, 0);

		if (error != KERN_SUCCESS) {
			vm_object_deallocate(object);
			return (ENOMEM);
		}
		pipe_free_kmem(cpipe);
		cpipe->pipe_buffer.object = object;
		cpipe->pipe_buffer.buffer = buffer;
		cpipe->pipe_buffer.size = size;
		++pipe_bkmem_alloc;
	} else {
		++pipe_bcache_alloc;
	}
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	return (0);
}

/*
 * Initialize and allocate VM and memory for pipe, pulling the pipe from
 * our per-cpu cache if possible.  For now make sure it is sized for the
 * smaller PIPE_SIZE default.
 */
static int
pipe_create(struct pipe **cpipep)
{
	globaldata_t gd = mycpu;
	struct pipe *cpipe;
	int error;

	if ((cpipe = gd->gd_pipeq) != NULL) {
		gd->gd_pipeq = cpipe->pipe_peer;
		--gd->gd_pipeqcount;
		cpipe->pipe_peer = NULL;
	} else {
		cpipe = malloc(sizeof(struct pipe), M_PIPE, M_WAITOK|M_ZERO);
	}
	*cpipep = cpipe;
	if ((error = pipespace(cpipe, PIPE_SIZE)) != 0)
		return (error);
	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
	return (0);
}


/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(struct pipe *cpipe, int catch)
{
	int error;

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, (catch ? PCATCH : 0), "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(struct pipe *cpipe)
{
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(struct pipe *cpipe)
{
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
	KNOTE(&cpipe->pipe_sel.si_note, 0);
}

/* ARGSUSED */
static int
pipe_read(struct file *fp, struct uio *uio, struct ucred *cred,
	int flags, struct thread *td)
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		caddr_t va;

		if (rpipe->pipe_buffer.cnt > 0) {
			/*
			 * normal pipe buffer receive
			 */
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		} else if (rpipe->pipe_kva &&
			   rpipe->pipe_feature == PIPE_KMEM &&
			   (rpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP))
			       == PIPE_DIRECTW
		) {
			/*
			 * Direct copy using source-side kva mapping
			 */
			size = rpipe->pipe_map.xio_bytes;
			if (size > (u_int)uio->uio_resid)
				size = (u_int)uio->uio_resid;
			va = (caddr_t)rpipe->pipe_kva + rpipe->pipe_map.xio_offset;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.xio_offset += size;
			rpipe->pipe_map.xio_bytes -= size;
			if (rpipe->pipe_map.xio_bytes == 0) {
				rpipe->pipe_state |= PIPE_DIRECTIP;
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
		} else if (rpipe->pipe_map.xio_bytes &&
			   rpipe->pipe_kva &&
			   rpipe->pipe_feature == PIPE_SFBUF2 &&
			   (rpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP))
			       == PIPE_DIRECTW
		) {
			/*
			 * Direct copy, bypassing a kernel buffer.  We cannot
			 * mess with the direct-write buffer until
			 * PIPE_DIRECTIP is cleared.  In order to prevent
			 * the pipe_write code from racing itself in
			 * direct_write, we set DIRECTIP when we clear
			 * DIRECTW after we have exhausted the buffer.
			 */
			if (pipe_dwrite_sfbuf == 3)
				rpipe->pipe_kvamask = 0;
			pmap_qenter2(rpipe->pipe_kva, rpipe->pipe_map.xio_pages,
				    rpipe->pipe_map.xio_npages,
				    &rpipe->pipe_kvamask);
			size = rpipe->pipe_map.xio_bytes;
			if (size > (u_int)uio->uio_resid)
				size = (u_int)uio->uio_resid;
			va = (caddr_t)rpipe->pipe_kva +
				rpipe->pipe_map.xio_offset;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.xio_offset += size;
			rpipe->pipe_map.xio_bytes -= size;
			if (rpipe->pipe_map.xio_bytes == 0) {
				rpipe->pipe_state |= PIPE_DIRECTIP;
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
		} else if (rpipe->pipe_map.xio_bytes &&
			   rpipe->pipe_feature == PIPE_SFBUF1 &&
			   (rpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP))
				== PIPE_DIRECTW
		) {
			/*
			 * Direct copy, bypassing a kernel buffer.  We cannot
			 * mess with the direct-write buffer until
			 * PIPE_DIRECTIP is cleared.  In order to prevent
			 * the pipe_write code from racing itself in
			 * direct_write, we set DIRECTIP when we clear
			 * DIRECTW after we have exhausted the buffer.
			 */
			error = xio_uio_copy(&rpipe->pipe_map, uio, &size);
			if (error)
				break;
			nread += size;
			if (rpipe->pipe_map.xio_bytes == 0) {
				rpipe->pipe_state |= PIPE_DIRECTIP;
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining
			 * processing.  We will either break out with an
			 * error or we will sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PCATCH|PNORESCHED,
				    "piperd", 0)) == 0) {
					error = pipelock(rpipe, 1);
				}
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio)
{
	int error;
	u_int size;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;
	if (size > XIO_INTERNAL_SIZE)
		size = XIO_INTERNAL_SIZE;

	error = xio_init_ubuf(&wpipe->pipe_map, uio->uio_iov->iov_base,
				size, XIOF_READ);
	if (error)
		return(error);

	/*
	 * Create a kernel map for KMEM and SFBUF2 copy modes.  SFBUF2 will
	 * map the pages on the target while KMEM maps the pages now.
	 */
	switch(wpipe->pipe_feature) {
	case PIPE_KMEM:
	case PIPE_SFBUF2:
		if (wpipe->pipe_kva == NULL) {
			wpipe->pipe_kva =
			    kmem_alloc_pageable(kernel_map, XIO_INTERNAL_SIZE);
			wpipe->pipe_kvamask = 0;
		}
		if (wpipe->pipe_feature == PIPE_KMEM) {
			pmap_qenter(wpipe->pipe_kva, wpipe->pipe_map.xio_pages,
				    wpipe->pipe_map.xio_npages);
		}
		break;
	default:
		break;
	}

	/*
	 * and update the uio data
	 */
	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(struct pipe *wpipe)
{
	int size;

	size = wpipe->pipe_map.xio_bytes;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~(PIPE_DIRECTW | PIPE_DIRECTIP);

	xio_copy_xtok(&wpipe->pipe_map, wpipe->pipe_buffer.buffer, size);
	xio_release(&wpipe->pipe_map);
	if (wpipe->pipe_kva) {
		pmap_qremove(wpipe->pipe_kva, XIO_INTERNAL_PAGES);
		kmem_free(kernel_map, wpipe->pipe_kva, XIO_INTERNAL_SIZE);
		wpipe->pipe_kva = NULL;
	}
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(struct pipe *wpipe, struct uio *uio)
{
	int error;

retry:
	while (wpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP)) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PCATCH, "pipdww", 0);
		if (error)
			goto error2;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error2;
		}
	}
	KKASSERT(wpipe->pipe_map.xio_bytes == 0);
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PCATCH, "pipdwc", 0);
		if (error)
			goto error2;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error2;
		}
		goto retry;
	}

	/*
	 * Build our direct-write buffer
	 */
	wpipe->pipe_state |= PIPE_DIRECTW | PIPE_DIRECTIP;
	error = pipe_build_write_buffer(wpipe, uio);
	if (error)
		goto error1;
	wpipe->pipe_state &= ~PIPE_DIRECTIP;

	/*
	 * Wait until the receiver has snarfed the data.  Since we are likely
	 * going to sleep we optimize the case and yield synchronously,
	 * possibly avoiding the tsleep().
	 */
	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			xio_release(&wpipe->pipe_map);
			if (wpipe->pipe_kva) {
				pmap_qremove(wpipe->pipe_kva, XIO_INTERNAL_PAGES);
				kmem_free(kernel_map, wpipe->pipe_kva, XIO_INTERNAL_SIZE);
				wpipe->pipe_kva = NULL;
			}
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PCATCH|PNORESCHED, "pipdwt", 0);
	}
	pipelock(wpipe,0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
		KKASSERT((wpipe->pipe_state & PIPE_DIRECTIP) == 0);
	} else {
		/*
		 * note: The pipe_kva mapping is not qremove'd here.  For
		 * legacy PIPE_KMEM mode this constitutes an improvement
		 * over the original FreeBSD-4 algorithm.  For PIPE_SFBUF2
		 * mode the kva mapping must not be removed to get the
		 * caching benefit.
		 *
		 * For testing purposes we will give the original algorithm
		 * the benefit of the doubt 'what it could have been', and
		 * keep the optimization.
		 */
		KKASSERT(wpipe->pipe_state & PIPE_DIRECTIP);
		xio_release(&wpipe->pipe_map);
		wpipe->pipe_state &= ~PIPE_DIRECTIP;
	}
	pipeunlock(wpipe);
	return (error);

	/*
	 * Direct-write error, clear the direct write flags.
	 */
error1:
	wpipe->pipe_state &= ~(PIPE_DIRECTW | PIPE_DIRECTIP);
	/* fallthrough */

	/*
	 * General error, wakeup the other side if it happens to be sleeping.
	 */
error2:
	wakeup(wpipe);
	return (error);
}
#endif
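
/*
 * Editor's summary of the direct-write handshake implemented above
 * (derived from the code; DIRECTW/DIRECTIP abbreviate PIPE_DIRECTW and
 * PIPE_DIRECTIP):
 *
 *	writer:	sets DIRECTW|DIRECTIP, builds the XIO buffer, then
 *		clears DIRECTIP and sleeps while DIRECTW remains set.
 *	reader:	copies the data out; when the XIO is exhausted it sets
 *		DIRECTIP, clears DIRECTW, and wakes the writer.
 *	writer:	sees DIRECTW clear, releases the XIO, clears DIRECTIP.
 *
 * If the writer is signalled while DIRECTW is still set, the data is
 * cloned into the pipe buffer (pipe_clone_write_buffer) instead.
 */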

static int
pipe_write(struct file *fp, struct uio *uio, struct ucred *cred,
	int flags, struct thread *td)
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		return (EPIPE);
	}
	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(pipe_nbig < pipe_maxbig) &&
		(wpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP)) == 0 &&
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe,1)) == 0) {
			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
				pipe_nbig++;
			pipeunlock(wpipe);
		}
	}

	/*
	 * If an early error occurred, unbusy and return, waking up any
	 * pending readers.
	 */
	if (error) {
		--wpipe->pipe_busy;
		if ((wpipe->pipe_busy == 0) &&
		    (wpipe->pipe_state & PIPE_WANT)) {
			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
			wakeup(wpipe);
		}
		return(error);
	}

	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT ||
		    pipe_dwrite_enable > 1) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    pipe_dwrite_enable) {
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP)) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		/*
		 * Write to fill, read size handles write hysteresis.  Also
		 * additional restrictions can cause select-based non-blocking
		 * writes to spin.
		 */
		if (space > 0) {
			if ((error = pipelock(wpipe,1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP)) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */

				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer wraparound disappeared");

					error = uiomove(&wpipe->pipe_buffer.buffer[0],
							size - segsize, uio);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
							panic("Expected wraparound bad");
						wpipe->pipe_buffer.in = size - segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");

				}
				pipeunlock(wpipe);
			}
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now
			 * and yield to let it drain synchronously rather
			 * than block.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = tsleep(wpipe, PCATCH|PNORESCHED, "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct thread *td)
{
	struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW) {
			*(int *)data = mpipe->pipe_map.xio_bytes;
		} else {
			*(int *)data = mpipe->pipe_buffer.cnt;
		}
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}
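
/*
 * Editor's sketch (hypothetical userland helper, excluded from the build):
 * FIONREAD reports the bytes immediately readable, including a pending
 * direct write (xio_bytes), as handled above.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>

int
pipe_pending_bytes(int fd)
{
	int n = 0;

	if (ioctl(fd, FIONREAD, &n) != 0)
		return (-1);
	return (n);
}
#endif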

int
pipe_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}
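
/*
 * Editor's sketch (hypothetical userland helper, excluded from the build):
 * poll() on a pipe as serviced by pipe_poll() above.  POLLIN fires on
 * buffered data, a pending direct write, or EOF; POLLHUP once either side
 * has seen PIPE_EOF.
 */
#if 0
#include <poll.h>

int
pipe_wait_readable(int fd, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN;
	pfd.revents = 0;
	if (poll(&pfd, 1, timeout_ms) <= 0)
		return (0);
	return ((pfd.revents & (POLLIN | POLLHUP)) != 0);
}
#endif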

static int
pipe_stat(struct file *fp, struct stat *ub, struct thread *td)
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
	 * st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(struct file *fp, struct thread *td)
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(struct pipe *cpipe)
{
	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--pipe_nbig;
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
		cpipe->pipe_buffer.object = NULL;
	}
#ifndef PIPE_NODIRECT
	KKASSERT(cpipe->pipe_map.xio_bytes == 0 &&
		cpipe->pipe_map.xio_offset == 0 &&
		cpipe->pipe_map.xio_npages == 0);
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(struct pipe *cpipe)
{
	globaldata_t gd;
	struct pipe *ppipe;

	if (cpipe == NULL)
		return;

	pipeselwakeup(cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
		tsleep(cpipe, 0, "pipecl", 0);
	}

	/*
	 * Disconnect from peer
	 */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		KNOTE(&ppipe->pipe_sel.si_note, 0);
		ppipe->pipe_peer = NULL;
	}

	if (cpipe->pipe_kva) {
		pmap_qremove(cpipe->pipe_kva, XIO_INTERNAL_PAGES);
		kmem_free(kernel_map, cpipe->pipe_kva, XIO_INTERNAL_SIZE);
		cpipe->pipe_kva = NULL;
	}

	/*
	 * free or cache resources
	 */
	gd = mycpu;
	if (gd->gd_pipeqcount >= pipe_maxcache ||
	    cpipe->pipe_buffer.size != PIPE_SIZE
	) {
		pipe_free_kmem(cpipe);
		free(cpipe, M_PIPE);
	} else {
		KKASSERT(cpipe->pipe_map.xio_npages == 0 &&
			cpipe->pipe_map.xio_bytes == 0 &&
			cpipe->pipe_map.xio_offset == 0);
		cpipe->pipe_state = 0;
		cpipe->pipe_busy = 0;
		cpipe->pipe_peer = gd->gd_pipeq;
		gd->gd_pipeq = cpipe;
		++gd->gd_pipeqcount;
	}
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		cpipe = cpipe->pipe_peer;
		if (cpipe == NULL)
			/* other end of pipe has been closed */
			return (EPIPE);
		break;
	default:
		return (1);
	}
	kn->kn_hook = (caddr_t)cpipe;

	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_hook;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.xio_bytes;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}
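
/*
 * Editor's sketch (hypothetical userland helper, excluded from the build):
 * the filters above back EVFILT_READ/EVFILT_WRITE; kn_data carries the
 * readable byte count (or remaining buffer space for writes) and EV_EOF
 * is set once the peer side is gone.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>

/* returns bytes readable per filt_piperead(), or -1 on error */
int
pipe_kq_wait_readable(int fd)
{
	struct kevent kev;
	int kq, n;

	if ((kq = kqueue()) < 0)
		return (-1);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	n = kevent(kq, &kev, 1, &kev, 1, NULL);
	close(kq);
	if (n < 1)
		return (-1);
	return ((int)kev.data);
}
#endif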