File:  [DragonFly] / src / sys / kern / vfs_bio.c
Revision 1.26
Wed May 19 22:52:58 2004 UTC by dillon
Branches: MAIN
CVS tags: HEAD
Device layer rollup commit.

* cdevsw_add() is now required.  cdevsw_add() and cdevsw_remove() may specify
  a mask/match indicating the range of supported minor numbers.  Multiple
  cdevsw_add()'s using the same major number, but distinctly different
  ranges, may be issued.  All drivers that previously failed to call
  cdevsw_add() now do so.

* cdevsw_remove() now automatically marks all devices within its supported
  range as being destroyed.
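
  As an illustration of the above (the argument names and the mask/match
  encoding below are assumptions, not taken from this commit), registration
  and removal for a hypothetical driver might look roughly like this:

	#define MYDEV_MASK	0	/* placeholder minor-range encoding */
	#define MYDEV_MATCH	0	/* placeholder value to match within the mask */

	static struct cdevsw mydev_cdevsw;	/* entry points filled in elsewhere */

	static void
	mydev_attach(void)
	{
		/* claim minors for which (minor & MYDEV_MASK) == MYDEV_MATCH */
		cdevsw_add(&mydev_cdevsw, MYDEV_MASK, MYDEV_MATCH);
	}

	static void
	mydev_detach(void)
	{
		/* also marks all devices within the registered range as destroyed */
		cdevsw_remove(&mydev_cdevsw, MYDEV_MASK, MYDEV_MATCH);
	}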

* vnode->v_rdev is no longer resolved when the vnode is created.  Instead,
  only v_udev (a newly added field) is resolved.  v_rdev is resolved when
  the vnode is opened and cleared on the last close.

* A great deal of code was making rather dubious assumptions with regard
  to the validity of devices associated with vnodes, primarily because a
  device structure persists by virtue of being indexed by (major, minor)
  instead of by (cdevsw, major, minor).  In particular, if you run a program
  which connects to a USB device and then you pull the USB device and plug
  it back in, the vnode subsystem will continue to believe that the device
  is open when, in fact, it isn't (because it was destroyed and recreated).

  In particular, note that all the VFS mount procedures now check devices
  via v_udev instead of v_rdev prior to calling VOP_OPEN(), since v_rdev
  is NULL prior to the first open.
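
  As a sketch of that check (variable names are illustrative and the exact
  udev2dev() calling convention is an assumption, not taken from this commit):

	/* inside a VFS mount routine; devvp is the backing device's vnode */
	dev_t dev;

	/*
	 * Resolve the device via v_udev; the lookup only succeeds for
	 * devices still covered by an active cdevsw mask/match.
	 */
	dev = udev2dev(devvp->v_udev, 0);
	if (dev == NODEV)
		return (ENXIO);
	/* ... proceed with VOP_OPEN(); v_rdev becomes valid on first open ... */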

* The disk layer's device interaction has been rewritten.  The disk layer
  (i.e. the slice and disklabel management layer) no longer overloads
  its data onto the device structure representing the underlying physical
  disk.  Instead, the disk layer uses the new cdevsw_add() functionality
  to register its own cdevsw using the underlying device's major number,
  and simply does NOT register the underlying device's cdevsw.  No
  confusion is created because the device hash is now based on
  (cdevsw,major,minor) rather than (major,minor).

  NOTE: This also means that underlying raw disk devices may use the entire
  device minor number instead of having to reserve the bits used by the disk
  layer, and also means that we can (theoretically) stack a fully
  disklabel-supported 'disk' on top of any block device.

* The new reference counting scheme prevents such stale references by
  associating a device with a cdevsw and disconnecting the device from its
  cdevsw when the cdevsw
  is removed.  Additionally, all udev2dev() lookups run through the cdevsw
  mask/match and only successfully find devices still associated with an
  active cdevsw.

* Major work on MFS:  MFS no longer shortcuts vnode and device creation.  It
  now creates a real vnode and a real device and implements real open and
  close VOPs.  Additionally, due to the disk layer changes, MFS is no longer
  limited to 255 mounts.  The new limit is 16 million.  Since MFS creates a
  real device node, mount_mfs will now create a real /dev/mfs<PID> device
  that can be read from userland (e.g. so you can dump an MFS filesystem).

* BUF AND DEVICE STRATEGY changes.  The struct buf contains a b_dev field.
  In order to properly handle stacked devices we now require that the b_dev
  field be initialized before the device strategy routine is called.  This
  required some additional work in various VFS implementations.  To enforce
  this requirement, biodone() now sets b_dev to NODEV.  The new disk layer
  will adjust b_dev before forwarding a request to the actual physical
  device.
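
  A minimal sketch of the resulting contract (generic names, not code taken
  from this commit):

	bp->b_dev = vp->v_rdev;		/* must be valid before issuing I/O */
	VOP_STRATEGY(vp, bp);
	/*
	 * biodone() resets b_dev to NODEV on completion, so a stacked disk
	 * layer must set b_dev again before forwarding the request to the
	 * underlying physical device.
	 */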

* A bug in the ISO CD boot sequence which resulted in a panic has been fixed.

Testing by: lots of people, but David Rhodus found the most egregious bugs.

    1: /*
    2:  * Copyright (c) 1994,1997 John S. Dyson
    3:  * All rights reserved.
    4:  *
    5:  * Redistribution and use in source and binary forms, with or without
    6:  * modification, are permitted provided that the following conditions
    7:  * are met:
    8:  * 1. Redistributions of source code must retain the above copyright
    9:  *    notice immediately at the beginning of the file, without modification,
   10:  *    this list of conditions, and the following disclaimer.
   11:  * 2. Absolutely no warranty of function or purpose is made by the author
   12:  *		John S. Dyson.
   13:  *
   14:  * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $
   15:  * $DragonFly: src/sys/kern/vfs_bio.c,v 1.26 2004/05/19 22:52:58 dillon Exp $
   16:  */
   17: 
   18: /*
   19:  * this file contains a new buffer I/O scheme implementing a coherent
   20:  * VM object and buffer cache scheme.  Pains have been taken to make
   21:  * sure that the performance degradation associated with schemes such
   22:  * as this is not realized.
   23:  *
   24:  * Author:  John S. Dyson
   25:  * Significant help during the development and debugging phases
   26:  * had been provided by David Greenman, also of the FreeBSD core team.
   27:  *
   28:  * see man buf(9) for more info.
   29:  */
   30: 
   31: #include <sys/param.h>
   32: #include <sys/systm.h>
   33: #include <sys/buf.h>
   34: #include <sys/conf.h>
   35: #include <sys/eventhandler.h>
   36: #include <sys/lock.h>
   37: #include <sys/malloc.h>
   38: #include <sys/mount.h>
   39: #include <sys/kernel.h>
   40: #include <sys/kthread.h>
   41: #include <sys/proc.h>
   42: #include <sys/reboot.h>
   43: #include <sys/resourcevar.h>
   44: #include <sys/sysctl.h>
   45: #include <sys/vmmeter.h>
   46: #include <sys/vnode.h>
   47: #include <sys/proc.h>
   48: #include <vm/vm.h>
   49: #include <vm/vm_param.h>
   50: #include <vm/vm_kern.h>
   51: #include <vm/vm_pageout.h>
   52: #include <vm/vm_page.h>
   53: #include <vm/vm_object.h>
   54: #include <vm/vm_extern.h>
   55: #include <vm/vm_map.h>
   56: #include <sys/buf2.h>
   57: #include <vm/vm_page2.h>
   58: 
   59: static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
   60: 
   61: struct	bio_ops bioops;		/* I/O operation notification */
   62: 
   63: struct buf *buf;		/* buffer header pool */
   64: struct swqueue bswlist;
   65: 
   66: static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
   67: 		vm_offset_t to);
   68: static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
   69: 		vm_offset_t to);
   70: static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
   71: 			       int pageno, vm_page_t m);
   72: static void vfs_clean_pages(struct buf * bp);
   73: static void vfs_setdirty(struct buf *bp);
   74: static void vfs_vmio_release(struct buf *bp);
   75: static void vfs_backgroundwritedone(struct buf *bp);
   76: static int flushbufqueues(void);
   77: 
   78: static int bd_request;
   79: 
   80: static void buf_daemon (void);
   81: /*
   82:  * bogus page -- for I/O to/from partially complete buffers
   83:  * this is a temporary solution to the problem, but it is not
   84:  * really that bad.  it would be better to split the buffer
   85:  * for input in the case of buffers partially already in memory,
   86:  * but the code is intricate enough already.
   87:  */
   88: vm_page_t bogus_page;
   89: int vmiodirenable = TRUE;
   90: int runningbufspace;
   91: struct lwkt_token buftimetoken;  /* Interlock on setting prio and timo */
   92: 
   93: static vm_offset_t bogus_offset;
   94: 
   95: static int bufspace, maxbufspace,
   96: 	bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
   97: static int bufreusecnt, bufdefragcnt, buffreekvacnt;
   98: static int needsbuffer;
   99: static int lorunningspace, hirunningspace, runningbufreq;
  100: static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
  101: static int numfreebuffers, lofreebuffers, hifreebuffers;
  102: static int getnewbufcalls;
  103: static int getnewbufrestarts;
  104: 
  105: SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
  106: 	&numdirtybuffers, 0, "");
  107: SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
  108: 	&lodirtybuffers, 0, "");
  109: SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
  110: 	&hidirtybuffers, 0, "");
  111: SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
  112: 	&numfreebuffers, 0, "");
  113: SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
  114: 	&lofreebuffers, 0, "");
  115: SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
  116: 	&hifreebuffers, 0, "");
  117: SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
  118: 	&runningbufspace, 0, "");
  119: SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
  120: 	&lorunningspace, 0, "");
  121: SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
  122: 	&hirunningspace, 0, "");
  123: SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
  124: 	&maxbufspace, 0, "");
  125: SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
  126: 	&hibufspace, 0, "");
  127: SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
  128: 	&lobufspace, 0, "");
  129: SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
  130: 	&bufspace, 0, "");
  131: SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
  132: 	&maxbufmallocspace, 0, "");
  133: SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
  134: 	&bufmallocspace, 0, "");
  135: SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
  136: 	&getnewbufcalls, 0, "");
  137: SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
  138: 	&getnewbufrestarts, 0, "");
  139: SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
  140: 	&vmiodirenable, 0, "");
  141: SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW,
  142: 	&bufdefragcnt, 0, "");
  143: SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW,
  144: 	&buffreekvacnt, 0, "");
  145: SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW,
  146: 	&bufreusecnt, 0, "");
  147: 
  148: /*
  149:  * Disable background writes for now.  There appear to be races in the 
  150:  * flags tests and locking operations as well as races in the completion
  151:  * code modifying the original bp (origbp) without holding a lock, assuming
  152:  * splbio protection when there might not be splbio protection.
  153:  */
  154: static int dobkgrdwrite = 0;
  155: SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
  156: 	"Do background writes (honoring the BV_BKGRDWRITE flag)?");
  157: 
  158: static int bufhashmask;
  159: static int bufhashshift;
  160: static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
  161: struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
  162: char *buf_wmesg = BUF_WMESG;
  163: 
  164: extern int vm_swap_size;
  165: 
  166: #define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
  167: #define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
  168: #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
  169: #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
  170: 
  171: /*
  172:  * Buffer hash table code.  Note that the logical block scans linearly, which
  173:  * gives us some L1 cache locality.
  174:  */
  175: 
  176: static __inline 
  177: struct bufhashhdr *
  178: bufhash(struct vnode *vnp, daddr_t bn)
  179: {
  180: 	u_int64_t hashkey64;
  181: 	int hashkey; 
  182: 	
  183: 	/*
  184: 	 * A variation on the Fibonacci hash that Knuth credits to
  185: 	 * R. W. Floyd, see Knuth's _Art of Computer Programming,
  186: 	 * Volume 3 / Sorting and Searching_
  187: 	 *
  188:          * We reduce the argument to 32 bits before doing the hash to
  189: 	 * avoid the need for a slow 64x64 multiply on 32 bit platforms.
  190: 	 *
  191: 	 * sizeof(struct vnode) is 168 on i386, so toss some of the lower
  192: 	 * bits of the vnode address to reduce the key range, which
  193: 	 * improves the distribution of keys across buckets.
  194: 	 *
  195: 	 * The file system cylinder group blocks are very heavily
   196: 	 * used.  They are located at intervals of fpg, which is
  197: 	 * on the order of 89 to 94 * 2^10, depending on other
  198: 	 * filesystem parameters, for a 16k block size.  Smaller block
  199: 	 * sizes will reduce fpg approximately proportionally.  This
  200: 	 * will cause the cylinder group index to be hashed using the
  201: 	 * lower bits of the hash multiplier, which will not distribute
  202: 	 * the keys as uniformly in a classic Fibonacci hash where a
  203: 	 * relatively small number of the upper bits of the result
  204: 	 * are used.  Using 2^16 as a close-enough approximation to
  205: 	 * fpg, split the hash multiplier in half, with the upper 16
  206: 	 * bits being the inverse of the golden ratio, and the lower
  207: 	 * 16 bits being a fraction between 1/3 and 3/7 (closer to
  208: 	 * 3/7 in this case), that gives good experimental results.
  209: 	 */
  210: 	hashkey64 = ((u_int64_t)(uintptr_t)vnp >> 3) + (u_int64_t)bn;
  211: 	hashkey = (((u_int32_t)(hashkey64 + (hashkey64 >> 32)) * 0x9E376DB1u) >>
  212: 	    bufhashshift) & bufhashmask;
  213: 	return(&bufhashtbl[hashkey]);
  214: }
  215: 
  216: /*
  217:  *	numdirtywakeup:
  218:  *
  219:  *	If someone is blocked due to there being too many dirty buffers,
  220:  *	and numdirtybuffers is now reasonable, wake them up.
  221:  */
  222: 
  223: static __inline void
  224: numdirtywakeup(int level)
  225: {
  226: 	if (numdirtybuffers <= level) {
  227: 		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
  228: 			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
  229: 			wakeup(&needsbuffer);
  230: 		}
  231: 	}
  232: }
  233: 
  234: /*
  235:  *	bufspacewakeup:
  236:  *
  237:  *	Called when buffer space is potentially available for recovery.
  238:  *	getnewbuf() will block on this flag when it is unable to free 
  239:  *	sufficient buffer space.  Buffer space becomes recoverable when 
  240:  *	bp's get placed back in the queues.
  241:  */
  242: 
  243: static __inline void
  244: bufspacewakeup(void)
  245: {
  246: 	/*
  247: 	 * If someone is waiting for BUF space, wake them up.  Even
  248: 	 * though we haven't freed the kva space yet, the waiting
  249: 	 * process will be able to now.
  250: 	 */
  251: 	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
  252: 		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
  253: 		wakeup(&needsbuffer);
  254: 	}
  255: }
  256: 
  257: /*
  258:  * runningbufwakeup() - in-progress I/O accounting.
  259:  *
  260:  */
  261: static __inline void
  262: runningbufwakeup(struct buf *bp)
  263: {
  264: 	if (bp->b_runningbufspace) {
  265: 		runningbufspace -= bp->b_runningbufspace;
  266: 		bp->b_runningbufspace = 0;
  267: 		if (runningbufreq && runningbufspace <= lorunningspace) {
  268: 			runningbufreq = 0;
  269: 			wakeup(&runningbufreq);
  270: 		}
  271: 	}
  272: }
  273: 
  274: /*
  275:  *	bufcountwakeup:
  276:  *
  277:  *	Called when a buffer has been added to one of the free queues to
  278:  *	account for the buffer and to wakeup anyone waiting for free buffers.
  279:  *	This typically occurs when large amounts of metadata are being handled
  280:  *	by the buffer cache ( else buffer space runs out first, usually ).
  281:  */
  282: 
  283: static __inline void
  284: bufcountwakeup(void) 
  285: {
  286: 	++numfreebuffers;
  287: 	if (needsbuffer) {
  288: 		needsbuffer &= ~VFS_BIO_NEED_ANY;
  289: 		if (numfreebuffers >= hifreebuffers)
  290: 			needsbuffer &= ~VFS_BIO_NEED_FREE;
  291: 		wakeup(&needsbuffer);
  292: 	}
  293: }
  294: 
  295: /*
  296:  *	waitrunningbufspace()
  297:  *
  298:  *	runningbufspace is a measure of the amount of I/O currently
  299:  *	running.  This routine is used in async-write situations to
  300:  *	prevent creating huge backups of pending writes to a device.
  301:  *	Only asynchronous writes are governed by this function.  
  302:  *
  303:  *	Reads will adjust runningbufspace, but will not block based on it.
  304:  *	The read load has a side effect of reducing the allowed write load.
  305:  *
  306:  *	This does NOT turn an async write into a sync write.  It waits
  307:  *	for earlier writes to complete and generally returns before the
  308:  *	caller's write has reached the device.
  309:  */
  310: static __inline void
  311: waitrunningbufspace(void)
  312: {
  313: 	while (runningbufspace > hirunningspace) {
  314: 		int s;
  315: 
  316: 		s = splbio();	/* fix race against interrupt/biodone() */
  317: 		++runningbufreq;
  318: 		tsleep(&runningbufreq, 0, "wdrain", 0);
  319: 		splx(s);
  320: 	}
  321: }
  322: 
  323: /*
  324:  *	vfs_buf_test_cache:
  325:  *
  326:  *	Called when a buffer is extended.  This function clears the B_CACHE
  327:  *	bit if the newly extended portion of the buffer does not contain
  328:  *	valid data.
  329:  */
  330: static __inline__
  331: void
  332: vfs_buf_test_cache(struct buf *bp,
  333: 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
  334: 		  vm_page_t m)
  335: {
  336: 	if (bp->b_flags & B_CACHE) {
  337: 		int base = (foff + off) & PAGE_MASK;
  338: 		if (vm_page_is_valid(m, base, size) == 0)
  339: 			bp->b_flags &= ~B_CACHE;
  340: 	}
  341: }
  342: 
  343: static __inline__
  344: void
  345: bd_wakeup(int dirtybuflevel)
  346: {
  347: 	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
  348: 		bd_request = 1;
  349: 		wakeup(&bd_request);
  350: 	}
  351: }
  352: 
  353: /*
  354:  * bd_speedup - speedup the buffer cache flushing code
  355:  */
  356: 
  357: static __inline__
  358: void
  359: bd_speedup(void)
  360: {
  361: 	bd_wakeup(1);
  362: }
  363: 
  364: /*
  365:  * Initialize buffer headers and related structures. 
  366:  */
  367: 
  368: caddr_t
  369: bufhashinit(caddr_t vaddr)
  370: {
  371: 	/* first, make a null hash table */
  372: 	bufhashshift = 29;
  373: 	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
  374: 		bufhashshift--;
  375: 	bufhashtbl = (void *)vaddr;
  376: 	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
  377: 	--bufhashmask;
  378: 	return(vaddr);
  379: }
  380: 
  381: void
  382: bufinit(void)
  383: {
  384: 	struct buf *bp;
  385: 	int i;
  386: 
  387: 	TAILQ_INIT(&bswlist);
  388: 	LIST_INIT(&invalhash);
  389: 	lwkt_token_init(&buftimetoken);
  390: 
  391: 	for (i = 0; i <= bufhashmask; i++)
  392: 		LIST_INIT(&bufhashtbl[i]);
  393: 
  394: 	/* next, make a null set of free lists */
  395: 	for (i = 0; i < BUFFER_QUEUES; i++)
  396: 		TAILQ_INIT(&bufqueues[i]);
  397: 
  398: 	/* finally, initialize each buffer header and stick on empty q */
  399: 	for (i = 0; i < nbuf; i++) {
  400: 		bp = &buf[i];
  401: 		bzero(bp, sizeof *bp);
  402: 		bp->b_flags = B_INVAL;	/* we're just an empty header */
  403: 		bp->b_dev = NODEV;
  404: 		bp->b_qindex = QUEUE_EMPTY;
  405: 		bp->b_xflags = 0;
  406: 		LIST_INIT(&bp->b_dep);
  407: 		BUF_LOCKINIT(bp);
  408: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
  409: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
  410: 	}
  411: 
  412: 	/*
  413: 	 * maxbufspace is the absolute maximum amount of buffer space we are 
  414: 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
  415: 	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
  416: 	 * used by most other processes.  The differential is required to 
  417: 	 * ensure that buf_daemon is able to run when other processes might 
  418: 	 * be blocked waiting for buffer space.
  419: 	 *
   420: 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
  421: 	 * this may result in KVM fragmentation which is not handled optimally
  422: 	 * by the system.
  423: 	 */
  424: 	maxbufspace = nbuf * BKVASIZE;
  425: 	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
  426: 	lobufspace = hibufspace - MAXBSIZE;
  427: 
  428: 	lorunningspace = 512 * 1024;
  429: 	hirunningspace = 1024 * 1024;
  430: 
  431: /*
  432:  * Limit the amount of malloc memory since it is wired permanently into
  433:  * the kernel space.  Even though this is accounted for in the buffer
  434:  * allocation, we don't want the malloced region to grow uncontrolled.
  435:  * The malloc scheme improves memory utilization significantly on average
  436:  * (small) directories.
  437:  */
  438: 	maxbufmallocspace = hibufspace / 20;
  439: 
  440: /*
   441:  * Reduce the chance of a deadlock occurring by limiting the number
  442:  * of delayed-write dirty buffers we allow to stack up.
  443:  */
  444: 	hidirtybuffers = nbuf / 4 + 20;
  445: 	numdirtybuffers = 0;
  446: /*
  447:  * To support extreme low-memory systems, make sure hidirtybuffers cannot
  448:  * eat up all available buffer space.  This occurs when our minimum cannot
  449:  * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
  450:  * BKVASIZE'd (8K) buffers.
  451:  */
  452: 	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
  453: 		hidirtybuffers >>= 1;
  454: 	}
  455: 	lodirtybuffers = hidirtybuffers / 2;
  456: 
  457: /*
  458:  * Try to keep the number of free buffers in the specified range,
  459:  * and give special processes (e.g. like buf_daemon) access to an 
  460:  * emergency reserve.
  461:  */
  462: 	lofreebuffers = nbuf / 18 + 5;
  463: 	hifreebuffers = 2 * lofreebuffers;
  464: 	numfreebuffers = nbuf;
  465: 
  466: /*
  467:  * Maximum number of async ops initiated per buf_daemon loop.  This is
  468:  * somewhat of a hack at the moment, we really need to limit ourselves
  469:  * based on the number of bytes of I/O in-transit that were initiated
  470:  * from buf_daemon.
  471:  */
  472: 
  473: 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
  474: 	bogus_page = vm_page_alloc(kernel_object,
  475: 			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
  476: 			VM_ALLOC_NORMAL);
  477: 	vmstats.v_wire_count++;
  478: 
  479: }
  480: 
  481: /*
  482:  * bfreekva() - free the kva allocation for a buffer.
  483:  *
  484:  *	Must be called at splbio() or higher as this is the only locking for
  485:  *	buffer_map.
  486:  *
  487:  *	Since this call frees up buffer space, we call bufspacewakeup().
  488:  */
  489: static void
  490: bfreekva(struct buf * bp)
  491: {
  492: 	int count;
  493: 
  494: 	if (bp->b_kvasize) {
  495: 		++buffreekvacnt;
  496: 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
  497: 		vm_map_lock(buffer_map);
  498: 		bufspace -= bp->b_kvasize;
  499: 		vm_map_delete(buffer_map,
  500: 		    (vm_offset_t) bp->b_kvabase,
  501: 		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize,
  502: 		    &count
  503: 		);
  504: 		vm_map_unlock(buffer_map);
  505: 		vm_map_entry_release(count);
  506: 		bp->b_kvasize = 0;
  507: 		bufspacewakeup();
  508: 	}
  509: }
  510: 
  511: /*
  512:  *	bremfree:
  513:  *
  514:  *	Remove the buffer from the appropriate free list.
  515:  */
  516: void
  517: bremfree(struct buf * bp)
  518: {
  519: 	int s = splbio();
  520: 	int old_qindex = bp->b_qindex;
  521: 
  522: 	if (bp->b_qindex != QUEUE_NONE) {
  523: 		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
  524: 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
  525: 		bp->b_qindex = QUEUE_NONE;
  526: 	} else {
  527: 		if (BUF_REFCNT(bp) <= 1)
  528: 			panic("bremfree: removing a buffer not on a queue");
  529: 	}
  530: 
  531: 	/*
  532: 	 * Fixup numfreebuffers count.  If the buffer is invalid or not
   533: 	 * delayed-write, and it was on the DIRTY, CLEAN, EMPTY, or EMPTYKVA
   534: 	 * queues, the buffer was free and we must decrement numfreebuffers.
  535: 	 */
  536: 	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
  537: 		switch(old_qindex) {
  538: 		case QUEUE_DIRTY:
  539: 		case QUEUE_CLEAN:
  540: 		case QUEUE_EMPTY:
  541: 		case QUEUE_EMPTYKVA:
  542: 			--numfreebuffers;
  543: 			break;
  544: 		default:
  545: 			break;
  546: 		}
  547: 	}
  548: 	splx(s);
  549: }
  550: 
  551: 
  552: /*
  553:  * Get a buffer with the specified data.  Look in the cache first.  We
  554:  * must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  555:  * is set, the buffer is valid and we do not have to do anything ( see
  556:  * getblk() ).
  557:  */
  558: int
  559: bread(struct vnode * vp, daddr_t blkno, int size, struct buf ** bpp)
  560: {
  561: 	struct buf *bp;
  562: 
  563: 	bp = getblk(vp, blkno, size, 0, 0);
  564: 	*bpp = bp;
  565: 
  566: 	/* if not found in cache, do some I/O */
  567: 	if ((bp->b_flags & B_CACHE) == 0) {
  568: 		KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
  569: 		bp->b_flags |= B_READ;
  570: 		bp->b_flags &= ~(B_ERROR | B_INVAL);
  571: 		vfs_busy_pages(bp, 0);
  572: 		VOP_STRATEGY(vp, bp);
  573: 		return (biowait(bp));
  574: 	}
  575: 	return (0);
  576: }
  577: 
  578: /*
  579:  * Operates like bread, but also starts asynchronous I/O on
  580:  * read-ahead blocks.  We must clear B_ERROR and B_INVAL prior
  581:  * to initiating I/O . If B_CACHE is set, the buffer is valid 
  582:  * and we do not have to do anything.
  583:  */
  584: int
  585: breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno,
  586: 	int *rabsize, int cnt, struct buf ** bpp)
  587: {
  588: 	struct buf *bp, *rabp;
  589: 	int i;
  590: 	int rv = 0, readwait = 0;
  591: 
  592: 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
  593: 
  594: 	/* if not found in cache, do some I/O */
  595: 	if ((bp->b_flags & B_CACHE) == 0) {
  596: 		bp->b_flags |= B_READ;
  597: 		bp->b_flags &= ~(B_ERROR | B_INVAL);
  598: 		vfs_busy_pages(bp, 0);
  599: 		VOP_STRATEGY(vp, bp);
  600: 		++readwait;
  601: 	}
  602: 
  603: 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
  604: 		if (inmem(vp, *rablkno))
  605: 			continue;
  606: 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
  607: 
  608: 		if ((rabp->b_flags & B_CACHE) == 0) {
  609: 			rabp->b_flags |= B_READ | B_ASYNC;
  610: 			rabp->b_flags &= ~(B_ERROR | B_INVAL);
  611: 			vfs_busy_pages(rabp, 0);
  612: 			BUF_KERNPROC(rabp);
  613: 			VOP_STRATEGY(vp, rabp);
  614: 		} else {
  615: 			brelse(rabp);
  616: 		}
  617: 	}
  618: 
  619: 	if (readwait) {
  620: 		rv = biowait(bp);
  621: 	}
  622: 	return (rv);
  623: }
  624: 
  625: /*
  626:  * Write, release buffer on completion.  (Done by iodone
  627:  * if async).  Do not bother writing anything if the buffer
  628:  * is invalid.
  629:  *
  630:  * Note that we set B_CACHE here, indicating that buffer is
  631:  * fully valid and thus cacheable.  This is true even of NFS
  632:  * now so we set it generally.  This could be set either here 
  633:  * or in biodone() since the I/O is synchronous.  We put it
  634:  * here.
  635:  */
  636: int
  637: bwrite(struct buf * bp)
  638: {
  639: 	int oldflags, s;
  640: 	struct buf *newbp;
  641: 
  642: 	if (bp->b_flags & B_INVAL) {
  643: 		brelse(bp);
  644: 		return (0);
  645: 	}
  646: 
  647: 	oldflags = bp->b_flags;
  648: 
  649: 	if (BUF_REFCNT(bp) == 0)
  650: 		panic("bwrite: buffer is not busy???");
  651: 	s = splbio();
  652: 	/*
  653: 	 * If a background write is already in progress, delay
  654: 	 * writing this block if it is asynchronous. Otherwise
  655: 	 * wait for the background write to complete.
  656: 	 */
  657: 	if (bp->b_xflags & BX_BKGRDINPROG) {
  658: 		if (bp->b_flags & B_ASYNC) {
  659: 			splx(s);
  660: 			bdwrite(bp);
  661: 			return (0);
  662: 		}
  663: 		bp->b_xflags |= BX_BKGRDWAIT;
  664: 		tsleep(&bp->b_xflags, 0, "biord", 0);
  665: 		if (bp->b_xflags & BX_BKGRDINPROG)
  666: 			panic("bwrite: still writing");
  667: 	}
  668: 
  669: 	/* Mark the buffer clean */
  670: 	bundirty(bp);
  671: 
  672: 	/*
  673: 	 * If this buffer is marked for background writing and we
  674: 	 * do not have to wait for it, make a copy and write the
  675: 	 * copy so as to leave this buffer ready for further use.
  676: 	 *
  677: 	 * This optimization eats a lot of memory.  If we have a page
   678: 	 * or buffer shortfall we can't do it.
  679: 	 */
  680: 	if (dobkgrdwrite &&
  681: 	    (bp->b_xflags & BX_BKGRDWRITE) &&
  682: 	    (bp->b_flags & B_ASYNC) &&
  683: 	    !vm_page_count_severe() &&
  684: 	    !buf_dirty_count_severe()) {
  685: 		if (bp->b_flags & B_CALL)
  686: 			panic("bwrite: need chained iodone");
  687: 
  688: 		/* get a new block */
  689: 		newbp = geteblk(bp->b_bufsize);
  690: 
  691: 		/* set it to be identical to the old block */
  692: 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
  693: 		bgetvp(bp->b_vp, newbp);
  694: 		newbp->b_lblkno = bp->b_lblkno;
  695: 		newbp->b_blkno = bp->b_blkno;
  696: 		newbp->b_offset = bp->b_offset;
  697: 		newbp->b_iodone = vfs_backgroundwritedone;
  698: 		newbp->b_flags |= B_ASYNC | B_CALL;
  699: 		newbp->b_flags &= ~B_INVAL;
  700: 
  701: 		/* move over the dependencies */
  702: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
  703: 			(*bioops.io_movedeps)(bp, newbp);
  704: 
  705: 		/*
  706: 		 * Initiate write on the copy, release the original to
  707: 		 * the B_LOCKED queue so that it cannot go away until
  708: 		 * the background write completes. If not locked it could go
  709: 		 * away and then be reconstituted while it was being written.
  710: 		 * If the reconstituted buffer were written, we could end up
  711: 		 * with two background copies being written at the same time.
  712: 		 */
  713: 		bp->b_xflags |= BX_BKGRDINPROG;
  714: 		bp->b_flags |= B_LOCKED;
  715: 		bqrelse(bp);
  716: 		bp = newbp;
  717: 	}
  718: 
  719: 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
  720: 	bp->b_flags |= B_WRITEINPROG | B_CACHE;
  721: 
  722: 	bp->b_vp->v_numoutput++;
  723: 	vfs_busy_pages(bp, 1);
  724: 
  725: 	/*
  726: 	 * Normal bwrites pipeline writes
  727: 	 */
  728: 	bp->b_runningbufspace = bp->b_bufsize;
  729: 	runningbufspace += bp->b_runningbufspace;
  730: 
  731: 	splx(s);
  732: 	if (oldflags & B_ASYNC)
  733: 		BUF_KERNPROC(bp);
  734: 	VOP_STRATEGY(bp->b_vp, bp);
  735: 
  736: 	if ((oldflags & B_ASYNC) == 0) {
  737: 		int rtval = biowait(bp);
  738: 		brelse(bp);
  739: 		return (rtval);
  740: 	} else if ((oldflags & B_NOWDRAIN) == 0) {
  741: 		/*
  742: 		 * don't allow the async write to saturate the I/O
  743: 		 * system.  Deadlocks can occur only if a device strategy
  744: 		 * routine (like in VN) turns around and issues another
  745: 		 * high-level write, in which case B_NOWDRAIN is expected
  746: 		 * to be set.   Otherwise we will not deadlock here because
  747: 		 * we are blocking waiting for I/O that is already in-progress
  748: 		 * to complete.
  749: 		 */
  750: 		waitrunningbufspace();
  751: 	}
  752: 
  753: 	return (0);
  754: }
  755: 
  756: /*
  757:  * Complete a background write started from bwrite.
  758:  */
  759: static void
  760: vfs_backgroundwritedone(bp)
  761: 	struct buf *bp;
  762: {
  763: 	struct buf *origbp;
  764: 
  765: 	/*
  766: 	 * Find the original buffer that we are writing.
  767: 	 */
  768: 	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
  769: 		panic("backgroundwritedone: lost buffer");
  770: 	/*
  771: 	 * Process dependencies then return any unfinished ones.
  772: 	 */
  773: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
  774: 		(*bioops.io_complete)(bp);
  775: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
  776: 		(*bioops.io_movedeps)(bp, origbp);
  777: 	/*
  778: 	 * Clear the BX_BKGRDINPROG flag in the original buffer
  779: 	 * and awaken it if it is waiting for the write to complete.
  780: 	 * If BX_BKGRDINPROG is not set in the original buffer it must
  781: 	 * have been released and re-instantiated - which is not legal.
  782: 	 */
  783: 	KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
  784: 	origbp->b_xflags &= ~BX_BKGRDINPROG;
  785: 	if (origbp->b_xflags & BX_BKGRDWAIT) {
  786: 		origbp->b_xflags &= ~BX_BKGRDWAIT;
  787: 		wakeup(&origbp->b_xflags);
  788: 	}
  789: 	/*
  790: 	 * Clear the B_LOCKED flag and remove it from the locked
  791: 	 * queue if it currently resides there.
  792: 	 */
  793: 	origbp->b_flags &= ~B_LOCKED;
  794: 	if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
  795: 		bremfree(origbp);
  796: 		bqrelse(origbp);
  797: 	}
  798: 	/*
  799: 	 * This buffer is marked B_NOCACHE, so when it is released
  800: 	 * by biodone, it will be tossed. We mark it with B_READ
  801: 	 * to avoid biodone doing a second vwakeup.
  802: 	 */
  803: 	bp->b_flags |= B_NOCACHE | B_READ;
  804: 	bp->b_flags &= ~(B_CACHE | B_CALL | B_DONE);
  805: 	bp->b_iodone = 0;
  806: 	biodone(bp);
  807: }
  808: 
  809: /*
  810:  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  811:  * anything if the buffer is marked invalid.
  812:  *
  813:  * Note that since the buffer must be completely valid, we can safely
   814:  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
  815:  * biodone() in order to prevent getblk from writing the buffer
  816:  * out synchronously.
  817:  */
  818: void
  819: bdwrite(struct buf * bp)
  820: {
  821: 	if (BUF_REFCNT(bp) == 0)
  822: 		panic("bdwrite: buffer is not busy");
  823: 
  824: 	if (bp->b_flags & B_INVAL) {
  825: 		brelse(bp);
  826: 		return;
  827: 	}
  828: 	bdirty(bp);
  829: 
  830: 	/*
  831: 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
  832: 	 * true even of NFS now.
  833: 	 */
  834: 	bp->b_flags |= B_CACHE;
  835: 
  836: 	/*
  837: 	 * This bmap keeps the system from needing to do the bmap later,
  838: 	 * perhaps when the system is attempting to do a sync.  Since it
  839: 	 * is likely that the indirect block -- or whatever other datastructure
  840: 	 * that the filesystem needs is still in memory now, it is a good
  841: 	 * thing to do this.  Note also, that if the pageout daemon is
  842: 	 * requesting a sync -- there might not be enough memory to do
  843: 	 * the bmap then...  So, this is important to do.
  844: 	 */
  845: 	if (bp->b_lblkno == bp->b_blkno) {
  846: 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
  847: 	}
  848: 
  849: 	/*
  850: 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
  851: 	 */
  852: 	vfs_setdirty(bp);
  853: 
  854: 	/*
  855: 	 * We need to do this here to satisfy the vnode_pager and the
  856: 	 * pageout daemon, so that it thinks that the pages have been
  857: 	 * "cleaned".  Note that since the pages are in a delayed write
  858: 	 * buffer -- the VFS layer "will" see that the pages get written
  859: 	 * out on the next sync, or perhaps the cluster will be completed.
  860: 	 */
  861: 	vfs_clean_pages(bp);
  862: 	bqrelse(bp);
  863: 
  864: 	/*
  865: 	 * Wakeup the buffer flushing daemon if we have a lot of dirty
  866: 	 * buffers (midpoint between our recovery point and our stall
  867: 	 * point).
  868: 	 */
  869: 	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
  870: 
  871: 	/*
  872: 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
  873: 	 * due to the softdep code.
  874: 	 */
  875: }
  876: 
  877: /*
  878:  *	bdirty:
  879:  *
  880:  *	Turn buffer into delayed write request.  We must clear B_READ and
  881:  *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to 
  882:  *	itself to properly update it in the dirty/clean lists.  We mark it
  883:  *	B_DONE to ensure that any asynchronization of the buffer properly
  884:  *	clears B_DONE ( else a panic will occur later ).  
  885:  *
  886:  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
  887:  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
  888:  *	should only be called if the buffer is known-good.
  889:  *
  890:  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  891:  *	count.
  892:  *
  893:  *	Must be called at splbio().
  894:  *	The buffer must be on QUEUE_NONE.
  895:  */
  896: void
  897: bdirty(bp)
  898: 	struct buf *bp;
  899: {
  900: 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
  901: 	bp->b_flags &= ~(B_READ|B_RELBUF);
  902: 
  903: 	if ((bp->b_flags & B_DELWRI) == 0) {
  904: 		bp->b_flags |= B_DONE | B_DELWRI;
  905: 		reassignbuf(bp, bp->b_vp);
  906: 		++numdirtybuffers;
  907: 		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
  908: 	}
  909: }
  910: 
  911: /*
  912:  *	bundirty:
  913:  *
  914:  *	Clear B_DELWRI for buffer.
  915:  *
  916:  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  917:  *	count.
  918:  *	
  919:  *	Must be called at splbio().
  920:  *	The buffer must be on QUEUE_NONE.
  921:  */
  922: 
  923: void
  924: bundirty(bp)
  925: 	struct buf *bp;
  926: {
  927: 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
  928: 
  929: 	if (bp->b_flags & B_DELWRI) {
  930: 		bp->b_flags &= ~B_DELWRI;
  931: 		reassignbuf(bp, bp->b_vp);
  932: 		--numdirtybuffers;
  933: 		numdirtywakeup(lodirtybuffers);
  934: 	}
  935: 	/*
  936: 	 * Since it is now being written, we can clear its deferred write flag.
  937: 	 */
  938: 	bp->b_flags &= ~B_DEFERRED;
  939: }
  940: 
  941: /*
  942:  *	bawrite:
  943:  *
  944:  *	Asynchronous write.  Start output on a buffer, but do not wait for
  945:  *	it to complete.  The buffer is released when the output completes.
  946:  *
  947:  *	bwrite() ( or the VOP routine anyway ) is responsible for handling 
  948:  *	B_INVAL buffers.  Not us.
  949:  */
  950: void
  951: bawrite(struct buf * bp)
  952: {
  953: 	bp->b_flags |= B_ASYNC;
  954: 	(void) VOP_BWRITE(bp->b_vp, bp);
  955: }
  956: 
  957: /*
  958:  *	bowrite:
  959:  *
  960:  *	Ordered write.  Start output on a buffer, and flag it so that the 
  961:  *	device will write it in the order it was queued.  The buffer is 
  962:  *	released when the output completes.  bwrite() ( or the VOP routine
  963:  *	anyway ) is responsible for handling B_INVAL buffers.
  964:  */
  965: int
  966: bowrite(struct buf * bp)
  967: {
  968: 	bp->b_flags |= B_ORDERED | B_ASYNC;
  969: 	return (VOP_BWRITE(bp->b_vp, bp));
  970: }
  971: 
  972: /*
  973:  *	bwillwrite:
  974:  *
  975:  *	Called prior to the locking of any vnodes when we are expecting to
  976:  *	write.  We do not want to starve the buffer cache with too many
  977:  *	dirty buffers so we block here.  By blocking prior to the locking
  978:  *	of any vnodes we attempt to avoid the situation where a locked vnode
  979:  *	prevents the various system daemons from flushing related buffers.
  980:  */
  981: 
  982: void
  983: bwillwrite(void)
  984: {
  985: 	if (numdirtybuffers >= hidirtybuffers) {
  986: 		int s;
  987: 
  988: 		s = splbio();
  989: 		while (numdirtybuffers >= hidirtybuffers) {
  990: 			bd_wakeup(1);
  991: 			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
  992: 			tsleep(&needsbuffer, 0, "flswai", 0);
  993: 		}
  994: 		splx(s);
  995: 	}
  996: }
  997: 
  998: /*
  999:  * Return true if we have too many dirty buffers.
 1000:  */
 1001: int
 1002: buf_dirty_count_severe(void)
 1003: {
 1004: 	return(numdirtybuffers >= hidirtybuffers);
 1005: }
 1006: 
 1007: /*
 1008:  *	brelse:
 1009:  *
 1010:  *	Release a busy buffer and, if requested, free its resources.  The
 1011:  *	buffer will be stashed in the appropriate bufqueue[] allowing it
 1012:  *	to be accessed later as a cache entity or reused for other purposes.
 1013:  */
 1014: void
 1015: brelse(struct buf * bp)
 1016: {
 1017: 	int s;
 1018: 
 1019: 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1020: 
 1021: 	s = splbio();
 1022: 
 1023: 	if (bp->b_flags & B_LOCKED)
 1024: 		bp->b_flags &= ~B_ERROR;
 1025: 
 1026: 	if ((bp->b_flags & (B_READ | B_ERROR | B_INVAL)) == B_ERROR) {
 1027: 		/*
 1028: 		 * Failed write, redirty.  Must clear B_ERROR to prevent
 1029: 		 * pages from being scrapped.  If B_INVAL is set then
 1030: 		 * this case is not run and the next case is run to 
 1031: 		 * destroy the buffer.  B_INVAL can occur if the buffer
 1032: 		 * is outside the range supported by the underlying device.
 1033: 		 */
 1034: 		bp->b_flags &= ~B_ERROR;
 1035: 		bdirty(bp);
 1036: 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
 1037: 	    (bp->b_bufsize <= 0)) {
 1038: 		/*
 1039: 		 * Either a failed I/O or we were asked to free or not
 1040: 		 * cache the buffer.
 1041: 		 */
 1042: 		bp->b_flags |= B_INVAL;
 1043: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 1044: 			(*bioops.io_deallocate)(bp);
 1045: 		if (bp->b_flags & B_DELWRI) {
 1046: 			--numdirtybuffers;
 1047: 			numdirtywakeup(lodirtybuffers);
 1048: 		}
 1049: 		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
 1050: 		if ((bp->b_flags & B_VMIO) == 0) {
 1051: 			if (bp->b_bufsize)
 1052: 				allocbuf(bp, 0);
 1053: 			if (bp->b_vp)
 1054: 				brelvp(bp);
 1055: 		}
 1056: 	}
 1057: 
 1058: 	/*
 1059: 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release() 
 1060: 	 * is called with B_DELWRI set, the underlying pages may wind up
 1061: 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 1062: 	 * because pages associated with a B_DELWRI bp are marked clean.
 1063: 	 * 
 1064: 	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
 1065: 	 * if B_DELWRI is set.
 1066: 	 *
 1067: 	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
 1068: 	 * on pages to return pages to the VM page queues.
 1069: 	 */
 1070: 	if (bp->b_flags & B_DELWRI)
 1071: 		bp->b_flags &= ~B_RELBUF;
 1072: 	else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
 1073: 		bp->b_flags |= B_RELBUF;
 1074: 
 1075: 	/*
 1076: 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
  1077:  * constituted, not even NFS buffers now.  Two flags affect this.  If
 1078: 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 1079: 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 1080: 	 *
 1081: 	 * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
 1082: 	 * invalidated.  B_ERROR cannot be set for a failed write unless the
 1083: 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 1084: 	 *
 1085: 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 1086: 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 1087: 	 * the commit state and we cannot afford to lose the buffer. If the
 1088: 	 * buffer has a background write in progress, we need to keep it
 1089: 	 * around to prevent it from being reconstituted and starting a second
 1090: 	 * background write.
 1091: 	 */
 1092: 	if ((bp->b_flags & B_VMIO)
 1093: 	    && !(bp->b_vp->v_tag == VT_NFS &&
 1094: 		 !vn_isdisk(bp->b_vp, NULL) &&
 1095: 		 (bp->b_flags & B_DELWRI))
 1096: 	    ) {
 1097: 
 1098: 		int i, j, resid;
 1099: 		vm_page_t m;
 1100: 		off_t foff;
 1101: 		vm_pindex_t poff;
 1102: 		vm_object_t obj;
 1103: 		struct vnode *vp;
 1104: 
 1105: 		vp = bp->b_vp;
 1106: 
 1107: 		/*
 1108: 		 * Get the base offset and length of the buffer.  Note that 
 1109: 		 * in the VMIO case if the buffer block size is not
 1110: 		 * page-aligned then b_data pointer may not be page-aligned.
 1111: 		 * But our b_pages[] array *IS* page aligned.
 1112: 		 *
  1113: 		 * block sizes less than DEV_BSIZE (usually 512) are not
 1114: 		 * supported due to the page granularity bits (m->valid,
 1115: 		 * m->dirty, etc...). 
 1116: 		 *
 1117: 		 * See man buf(9) for more information
 1118: 		 */
 1119: 
 1120: 		resid = bp->b_bufsize;
 1121: 		foff = bp->b_offset;
 1122: 
 1123: 		for (i = 0; i < bp->b_npages; i++) {
 1124: 			m = bp->b_pages[i];
 1125: 			vm_page_flag_clear(m, PG_ZERO);
 1126: 			/*
 1127: 			 * If we hit a bogus page, fixup *all* of them
 1128: 			 * now.  Note that we left these pages wired
 1129: 			 * when we removed them so they had better exist,
 1130: 			 * and they cannot be ripped out from under us so
 1131: 			 * no splvm() protection is necessary.
 1132: 			 */
 1133: 			if (m == bogus_page) {
 1134: 				VOP_GETVOBJECT(vp, &obj);
 1135: 				poff = OFF_TO_IDX(bp->b_offset);
 1136: 
 1137: 				for (j = i; j < bp->b_npages; j++) {
 1138: 					vm_page_t mtmp;
 1139: 
 1140: 					mtmp = bp->b_pages[j];
 1141: 					if (mtmp == bogus_page) {
 1142: 						mtmp = vm_page_lookup(obj, poff + j);
 1143: 						if (!mtmp) {
 1144: 							panic("brelse: page missing");
 1145: 						}
 1146: 						bp->b_pages[j] = mtmp;
 1147: 					}
 1148: 				}
 1149: 
 1150: 				if ((bp->b_flags & B_INVAL) == 0) {
 1151: 					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 1152: 				}
 1153: 				m = bp->b_pages[i];
 1154: 			}
 1155: 
 1156: 			/*
 1157: 			 * Invalidate the backing store if B_NOCACHE is set
 1158: 			 * (e.g. used with vinvalbuf()).  If this is NFS
 1159: 			 * we impose a requirement that the block size be
 1160: 			 * a multiple of PAGE_SIZE and create a temporary
 1161: 			 * hack to basically invalidate the whole page.  The
 1162: 			 * problem is that NFS uses really odd buffer sizes
 1163: 			 * especially when tracking piecemeal writes and
 1164: 			 * it also vinvalbuf()'s a lot, which would result
 1165: 			 * in only partial page validation and invalidation
 1166: 			 * here.  If the file page is mmap()'d, however,
 1167: 			 * all the valid bits get set so after we invalidate
 1168: 			 * here we would end up with weird m->valid values
 1169: 			 * like 0xfc.  nfs_getpages() can't handle this so
 1170: 			 * we clear all the valid bits for the NFS case
 1171: 			 * instead of just some of them.
 1172: 			 *
 1173: 			 * The real bug is the VM system having to set m->valid
 1174: 			 * to VM_PAGE_BITS_ALL for faulted-in pages, which
 1175: 			 * itself is an artifact of the whole 512-byte
 1176: 			 * granular mess that exists to support odd block 
 1177: 			 * sizes and UFS meta-data block sizes (e.g. 6144).
 1178: 			 * A complete rewrite is required.
 1179: 			 */
 1180: 			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
 1181: 				int poffset = foff & PAGE_MASK;
 1182: 				int presid;
 1183: 
 1184: 				presid = PAGE_SIZE - poffset;
 1185: 				if (bp->b_vp->v_tag == VT_NFS &&
 1186: 				    bp->b_vp->v_type == VREG) {
 1187: 					; /* entire page */
 1188: 				} else if (presid > resid) {
 1189: 					presid = resid;
 1190: 				}
 1191: 				KASSERT(presid >= 0, ("brelse: extra page"));
 1192: 				vm_page_set_invalid(m, poffset, presid);
 1193: 			}
 1194: 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
 1195: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 1196: 		}
 1197: 
 1198: 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 1199: 			vfs_vmio_release(bp);
 1200: 
 1201: 	} else if (bp->b_flags & B_VMIO) {
 1202: 
 1203: 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 1204: 			vfs_vmio_release(bp);
 1205: 
 1206: 	}
 1207: 			
 1208: 	if (bp->b_qindex != QUEUE_NONE)
 1209: 		panic("brelse: free buffer onto another queue???");
 1210: 	if (BUF_REFCNT(bp) > 1) {
 1211: 		/* Temporary panic to verify exclusive locking */
 1212: 		/* This panic goes away when we allow shared refs */
 1213: 		panic("brelse: multiple refs");
 1214: 		/* do not release to free list */
 1215: 		BUF_UNLOCK(bp);
 1216: 		splx(s);
 1217: 		return;
 1218: 	}
 1219: 
 1220: 	/* enqueue */
 1221: 
 1222: 	/* buffers with no memory */
 1223: 	if (bp->b_bufsize == 0) {
 1224: 		bp->b_flags |= B_INVAL;
 1225: 		bp->b_xflags &= ~BX_BKGRDWRITE;
 1226: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1227: 			panic("losing buffer 1");
 1228: 		if (bp->b_kvasize) {
 1229: 			bp->b_qindex = QUEUE_EMPTYKVA;
 1230: 		} else {
 1231: 			bp->b_qindex = QUEUE_EMPTY;
 1232: 		}
 1233: 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 1234: 		LIST_REMOVE(bp, b_hash);
 1235: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1236: 		bp->b_dev = NODEV;
 1237: 	/* buffers with junk contents */
 1238: 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
 1239: 		bp->b_flags |= B_INVAL;
 1240: 		bp->b_xflags &= ~BX_BKGRDWRITE;
 1241: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1242: 			panic("losing buffer 2");
 1243: 		bp->b_qindex = QUEUE_CLEAN;
 1244: 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1245: 		LIST_REMOVE(bp, b_hash);
 1246: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1247: 		bp->b_dev = NODEV;
 1248: 
 1249: 	/* buffers that are locked */
 1250: 	} else if (bp->b_flags & B_LOCKED) {
 1251: 		bp->b_qindex = QUEUE_LOCKED;
 1252: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 1253: 
 1254: 	/* remaining buffers */
 1255: 	} else {
 1256: 		switch(bp->b_flags & (B_DELWRI|B_AGE)) {
 1257: 		case B_DELWRI | B_AGE:
 1258: 		    bp->b_qindex = QUEUE_DIRTY;
 1259: 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1260: 		    break;
 1261: 		case B_DELWRI:
 1262: 		    bp->b_qindex = QUEUE_DIRTY;
 1263: 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1264: 		    break;
 1265: 		case B_AGE:
 1266: 		    bp->b_qindex = QUEUE_CLEAN;
 1267: 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1268: 		    break;
 1269: 		default:
 1270: 		    bp->b_qindex = QUEUE_CLEAN;
 1271: 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1272: 		    break;
 1273: 		}
 1274: 	}
 1275: 
 1276: 	/*
 1277: 	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
 1278: 	 * on the correct queue.
 1279: 	 */
 1280: 	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
 1281: 		bundirty(bp);
 1282: 
 1283: 	/*
 1284: 	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
 1285: 	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
 1286: 	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
 1287: 	 * if B_INVAL is set ).
 1288: 	 */
 1289: 
 1290: 	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
 1291: 		bufcountwakeup();
 1292: 
 1293: 	/*
 1294: 	 * Something we can maybe free or reuse
 1295: 	 */
 1296: 	if (bp->b_bufsize || bp->b_kvasize)
 1297: 		bufspacewakeup();
 1298: 
 1299: 	/* unlock */
 1300: 	BUF_UNLOCK(bp);
 1301: 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
 1302: 			B_DIRECT | B_NOWDRAIN);
 1303: 	splx(s);
 1304: }
 1305: 
 1306: /*
 1307:  * Release a buffer back to the appropriate queue but do not try to free
 1308:  * it.  The buffer is expected to be used again soon.
 1309:  *
 1310:  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 1311:  * biodone() to requeue an async I/O on completion.  It is also used when
 1312:  * known good buffers need to be requeued but we think we may need the data
 1313:  * again soon.
 1314:  *
 1315:  * XXX we should be able to leave the B_RELBUF hint set on completion.
 1316:  */
 1317: void
 1318: bqrelse(struct buf * bp)
 1319: {
 1320: 	int s;
 1321: 
 1322: 	s = splbio();
 1323: 
 1324: 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1325: 
 1326: 	if (bp->b_qindex != QUEUE_NONE)
 1327: 		panic("bqrelse: free buffer onto another queue???");
 1328: 	if (BUF_REFCNT(bp) > 1) {
 1329: 		/* do not release to free list */
 1330: 		panic("bqrelse: multiple refs");
 1331: 		BUF_UNLOCK(bp);
 1332: 		splx(s);
 1333: 		return;
 1334: 	}
 1335: 	if (bp->b_flags & B_LOCKED) {
 1336: 		bp->b_flags &= ~B_ERROR;
 1337: 		bp->b_qindex = QUEUE_LOCKED;
 1338: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 1339: 		/* buffers with stale but valid contents */
 1340: 	} else if (bp->b_flags & B_DELWRI) {
 1341: 		bp->b_qindex = QUEUE_DIRTY;
 1342: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1343: 	} else if (vm_page_count_severe()) {
 1344: 		/*
 1345: 		 * We are too low on memory, we have to try to free the
 1346: 		 * buffer (most importantly: the wired pages making up its
 1347: 		 * backing store) *now*.
 1348: 		 */
 1349: 		splx(s);
 1350: 		brelse(bp);
 1351: 		return;
 1352: 	} else {
 1353: 		bp->b_qindex = QUEUE_CLEAN;
 1354: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1355: 	}
 1356: 
 1357: 	if ((bp->b_flags & B_LOCKED) == 0 &&
 1358: 	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
 1359: 		bufcountwakeup();
 1360: 	}
 1361: 
 1362: 	/*
 1363: 	 * Something we can maybe free or reuse.
 1364: 	 */
 1365: 	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
 1366: 		bufspacewakeup();
 1367: 
 1368: 	/* unlock */
 1369: 	BUF_UNLOCK(bp);
 1370: 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 1371: 	splx(s);
 1372: }
 1373: 
 1374: static void
 1375: vfs_vmio_release(bp)
 1376: 	struct buf *bp;
 1377: {
 1378: 	int i, s;
 1379: 	vm_page_t m;
 1380: 
 1381: 	s = splvm();
 1382: 	for (i = 0; i < bp->b_npages; i++) {
 1383: 		m = bp->b_pages[i];
 1384: 		bp->b_pages[i] = NULL;
 1385: 		/*
 1386: 		 * In order to keep page LRU ordering consistent, put
 1387: 		 * everything on the inactive queue.
 1388: 		 */
 1389: 		vm_page_unwire(m, 0);
 1390: 		/*
 1391: 		 * We don't mess with busy pages, it is
 1392: 		 * the responsibility of the process that
 1393: 		 * busied the pages to deal with them.
 1394: 		 */
 1395: 		if ((m->flags & PG_BUSY) || (m->busy != 0))
 1396: 			continue;
 1397: 			
 1398: 		if (m->wire_count == 0) {
 1399: 			vm_page_flag_clear(m, PG_ZERO);
 1400: 			/*
 1401: 			 * Might as well free the page if we can and it has
 1402: 			 * no valid data.  We also free the page if the
 1403: 			 * buffer was used for direct I/O.
 1404: 			 */
 1405: 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
 1406: 				vm_page_busy(m);
 1407: 				vm_page_protect(m, VM_PROT_NONE);
 1408: 				vm_page_free(m);
 1409: 			} else if (bp->b_flags & B_DIRECT) {
 1410: 				vm_page_try_to_free(m);
 1411: 			} else if (vm_page_count_severe()) {
 1412: 				vm_page_try_to_cache(m);
 1413: 			}
 1414: 		}
 1415: 	}
 1416: 	splx(s);
 1417: 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
 1418: 	if (bp->b_bufsize) {
 1419: 		bufspacewakeup();
 1420: 		bp->b_bufsize = 0;
 1421: 	}
 1422: 	bp->b_npages = 0;
 1423: 	bp->b_flags &= ~B_VMIO;
 1424: 	if (bp->b_vp)
 1425: 		brelvp(bp);
 1426: }
 1427: 
 1428: /*
 1429:  * Check to see if a block is currently memory resident.
 1430:  */
 1431: struct buf *
 1432: gbincore(struct vnode * vp, daddr_t blkno)
 1433: {
 1434: 	struct buf *bp;
 1435: 	struct bufhashhdr *bh;
 1436: 
 1437: 	bh = bufhash(vp, blkno);
 1438: 
 1439: 	/* Search hash chain */
 1440: 	LIST_FOREACH(bp, bh, b_hash) {
 1441: 		/* hit */
 1442: 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
 1443: 		    (bp->b_flags & B_INVAL) == 0) {
 1444: 			break;
 1445: 		}
 1446: 	}
 1447: 	return (bp);
 1448: }
 1449: 
 1450: /*
 1451:  *	vfs_bio_awrite:
 1452:  *
 1453:  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  1454:  *	This is much better than the old way of writing only one buffer at
 1455:  *	a time.  Note that we may not be presented with the buffers in the 
 1456:  *	correct order, so we search for the cluster in both directions.
 1457:  */
 1458: int
 1459: vfs_bio_awrite(struct buf * bp)
 1460: {
 1461: 	int i;
 1462: 	int j;
 1463: 	daddr_t lblkno = bp->b_lblkno;
 1464: 	struct vnode *vp = bp->b_vp;
 1465: 	int s;
 1466: 	int ncl;
 1467: 	struct buf *bpa;
 1468: 	int nwritten;
 1469: 	int size;
 1470: 	int maxcl;
 1471: 
 1472: 	s = splbio();
 1473: 	/*
 1474: 	 * right now we support clustered writing only to regular files.  If
 1475: 	 * we find a clusterable block we could be in the middle of a cluster
  1476: 	 * rather than at the beginning.
 1477: 	 */
 1478: 	if ((vp->v_type == VREG) && 
 1479: 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 1480: 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 1481: 
 1482: 		size = vp->v_mount->mnt_stat.f_iosize;
 1483: 		maxcl = MAXPHYS / size;
 1484: 
 1485: 		for (i = 1; i < maxcl; i++) {
 1486: 			if ((bpa = gbincore(vp, lblkno + i)) &&
 1487: 			    BUF_REFCNT(bpa) == 0 &&
 1488: 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 1489: 			    (B_DELWRI | B_CLUSTEROK)) &&
 1490: 			    (bpa->b_bufsize == size)) {
 1491: 				if ((bpa->b_blkno == bpa->b_lblkno) ||
 1492: 				    (bpa->b_blkno !=
 1493: 				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
 1494: 					break;
 1495: 			} else {
 1496: 				break;
 1497: 			}
 1498: 		}
 1499: 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
 1500: 			if ((bpa = gbincore(vp, lblkno - j)) &&
 1501: 			    BUF_REFCNT(bpa) == 0 &&
 1502: 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 1503: 			    (B_DELWRI | B_CLUSTEROK)) &&
 1504: 			    (bpa->b_bufsize == size)) {
 1505: 				if ((bpa->b_blkno == bpa->b_lblkno) ||
 1506: 				    (bpa->b_blkno !=
 1507: 				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
 1508: 					break;
 1509: 			} else {
 1510: 				break;
 1511: 			}
 1512: 		}
 1513: 		--j;
 1514: 		ncl = i + j;
 1515: 		/*
 1516: 		 * this is a possible cluster write
 1517: 		 */
 1518: 		if (ncl != 1) {
 1519: 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
 1520: 			splx(s);
 1521: 			return nwritten;
 1522: 		}
 1523: 	}
 1524: 
 1525: 	BUF_LOCK(bp, LK_EXCLUSIVE);
 1526: 	bremfree(bp);
 1527: 	bp->b_flags |= B_ASYNC;
 1528: 
 1529: 	splx(s);
 1530: 	/*
 1531: 	 * default (old) behavior, writing out only one block
 1532: 	 *
 1533: 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 1534: 	 */
 1535: 	nwritten = bp->b_bufsize;
 1536: 	(void) VOP_BWRITE(bp->b_vp, bp);
 1537: 
 1538: 	return nwritten;
 1539: }
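
/*
 * A worked example of the clustering arithmetic above (illustrative
 * only): assuming an 8192 byte f_iosize, the usual 128KB MAXPHYS and
 * 512 byte sectors (DEV_BSHIFT == 9), maxcl is 131072 / 8192 == 16,
 * so at most 16 logical blocks are examined.  A neighbor at
 * lblkno + i is accepted only while its b_blkno equals
 * bp->b_blkno + i * (8192 >> 9), i.e. while the blocks remain
 * physically contiguous on the device.
 */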
 1540: 
 1541: /*
 1542:  *	getnewbuf:
 1543:  *
 1544:  *	Find and initialize a new buffer header, freeing up existing buffers 
 1545:  *	in the bufqueues as necessary.  The new buffer is returned locked.
 1546:  *
 1547:  *	Important:  B_INVAL is not set.  If the caller wishes to throw the
 1548:  *	buffer away, the caller must set B_INVAL prior to calling brelse().
 1549:  *
 1550:  *	We block if:
 1551:  *		We have insufficient buffer headers
 1552:  *		We have insufficient buffer space
 1553:  *		buffer_map is too fragmented ( space reservation fails )
 1554:  *		If we have to flush dirty buffers ( but we try to avoid this )
 1555:  *
 1556:  *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 1557:  *	Instead we ask the buf daemon to do it for us.  We attempt to
 1558:  *	avoid piecemeal wakeups of the pageout daemon.
 1559:  */
 1560: 
 1561: static struct buf *
 1562: getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
 1563: {
 1564: 	struct buf *bp;
 1565: 	struct buf *nbp;
 1566: 	int defrag = 0;
 1567: 	int nqindex;
 1568: 	static int flushingbufs;
 1569: 
 1570: 	/*
 1571: 	 * We can't afford to block since we might be holding a vnode lock,
 1572: 	 * which may prevent system daemons from running.  We deal with
 1573: 	 * low-memory situations by proactively returning memory and running
 1574: 	 * async I/O rather than sync I/O.
 1575: 	 */
 1576: 	
 1577: 	++getnewbufcalls;
 1578: 	--getnewbufrestarts;
 1579: restart:
 1580: 	++getnewbufrestarts;
 1581: 
 1582: 	/*
 1583: 	 * Set up for the scan.  If we do not have enough free buffers,
 1584: 	 * we set up a degenerate case that immediately fails.  Note
 1585: 	 * that if we are a specially marked process, we are allowed to
 1586: 	 * dip into our reserves.
 1587: 	 *
 1588: 	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
 1589: 	 *
 1590: 	 * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
 1591: 	 * However, there are a number of cases (defragging, reusing, ...)
 1592: 	 * where we cannot back up.
 1593: 	 */
 1594: 	nqindex = QUEUE_EMPTYKVA;
 1595: 	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
 1596: 
 1597: 	if (nbp == NULL) {
 1598: 		/*
 1599: 		 * If no EMPTYKVA buffers and we are either
 1600: 		 * defragging or reusing, locate a CLEAN buffer
 1601: 		 * to free or reuse.  If bufspace usage is low
 1602: 		 * skip this step so we can allocate a new buffer.
 1603: 		 */
 1604: 		if (defrag || bufspace >= lobufspace) {
 1605: 			nqindex = QUEUE_CLEAN;
 1606: 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 1607: 		}
 1608: 
 1609: 		/*
 1610: 		 * If we could not find or were not allowed to reuse a
 1611: 		 * CLEAN buffer, check to see if it is ok to use an EMPTY
 1612: 		 * buffer.  We can only use an EMPTY buffer if allocating
 1613: 		 * its KVA would not otherwise run us out of buffer space.
 1614: 		 */
 1615: 		if (nbp == NULL && defrag == 0 &&
 1616: 		    bufspace + maxsize < hibufspace) {
 1617: 			nqindex = QUEUE_EMPTY;
 1618: 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
 1619: 		}
 1620: 	}
 1621: 
 1622: 	/*
 1623: 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 1624: 	 * depending.
 1625: 	 */
 1626: 
 1627: 	while ((bp = nbp) != NULL) {
 1628: 		int qindex = nqindex;
 1629: 
 1630: 		/*
 1631: 		 * Calculate next bp ( we can only use it if we do not block
 1632: 		 * or do other fancy things ).
 1633: 		 */
 1634: 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
 1635: 			switch(qindex) {
 1636: 			case QUEUE_EMPTY:
 1637: 				nqindex = QUEUE_EMPTYKVA;
 1638: 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
 1639: 					break;
 1640: 				/* fall through */
 1641: 			case QUEUE_EMPTYKVA:
 1642: 				nqindex = QUEUE_CLEAN;
 1643: 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
 1644: 					break;
 1645: 				/* fall through */
 1646: 			case QUEUE_CLEAN:
 1647: 				/*
 1648: 				 * nbp is NULL. 
 1649: 				 */
 1650: 				break;
 1651: 			}
 1652: 		}
 1653: 
 1654: 		/*
 1655: 		 * Sanity Checks
 1656: 		 */
 1657: 		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 1658: 
 1659: 		/*
 1660: 		 * Note: we no longer distinguish between VMIO and non-VMIO
 1661: 		 * buffers.
 1662: 		 */
 1663: 
 1664: 		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
 1665: 
 1666: 		/*
 1667: 		 * If we are defragging then we need a buffer with 
 1668: 		 * b_kvasize != 0.  XXX this situation should no longer
 1669: 		 * occur, if defrag is non-zero the buffer's b_kvasize
 1670: 		 * should also be non-zero at this point.  XXX
 1671: 		 */
 1672: 		if (defrag && bp->b_kvasize == 0) {
 1673: 			printf("Warning: defrag empty buffer %p\n", bp);
 1674: 			continue;
 1675: 		}
 1676: 
 1677: 		/*
 1678: 		 * Start freeing the bp.  This is somewhat involved.  nbp
 1679: 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
 1680: 		 */
 1681: 
 1682: 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 1683: 			panic("getnewbuf: locked buf");
 1684: 		bremfree(bp);
 1685: 
 1686: 		if (qindex == QUEUE_CLEAN) {
 1687: 			if (bp->b_flags & B_VMIO) {
 1688: 				bp->b_flags &= ~B_ASYNC;
 1689: 				vfs_vmio_release(bp);
 1690: 			}
 1691: 			if (bp->b_vp)
 1692: 				brelvp(bp);
 1693: 		}
 1694: 
 1695: 		/*
 1696: 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 1697: 		 * the scan from this point on.
 1698: 		 *
 1699: 		 * Get the rest of the buffer freed up.  b_kva* is still
 1700: 		 * valid after this operation.
 1701: 		 */
 1702: 
 1703: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 1704: 			(*bioops.io_deallocate)(bp);
 1705: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1706: 			panic("losing buffer 3");
 1707: 		LIST_REMOVE(bp, b_hash);
 1708: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1709: 
 1710: 		/*
 1711: 		 * spl protection not required when scrapping a buffer's
 1712: 		 * contents because it is already wired.
 1713: 		 */
 1714: 		if (bp->b_bufsize)
 1715: 			allocbuf(bp, 0);
 1716: 
 1717: 		bp->b_flags = 0;
 1718: 		bp->b_xflags = 0;
 1719: 		bp->b_dev = NODEV;
 1720: 		bp->b_vp = NULL;
 1721: 		bp->b_blkno = bp->b_lblkno = 0;
 1722: 		bp->b_offset = NOOFFSET;
 1723: 		bp->b_iodone = 0;
 1724: 		bp->b_error = 0;
 1725: 		bp->b_resid = 0;
 1726: 		bp->b_bcount = 0;
 1727: 		bp->b_npages = 0;
 1728: 		bp->b_dirtyoff = bp->b_dirtyend = 0;
 1729: 
 1730: 		LIST_INIT(&bp->b_dep);
 1731: 
 1732: 		/*
 1733: 		 * If we are defragging then free the buffer.
 1734: 		 */
 1735: 		if (defrag) {
 1736: 			bp->b_flags |= B_INVAL;
 1737: 			bfreekva(bp);
 1738: 			brelse(bp);
 1739: 			defrag = 0;
 1740: 			goto restart;
 1741: 		}
 1742: 
 1743: 		/*
 1744: 		 * If we are overcommitted then recover the buffer and its
 1745: 		 * KVM space.  This occurs in rare situations when multiple
 1746: 		 * processes are blocked in getnewbuf() or allocbuf().
 1747: 		 */
 1748: 		if (bufspace >= hibufspace)
 1749: 			flushingbufs = 1;
 1750: 		if (flushingbufs && bp->b_kvasize != 0) {
 1751: 			bp->b_flags |= B_INVAL;
 1752: 			bfreekva(bp);
 1753: 			brelse(bp);
 1754: 			goto restart;
 1755: 		}
 1756: 		if (bufspace < lobufspace)
 1757: 			flushingbufs = 0;
 1758: 		break;
 1759: 	}
 1760: 
 1761: 	/*
 1762: 	 * If we exhausted our list, sleep as appropriate.  We may have to
 1763: 	 * wakeup various daemons and write out some dirty buffers.
 1764: 	 *
 1765: 	 * Generally we are sleeping due to insufficient buffer space.
 1766: 	 */
 1767: 
 1768: 	if (bp == NULL) {
 1769: 		int flags;
 1770: 		char *waitmsg;
 1771: 
 1772: 		if (defrag) {
 1773: 			flags = VFS_BIO_NEED_BUFSPACE;
 1774: 			waitmsg = "nbufkv";
 1775: 		} else if (bufspace >= hibufspace) {
 1776: 			waitmsg = "nbufbs";
 1777: 			flags = VFS_BIO_NEED_BUFSPACE;
 1778: 		} else {
 1779: 			waitmsg = "newbuf";
 1780: 			flags = VFS_BIO_NEED_ANY;
 1781: 		}
 1782: 
 1783: 		bd_speedup();	/* heeeelp */
 1784: 
 1785: 		needsbuffer |= flags;
 1786: 		while (needsbuffer & flags) {
 1787: 			if (tsleep(&needsbuffer, slpflag, waitmsg, slptimeo))
 1788: 				return (NULL);
 1789: 		}
 1790: 	} else {
 1791: 		/*
 1792: 		 * We finally have a valid bp.  We aren't quite out of the
 1793: 		 * woods, we still have to reserve kva space.  In order
 1794: 		 * to keep fragmentation sane we only allocate kva in
 1795: 		 * BKVASIZE chunks.
 1796: 		 */
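		/*
		 * For example, assuming the usual 16KB BKVASIZE, a 9000
		 * byte maxsize is rounded up to a full 16384 byte KVA
		 * reservation below.
		 */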
 1797: 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 1798: 
 1799: 		if (maxsize != bp->b_kvasize) {
 1800: 			vm_offset_t addr = 0;
 1801: 			int count;
 1802: 
 1803: 			bfreekva(bp);
 1804: 
 1805: 			count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 1806: 			vm_map_lock(buffer_map);
 1807: 
 1808: 			if (vm_map_findspace(buffer_map,
 1809: 				    vm_map_min(buffer_map), maxsize,
 1810: 				    maxsize, &addr)) {
 1811: 				/*
 1812: 				 * Uh oh.  Buffer map is too fragmented.  We
 1813: 				 * must defragment the map.
 1814: 				 */
 1815: 				vm_map_unlock(buffer_map);
 1816: 				vm_map_entry_release(count);
 1817: 				++bufdefragcnt;
 1818: 				defrag = 1;
 1819: 				bp->b_flags |= B_INVAL;
 1820: 				brelse(bp);
 1821: 				goto restart;
 1822: 			}
 1823: 			if (addr) {
 1824: 				vm_map_insert(buffer_map, &count,
 1825: 					NULL, 0,
 1826: 					addr, addr + maxsize,
 1827: 					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 1828: 
 1829: 				bp->b_kvabase = (caddr_t) addr;
 1830: 				bp->b_kvasize = maxsize;
 1831: 				bufspace += bp->b_kvasize;
 1832: 				++bufreusecnt;
 1833: 			}
 1834: 			vm_map_unlock(buffer_map);
 1835: 			vm_map_entry_release(count);
 1836: 		}
 1837: 		bp->b_data = bp->b_kvabase;
 1838: 	}
 1839: 	return(bp);
 1840: }
 1841: 
 1842: /*
 1843:  *	buf_daemon:
 1844:  *
 1845:  *	buffer flushing daemon.  Buffers are normally flushed by the
 1846:  *	update daemon but if it cannot keep up this process starts to
 1847:  *	take the load in an attempt to prevent getnewbuf() from blocking.
 1848:  */
 1849: 
 1850: static struct thread *bufdaemonthread;
 1851: 
 1852: static struct kproc_desc buf_kp = {
 1853: 	"bufdaemon",
 1854: 	buf_daemon,
 1855: 	&bufdaemonthread
 1856: };
 1857: SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
 1858: 
 1859: static void
 1860: buf_daemon()
 1861: {
 1862: 	int s;
 1863: 
 1864: 	/*
 1865: 	 * This process needs to be suspended prior to shutdown sync.
 1866: 	 */
 1867: 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
 1868: 	    bufdaemonthread, SHUTDOWN_PRI_LAST);
 1869: 
 1870: 	/*
 1871: 	 * This process is allowed to take the buffer cache to the limit
 1872: 	 */
 1873: 	s = splbio();
 1874: 
 1875: 	for (;;) {
 1876: 		kproc_suspend_loop();
 1877: 
 1878: 		/*
 1879: 		 * Do the flush.  Limit the amount of in-transit I/O we
 1880: 		 * allow to build up, otherwise we would completely saturate
 1881: 		 * the I/O system.  Wakeup any waiting processes before we
 1882: 		 * normally would so they can run in parallel with our drain.
 1883: 		 */
 1884: 		while (numdirtybuffers > lodirtybuffers) {
 1885: 			if (flushbufqueues() == 0)
 1886: 				break;
 1887: 			waitrunningbufspace();
 1888: 			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 1889: 		}
 1890: 
 1891: 		/*
 1892: 		 * Only clear bd_request if we have reached our low water
 1893: 		 * mark.  The buf_daemon normally waits 5 seconds and
 1894: 		 * then incrementally flushes any dirty buffers that have
 1895: 		 * built up, within reason.
 1896: 		 *
 1897: 		 * If we were unable to hit our low water mark and couldn't
 1898: 		 * find any flushable buffers, we sleep half a second. 
 1899: 		 * Otherwise we loop immediately.
 1900: 		 */
 1901: 		if (numdirtybuffers <= lodirtybuffers) {
 1902: 			/*
 1903: 			 * We reached our low water mark, reset the
 1904: 			 * request and sleep until we are needed again.
 1905: 			 * The sleep is just so the suspend code works.
 1906: 			 */
 1907: 			bd_request = 0;
 1908: 			tsleep(&bd_request, 0, "psleep", hz);
 1909: 		} else {
 1910: 			/*
 1911: 			 * We couldn't find any flushable dirty buffers but
 1912: 			 * still have too many dirty buffers, we
 1913: 			 * have to sleep and try again.  (rare)
 1914: 			 */
 1915: 			tsleep(&bd_request, 0, "qsleep", hz / 2);
 1916: 		}
 1917: 	}
 1918: }
 1919: 
 1920: /*
 1921:  *	flushbufqueues:
 1922:  *
 1923:  *	Try to flush a buffer in the dirty queue.  We must be careful to
 1924:  *	free up B_INVAL buffers instead of writing them, which NFS is
 1925:  *	particularly sensitive to.
 1926:  */
 1927: 
 1928: static int
 1929: flushbufqueues(void)
 1930: {
 1931: 	struct buf *bp;
 1932: 	int r = 0;
 1933: 
 1934: 	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 1935: 
 1936: 	while (bp) {
 1937: 		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
 1938: 		if ((bp->b_flags & B_DELWRI) != 0 &&
 1939: 		    (bp->b_xflags & BX_BKGRDINPROG) == 0) {
 1940: 			if (bp->b_flags & B_INVAL) {
 1941: 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 1942: 					panic("flushbufqueues: locked buf");
 1943: 				bremfree(bp);
 1944: 				brelse(bp);
 1945: 				++r;
 1946: 				break;
 1947: 			}
 1948: 			if (LIST_FIRST(&bp->b_dep) != NULL &&
 1949: 			    bioops.io_countdeps &&
 1950: 			    (bp->b_flags & B_DEFERRED) == 0 &&
 1951: 			    (*bioops.io_countdeps)(bp, 0)) {
 1952: 				TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
 1953: 				    bp, b_freelist);
 1954: 				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
 1955: 				    bp, b_freelist);
 1956: 				bp->b_flags |= B_DEFERRED;
 1957: 				bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 1958: 				continue;
 1959: 			}
 1960: 			vfs_bio_awrite(bp);
 1961: 			++r;
 1962: 			break;
 1963: 		}
 1964: 		bp = TAILQ_NEXT(bp, b_freelist);
 1965: 	}
 1966: 	return (r);
 1967: }
 1968: 
 1969: /*
 1970:  * Check to see if a block is currently memory resident.
 1971:  */
 1972: struct buf *
 1973: incore(struct vnode * vp, daddr_t blkno)
 1974: {
 1975: 	struct buf *bp;
 1976: 
 1977: 	int s = splbio();
 1978: 	bp = gbincore(vp, blkno);
 1979: 	splx(s);
 1980: 	return (bp);
 1981: }
 1982: 
 1983: /*
 1984:  * Returns true if no I/O is needed to access the associated VM object.
 1985:  * This is like incore except it also hunts around in the VM system for
 1986:  * the data.
 1987:  *
 1988:  * Note that we ignore vm_page_free() races from interrupts against our
 1989:  * lookup, since if the caller is not protected our return value will not
 1990:  * be any more valid than otherwise once we splx().
 1991:  */
 1992: int
 1993: inmem(struct vnode * vp, daddr_t blkno)
 1994: {
 1995: 	vm_object_t obj;
 1996: 	vm_offset_t toff, tinc, size;
 1997: 	vm_page_t m;
 1998: 	vm_ooffset_t off;
 1999: 
 2000: 	if (incore(vp, blkno))
 2001: 		return 1;
 2002: 	if (vp->v_mount == NULL)
 2003: 		return 0;
 2004: 	if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
 2005:  		return 0;
 2006: 
 2007: 	size = PAGE_SIZE;
 2008: 	if (size > vp->v_mount->mnt_stat.f_iosize)
 2009: 		size = vp->v_mount->mnt_stat.f_iosize;
 2010: 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 2011: 
 2012: 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 2013: 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 2014: 		if (!m)
 2015: 			return 0;
 2016: 		tinc = size;
 2017: 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
 2018: 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 2019: 		if (vm_page_is_valid(m,
 2020: 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
 2021: 			return 0;
 2022: 	}
 2023: 	return 1;
 2024: }
 2025: 
 2026: /*
 2027:  *	vfs_setdirty:
 2028:  *
 2029:  *	Sets the dirty range for a buffer based on the status of the dirty
 2030:  *	bits in the pages comprising the buffer.
 2031:  *
 2032:  *	The range is limited to the size of the buffer.
 2033:  *
 2034:  *	This routine is primarily used by NFS, but is generalized for the
 2035:  *	B_VMIO case.
 2036:  */
 2037: static void
 2038: vfs_setdirty(struct buf *bp) 
 2039: {
 2040: 	int i;
 2041: 	vm_object_t object;
 2042: 
 2043: 	/*
 2044: 	 * Degenerate case - empty buffer
 2045: 	 */
 2046: 
 2047: 	if (bp->b_bufsize == 0)
 2048: 		return;
 2049: 
 2050: 	/*
 2051: 	 * We qualify the scan for modified pages on whether the
 2052: 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
 2053: 	 * is not cleared simply by protecting pages off.
 2054: 	 */
 2055: 
 2056: 	if ((bp->b_flags & B_VMIO) == 0)
 2057: 		return;
 2058: 
 2059: 	object = bp->b_pages[0]->object;
 2060: 
 2061: 	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
 2062: 		printf("Warning: object %p writeable but not mightbedirty\n", object);
 2063: 	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
 2064: 		printf("Warning: object %p mightbedirty but not writeable\n", object);
 2065: 
 2066: 	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
 2067: 		vm_offset_t boffset;
 2068: 		vm_offset_t eoffset;
 2069: 
 2070: 		/*
 2071: 		 * test the pages to see if they have been modified directly
 2072: 		 * by users through the VM system.
 2073: 		 */
 2074: 		for (i = 0; i < bp->b_npages; i++) {
 2075: 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
 2076: 			vm_page_test_dirty(bp->b_pages[i]);
 2077: 		}
 2078: 
 2079: 		/*
 2080: 		 * Calculate the encompassing dirty range, boffset and eoffset,
 2081: 		 * (eoffset - boffset) bytes.
 2082: 		 */
 2083: 
 2084: 		for (i = 0; i < bp->b_npages; i++) {
 2085: 			if (bp->b_pages[i]->dirty)
 2086: 				break;
 2087: 		}
 2088: 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 2089: 
 2090: 		for (i = bp->b_npages - 1; i >= 0; --i) {
 2091: 			if (bp->b_pages[i]->dirty) {
 2092: 				break;
 2093: 			}
 2094: 		}
 2095: 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 2096: 
 2097: 		/*
 2098: 		 * Fit it to the buffer.
 2099: 		 */
 2100: 
 2101: 		if (eoffset > bp->b_bcount)
 2102: 			eoffset = bp->b_bcount;
 2103: 
 2104: 		/*
 2105: 		 * If we have a good dirty range, merge with the existing
 2106: 		 * dirty range.
 2107: 		 */
 2108: 
 2109: 		if (boffset < eoffset) {
 2110: 			if (bp->b_dirtyoff > boffset)
 2111: 				bp->b_dirtyoff = boffset;
 2112: 			if (bp->b_dirtyend < eoffset)
 2113: 				bp->b_dirtyend = eoffset;
 2114: 		}
 2115: 	}
 2116: }
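
/*
 * Worked example of the range calculation above (sketch, assuming 4KB
 * pages): for a two page, 8192 byte buffer whose b_offset is page
 * aligned and whose second page alone is dirty, the forward scan gives
 * boffset = 4096 and the backward scan gives eoffset = 8192, so the
 * merged dirty range covers exactly the second page.
 */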
 2117: 
 2118: /*
 2119:  *	getblk:
 2120:  *
 2121:  *	Get a block given a specified block and offset into a file/device.
 2122:  *	The buffer's B_DONE bit will be cleared on return, making it almost
 2123:  * 	ready for an I/O initiation.  B_INVAL may or may not be set on 
 2124:  *	return.  The caller should clear B_INVAL prior to initiating a
 2125:  *	READ.
 2126:  *
 2127:  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 2128:  *	an existing buffer.
 2129:  *
 2130:  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
 2131:  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 2132:  *	and then cleared based on the backing VM.  If the previous buffer is
 2133:  *	non-0-sized but invalid, B_CACHE will be cleared.
 2134:  *
 2135:  *	If getblk() must create a new buffer, the new buffer is returned with
 2136:  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 2137:  *	case it is returned with B_INVAL clear and B_CACHE set based on the
 2138:  *	backing VM.
 2139:  *
 2140:  *	getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
 2141:  *	B_CACHE bit is clear.
 2142:  *	
 2143:  *	What this means, basically, is that the caller should use B_CACHE to
 2144:  *	determine whether the buffer is fully valid or not and should clear
 2145:  *	B_INVAL prior to issuing a read.  If the caller intends to validate
 2146:  *	the buffer by loading its data area with something, the caller needs
 2147:  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
 2148:  *	the caller should set B_CACHE ( as an optimization ), else the caller
 2149:  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
 2150:  *	a write attempt or if it was a successful read.  If the caller
 2151:  *	intends to issue a READ, the caller must clear B_INVAL and B_ERROR
 2152:  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 2153:  */
 2154: struct buf *
 2155: getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
 2156: {
 2157: 	struct buf *bp;
 2158: 	int s;
 2159: 	struct bufhashhdr *bh;
 2160: 
 2161: 	if (size > MAXBSIZE)
 2162: 		panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
 2163: 
 2164: 	s = splbio();
 2165: loop:
 2166: 	/*
 2167: 	 * Block if we are low on buffers.  Certain processes are allowed
 2168: 	 * to completely exhaust the buffer cache.
 2169: 	 *
 2170: 	 * If this check ever becomes a bottleneck it may be better to
 2171: 	 * move it into the else, when gbincore() fails.  At the moment
 2172: 	 * it isn't a problem.
 2173: 	 *
 2174: 	 * XXX remove, we cannot afford to block anywhere if holding a vnode
 2175: 	 * lock in a low-memory situation, so take it to the max.
 2176: 	 */
 2177: 	if (numfreebuffers == 0) {
 2178: 		if (!curproc)
 2179: 			return NULL;
 2180: 		needsbuffer |= VFS_BIO_NEED_ANY;
 2181: 		tsleep(&needsbuffer, slpflag, "newbuf", slptimeo);
 2182: 	}
 2183: 
 2184: 	if ((bp = gbincore(vp, blkno))) {
 2185: 		/*
 2186: 		 * Buffer is in-core.  If the buffer is not busy, it must
 2187: 		 * be on a queue.
 2188: 		 */
 2189: 
 2190: 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 2191: 			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
 2192: 			    "getblk", slpflag, slptimeo) == ENOLCK)
 2193: 				goto loop;
 2194: 			splx(s);
 2195: 			return (struct buf *) NULL;
 2196: 		}
 2197: 
 2198: 		/*
 2199: 		 * The buffer is locked.  B_CACHE is cleared if the buffer is
 2200: 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 2201: 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 2202: 		 * backing VM cache.
 2203: 		 */
 2204: 		if (bp->b_flags & B_INVAL)
 2205: 			bp->b_flags &= ~B_CACHE;
 2206: 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 2207: 			bp->b_flags |= B_CACHE;
 2208: 		bremfree(bp);
 2209: 
 2210: 		/*
 2211: 		 * Check for size inconsistencies for the non-VMIO case.
 2212: 		 */
 2213: 
 2214: 		if (bp->b_bcount != size) {
 2215: 			if ((bp->b_flags & B_VMIO) == 0 ||
 2216: 			    (size > bp->b_kvasize)) {
 2217: 				if (bp->b_flags & B_DELWRI) {
 2218: 					bp->b_flags |= B_NOCACHE;
 2219: 					VOP_BWRITE(bp->b_vp, bp);
 2220: 				} else {
 2221: 					if ((bp->b_flags & B_VMIO) &&
 2222: 					   (LIST_FIRST(&bp->b_dep) == NULL)) {
 2223: 						bp->b_flags |= B_RELBUF;
 2224: 						brelse(bp);
 2225: 					} else {
 2226: 						bp->b_flags |= B_NOCACHE;
 2227: 						VOP_BWRITE(bp->b_vp, bp);
 2228: 					}
 2229: 				}
 2230: 				goto loop;
 2231: 			}
 2232: 		}
 2233: 
 2234: 		/*
 2235: 		 * If the size is inconsistent in the VMIO case, we can resize
 2236: 		 * the buffer.  This might lead to B_CACHE getting set or
 2237: 		 * cleared.  If the size has not changed, B_CACHE remains
 2238: 		 * unchanged from its previous state.
 2239: 		 */
 2240: 
 2241: 		if (bp->b_bcount != size)
 2242: 			allocbuf(bp, size);
 2243: 
 2244: 		KASSERT(bp->b_offset != NOOFFSET, 
 2245: 		    ("getblk: no buffer offset"));
 2246: 
 2247: 		/*
 2248: 		 * A buffer with B_DELWRI set and B_CACHE clear must
 2249: 		 * be committed before we can return the buffer in
 2250: 		 * order to prevent the caller from issuing a read
 2251: 		 * ( due to B_CACHE not being set ) and overwriting
 2252: 		 * it.
 2253: 		 *
 2254: 		 * Most callers, including NFS and FFS, need this to
 2255: 		 * operate properly either because they assume they
 2256: 		 * can issue a read if B_CACHE is not set, or because
 2257: 		 * ( for example ) an uncached B_DELWRI might loop due 
 2258: 		 * to softupdates re-dirtying the buffer.  In the latter
 2259: 		 * case, B_CACHE is set after the first write completes,
 2260: 		 * preventing further loops.
 2261: 		 *
 2262: 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 2263: 		 * above while extending the buffer, we cannot allow the
 2264: 		 * buffer to remain with B_CACHE set after the write
 2265: 		 * completes or it will represent a corrupt state.  To
 2266: 		 * deal with this we set B_NOCACHE to scrap the buffer
 2267: 		 * after the write.
 2268: 		 *
 2269: 		 * We might be able to do something fancy, like setting
 2270: 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 2271: 		 * so the below call doesn't set B_CACHE, but that gets real
 2272: 		 * confusing.  This is much easier.
 2273: 		 */
 2274: 
 2275: 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 2276: 			bp->b_flags |= B_NOCACHE;
 2277: 			VOP_BWRITE(bp->b_vp, bp);
 2278: 			goto loop;
 2279: 		}
 2280: 
 2281: 		splx(s);
 2282: 		bp->b_flags &= ~B_DONE;
 2283: 	} else {
 2284: 		/*
 2285: 		 * Buffer is not in-core, create new buffer.  The buffer
 2286: 		 * returned by getnewbuf() is locked.  Note that the returned
 2287: 		 * buffer is also considered valid (not marked B_INVAL).
 2288: 		 */
 2289: 		int bsize, maxsize, vmio;
 2290: 		off_t offset;
 2291: 
 2292: 		if (vn_isdisk(vp, NULL))
 2293: 			bsize = DEV_BSIZE;
 2294: 		else if (vp->v_mountedhere)
 2295: 			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
 2296: 		else if (vp->v_mount)
 2297: 			bsize = vp->v_mount->mnt_stat.f_iosize;
 2298: 		else
 2299: 			bsize = size;
 2300: 
 2301: 		offset = (off_t)blkno * bsize;
 2302: 		vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
 2303: 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
 2304: 		maxsize = imax(maxsize, bsize);
 2305: 
 2306: 		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
 2307: 			if (slpflag || slptimeo) {
 2308: 				splx(s);
 2309: 				return NULL;
 2310: 			}
 2311: 			goto loop;
 2312: 		}
 2313: 
 2314: 		/*
 2315: 		 * This code is used to make sure that a buffer is not
 2316: 		 * created while the getnewbuf routine is blocked.
 2317: 		 * This can be a problem whether the vnode is locked or not.
 2318: 		 * If the buffer is created out from under us, we have to
 2319: 		 * throw away the one we just created.  There is no window
 2320: 		 * race because we are safely running at splbio() from the
 2321: 		 * point of the duplicate buffer creation through to here,
 2322: 		 * and we've locked the buffer.
 2323: 		 */
 2324: 		if (gbincore(vp, blkno)) {
 2325: 			bp->b_flags |= B_INVAL;
 2326: 			brelse(bp);
 2327: 			goto loop;
 2328: 		}
 2329: 
 2330: 		/*
 2331: 		 * Insert the buffer into the hash, so that it can
 2332: 		 * be found by incore.
 2333: 		 */
 2334: 		bp->b_blkno = bp->b_lblkno = blkno;
 2335: 		bp->b_offset = offset;
 2336: 
 2337: 		bgetvp(vp, bp);
 2338: 		LIST_REMOVE(bp, b_hash);
 2339: 		bh = bufhash(vp, blkno);
 2340: 		LIST_INSERT_HEAD(bh, bp, b_hash);
 2341: 
 2342: 		/*
 2343: 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 2344: 		 * buffer size starts out as 0, B_CACHE will be set by
 2345: 		 * allocbuf() for the VMIO case prior to it testing the
 2346: 		 * backing store for validity.
 2347: 		 */
 2348: 
 2349: 		if (vmio) {
 2350: 			bp->b_flags |= B_VMIO;
 2351: #if defined(VFS_BIO_DEBUG)
 2352: 			if (vn_canvmio(vp) != TRUE)
 2353: 				printf("getblk: vmioing file type %d???\n", vp->v_type);
 2354: #endif
 2355: 		} else {
 2356: 			bp->b_flags &= ~B_VMIO;
 2357: 		}
 2358: 
 2359: 		allocbuf(bp, size);
 2360: 
 2361: 		splx(s);
 2362: 		bp->b_flags &= ~B_DONE;
 2363: 	}
 2364: 	return (bp);
 2365: }
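
/*
 * Illustrative caller sketch: the typical read path built on getblk(),
 * essentially what bread() earlier in this file does.  The vnode,
 * block number and size are assumed to come from the filesystem.
 *
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_flags |= B_READ;
 *		bp->b_flags &= ~(B_ERROR | B_INVAL);
 *		vfs_busy_pages(bp, 0);
 *		VOP_STRATEGY(vp, bp);
 *		error = biowait(bp);
 *	}
 */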
 2366: 
 2367: /*
 2368:  * Get an empty, disassociated buffer of given size.  The buffer is initially
 2369:  * set to B_INVAL.
 2370:  *
 2371:  * spl protection is not required for the allocbuf() call because races are
 2372:  * impossible here.
 2373:  */
 2374: struct buf *
 2375: geteblk(int size)
 2376: {
 2377: 	struct buf *bp;
 2378: 	int s;
 2379: 	int maxsize;
 2380: 
 2381: 	maxsize = (size + BKVAMASK) & ~BKVAMASK;
 2382: 
 2383: 	s = splbio();
 2384: 	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
 2385: 	splx(s);
 2386: 	allocbuf(bp, size);
 2387: 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 2388: 	return (bp);
 2389: }
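
/*
 * Typical geteblk() usage (sketch; 'len' and 'src' are assumed from
 * the caller): transient, vnode-independent scratch buffers are simply
 * allocated, used via b_data and released.  Since B_INVAL is already
 * set, brelse() destroys the buffer rather than caching it.
 *
 *	bp = geteblk(len);
 *	bcopy(src, bp->b_data, len);
 *	...
 *	brelse(bp);
 */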
 2390: 
 2391: 
 2392: /*
 2393:  * This code constitutes the buffer memory from either anonymous system
 2394:  * memory (in the case of non-VMIO operations) or from an associated
 2395:  * VM object (in the case of VMIO operations).  This code is able to
 2396:  * resize a buffer up or down.
 2397:  *
 2398:  * Note that this code is tricky, and has many complications to resolve
 2399:  * deadlock or inconsistent data situations.  Tread lightly!!!
 2400:  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
 2401:  * the caller.  Calling this code willy nilly can result in the loss of data.
 2402:  *
 2403:  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 2404:  * B_CACHE for the non-VMIO case.
 2405:  *
 2406:  * This routine does not need to be called at splbio() but you must own the
 2407:  * buffer.
 2408:  */
 2409: int
 2410: allocbuf(struct buf *bp, int size)
 2411: {
 2412: 	int newbsize, mbsize;
 2413: 	int i;
 2414: 
 2415: 	if (BUF_REFCNT(bp) == 0)
 2416: 		panic("allocbuf: buffer not busy");
 2417: 
 2418: 	if (bp->b_kvasize < size)
 2419: 		panic("allocbuf: buffer too small");
 2420: 
 2421: 	if ((bp->b_flags & B_VMIO) == 0) {
 2422: 		caddr_t origbuf;
 2423: 		int origbufsize;
 2424: 		/*
 2425: 		 * Just get anonymous memory from the kernel.  Don't
 2426: 		 * mess with B_CACHE.
 2427: 		 */
 2428: 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 2429: #if !defined(NO_B_MALLOC)
 2430: 		if (bp->b_flags & B_MALLOC)
 2431: 			newbsize = mbsize;
 2432: 		else
 2433: #endif
 2434: 			newbsize = round_page(size);
 2435: 
 2436: 		if (newbsize < bp->b_bufsize) {
 2437: #if !defined(NO_B_MALLOC)
 2438: 			/*
 2439: 			 * malloced buffers are not shrunk
 2440: 			 */
 2441: 			if (bp->b_flags & B_MALLOC) {
 2442: 				if (newbsize) {
 2443: 					bp->b_bcount = size;
 2444: 				} else {
 2445: 					free(bp->b_data, M_BIOBUF);
 2446: 					if (bp->b_bufsize) {
 2447: 						bufmallocspace -= bp->b_bufsize;
 2448: 						bufspacewakeup();
 2449: 						bp->b_bufsize = 0;
 2450: 					}
 2451: 					bp->b_data = bp->b_kvabase;
 2452: 					bp->b_bcount = 0;
 2453: 					bp->b_flags &= ~B_MALLOC;
 2454: 				}
 2455: 				return 1;
 2456: 			}		
 2457: #endif
 2458: 			vm_hold_free_pages(
 2459: 			    bp,
 2460: 			    (vm_offset_t) bp->b_data + newbsize,
 2461: 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
 2462: 		} else if (newbsize > bp->b_bufsize) {
 2463: #if !defined(NO_B_MALLOC)
 2464: 			/*
 2465: 			 * We only use malloced memory on the first allocation,
 2466: 			 * and revert to page-allocated memory when the buffer
 2467: 			 * grows.
 2468: 			 */
 2469: 			if ( (bufmallocspace < maxbufmallocspace) &&
 2470: 				(bp->b_bufsize == 0) &&
 2471: 				(mbsize <= PAGE_SIZE/2)) {
 2472: 
 2473: 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
 2474: 				bp->b_bufsize = mbsize;
 2475: 				bp->b_bcount = size;
 2476: 				bp->b_flags |= B_MALLOC;
 2477: 				bufmallocspace += mbsize;
 2478: 				return 1;
 2479: 			}
 2480: #endif
 2481: 			origbuf = NULL;
 2482: 			origbufsize = 0;
 2483: #if !defined(NO_B_MALLOC)
 2484: 			/*
 2485: 			 * If the buffer is growing on its other-than-first allocation,
 2486: 			 * then we revert to the page-allocation scheme.
 2487: 			 */
 2488: 			if (bp->b_flags & B_MALLOC) {
 2489: 				origbuf = bp->b_data;
 2490: 				origbufsize = bp->b_bufsize;
 2491: 				bp->b_data = bp->b_kvabase;
 2492: 				if (bp->b_bufsize) {
 2493: 					bufmallocspace -= bp->b_bufsize;
 2494: 					bufspacewakeup();
 2495: 					bp->b_bufsize = 0;
 2496: 				}
 2497: 				bp->b_flags &= ~B_MALLOC;
 2498: 				newbsize = round_page(newbsize);
 2499: 			}
 2500: #endif
 2501: 			vm_hold_load_pages(
 2502: 			    bp,
 2503: 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
 2504: 			    (vm_offset_t) bp->b_data + newbsize);
 2505: #if !defined(NO_B_MALLOC)
 2506: 			if (origbuf) {
 2507: 				bcopy(origbuf, bp->b_data, origbufsize);
 2508: 				free(origbuf, M_BIOBUF);
 2509: 			}
 2510: #endif
 2511: 		}
 2512: 	} else {
 2513: 		vm_page_t m;
 2514: 		int desiredpages;
 2515: 
 2516: 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
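		/*
		 * Example (assuming 4KB pages): with b_offset 512 bytes
		 * into its first page and a newbsize of 8192, the
		 * calculation below yields desiredpages == 3.
		 */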
 2517: 		desiredpages = (size == 0) ? 0 :
 2518: 			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 2519: 
 2520: #if !defined(NO_B_MALLOC)
 2521: 		if (bp->b_flags & B_MALLOC)
 2522: 			panic("allocbuf: VMIO buffer can't be malloced");
 2523: #endif
 2524: 		/*
 2525: 		 * Set B_CACHE initially if buffer is 0 length or will become
 2526: 		 * 0-length.
 2527: 		 */
 2528: 		if (size == 0 || bp->b_bufsize == 0)
 2529: 			bp->b_flags |= B_CACHE;
 2530: 
 2531: 		if (newbsize < bp->b_bufsize) {
 2532: 			/*
 2533: 			 * DEV_BSIZE aligned new buffer size is less than the
 2534: 			 * DEV_BSIZE aligned existing buffer size.  Figure out
 2535: 			 * if we have to remove any pages.
 2536: 			 */
 2537: 			if (desiredpages < bp->b_npages) {
 2538: 				for (i = desiredpages; i < bp->b_npages; i++) {
 2539: 					/*
 2540: 					 * the page is not freed here -- it
 2541: 					 * is the responsibility of 
 2542: 					 * vnode_pager_setsize
 2543: 					 */
 2544: 					m = bp->b_pages[i];
 2545: 					KASSERT(m != bogus_page,
 2546: 					    ("allocbuf: bogus page found"));
 2547: 					while (vm_page_sleep_busy(m, TRUE, "biodep"))
 2548: 						;
 2549: 
 2550: 					bp->b_pages[i] = NULL;
 2551: 					vm_page_unwire(m, 0);
 2552: 				}
 2553: 				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
 2554: 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
 2555: 				bp->b_npages = desiredpages;
 2556: 			}
 2557: 		} else if (size > bp->b_bcount) {
 2558: 			/*
 2559: 			 * We are growing the buffer, possibly in a 
 2560: 			 * byte-granular fashion.
 2561: 			 */
 2562: 			struct vnode *vp;
 2563: 			vm_object_t obj;
 2564: 			vm_offset_t toff;
 2565: 			vm_offset_t tinc;
 2566: 			int s;
 2567: 
 2568: 			/*
 2569: 			 * Step 1, bring in the VM pages from the object, 
 2570: 			 * allocating them if necessary.  We must clear
 2571: 			 * B_CACHE if these pages are not valid for the 
 2572: 			 * range covered by the buffer.
 2573: 			 *
 2574: 			 * spl protection is required to protect against
 2575: 			 * interrupts unbusying and freeing pages between
 2576: 			 * our vm_page_lookup() and our busycheck/wiring
 2577: 			 * call.
 2578: 			 */
 2579: 			vp = bp->b_vp;
 2580: 			VOP_GETVOBJECT(vp, &obj);
 2581: 
 2582: 			s = splbio();
 2583: 			while (bp->b_npages < desiredpages) {
 2584: 				vm_page_t m;
 2585: 				vm_pindex_t pi;
 2586: 
 2587: 				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
 2588: 				if ((m = vm_page_lookup(obj, pi)) == NULL) {
 2589: 					/*
 2590: 					 * note: must allocate system pages
 2591: 					 * since blocking here could interfere
 2592: 					 * with paging I/O, no matter which
 2593: 					 * process we are.
 2594: 					 */
 2595: 					m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
 2596: 					if (m == NULL) {
 2597: 						VM_WAIT;
 2598: 						vm_pageout_deficit += desiredpages - bp->b_npages;
 2599: 					} else {
 2600: 						vm_page_wire(m);
 2601: 						vm_page_wakeup(m);
 2602: 						bp->b_flags &= ~B_CACHE;
 2603: 						bp->b_pages[bp->b_npages] = m;
 2604: 						++bp->b_npages;
 2605: 					}
 2606: 					continue;
 2607: 				}
 2608: 
 2609: 				/*
 2610: 				 * We found a page.  If we have to sleep on it,
 2611: 				 * retry because it might have gotten freed out
 2612: 				 * from under us.
 2613: 				 *
 2614: 				 * We can only test PG_BUSY here.  Blocking on
 2615: 				 * m->busy might lead to a deadlock:
 2616: 				 *
 2617: 				 *  vm_fault->getpages->cluster_read->allocbuf
 2618: 				 *
 2619: 				 */
 2620: 
 2621: 				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
 2622: 					continue;
 2623: 
 2624: 				/*
 2625: 				 * We have a good page.  Should we wakeup the
 2626: 				 * page daemon?
 2627: 				 */
 2628: 				if ((curthread != pagethread) &&
 2629: 				    ((m->queue - m->pc) == PQ_CACHE) &&
 2630: 				    ((vmstats.v_free_count + vmstats.v_cache_count) <
 2631: 					(vmstats.v_free_min + vmstats.v_cache_min))) {
 2632: 					pagedaemon_wakeup();
 2633: 				}
 2634: 				vm_page_flag_clear(m, PG_ZERO);
 2635: 				vm_page_wire(m);
 2636: 				bp->b_pages[bp->b_npages] = m;
 2637: 				++bp->b_npages;
 2638: 			}
 2639: 			splx(s);
 2640: 
 2641: 			/*
 2642: 			 * Step 2.  We've loaded the pages into the buffer,
 2643: 			 * we have to figure out if we can still have B_CACHE
 2644: 			 * set.  Note that B_CACHE is set according to the
 2645: 			 * byte-granular range ( bcount and size ), not the
 2646: 			 * aligned range ( newbsize ).
 2647: 			 *
 2648: 			 * The VM test is against m->valid, which is DEV_BSIZE
 2649: 			 * aligned.  Needless to say, the validity of the data
 2650: 			 * needs to also be DEV_BSIZE aligned.  Note that this
 2651: 			 * fails with NFS if the server or some other client
 2652: 			 * extends the file's EOF.  If our buffer is resized, 
 2653: 			 * B_CACHE may remain set! XXX
 2654: 			 */
 2655: 
 2656: 			toff = bp->b_bcount;
 2657: 			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 2658: 
 2659: 			while ((bp->b_flags & B_CACHE) && toff < size) {
 2660: 				vm_pindex_t pi;
 2661: 
 2662: 				if (tinc > (size - toff))
 2663: 					tinc = size - toff;
 2664: 
 2665: 				pi = ((bp->b_offset & PAGE_MASK) + toff) >> 
 2666: 				    PAGE_SHIFT;
 2667: 
 2668: 				vfs_buf_test_cache(
 2669: 				    bp, 
 2670: 				    bp->b_offset,
 2671: 				    toff, 
 2672: 				    tinc, 
 2673: 				    bp->b_pages[pi]
 2674: 				);
 2675: 				toff += tinc;
 2676: 				tinc = PAGE_SIZE;
 2677: 			}
 2678: 
 2679: 			/*
 2680: 			 * Step 3, fixup the KVM pmap.  Remember that
 2681: 			 * bp->b_data is relative to bp->b_offset, but 
 2682: 			 * bp->b_offset may be offset into the first page.
 2683: 			 */
 2684: 
 2685: 			bp->b_data = (caddr_t)
 2686: 			    trunc_page((vm_offset_t)bp->b_data);
 2687: 			pmap_qenter(
 2688: 			    (vm_offset_t)bp->b_data,
 2689: 			    bp->b_pages, 
 2690: 			    bp->b_npages
 2691: 			);
 2692: 			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
 2693: 			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 2694: 		}
 2695: 	}
 2696: 	if (newbsize < bp->b_bufsize)
 2697: 		bufspacewakeup();
 2698: 	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
 2699: 	bp->b_bcount = size;		/* requested buffer size	*/
 2700: 	return 1;
 2701: }
 2702: 
 2703: /*
 2704:  *	biowait:
 2705:  *
 2706:  *	Wait for buffer I/O completion, returning error status.  The buffer
 2707:  *	is left locked and B_DONE on return.  B_EINTR is converted into a EINTR
 2708:  *	error and cleared.
 2709:  */
 2710: int
 2711: biowait(struct buf * bp)
 2712: {
 2713: 	int s;
 2714: 
 2715: 	s = splbio();
 2716: 	while ((bp->b_flags & B_DONE) == 0) {
 2717: #if defined(NO_SCHEDULE_MODS)
 2718: 		tsleep(bp, 0, "biowait", 0);
 2719: #else
 2720: 		if (bp->b_flags & B_READ)
 2721: 			tsleep(bp, 0, "biord", 0);
 2722: 		else
 2723: 			tsleep(bp, 0, "biowr", 0);
 2724: #endif
 2725: 	}
 2726: 	splx(s);
 2727: 	if (bp->b_flags & B_EINTR) {
 2728: 		bp->b_flags &= ~B_EINTR;
 2729: 		return (EINTR);
 2730: 	}
 2731: 	if (bp->b_flags & B_ERROR) {
 2732: 		return (bp->b_error ? bp->b_error : EIO);
 2733: 	} else {
 2734: 		return (0);
 2735: 	}
 2736: }
 2737: 
 2738: /*
 2739:  *	biodone:
 2740:  *
 2741:  *	Finish I/O on a buffer, optionally calling a completion function.
 2742:  *	This is usually called from an interrupt so process blocking is
 2743:  *	not allowed.
 2744:  *
 2745:  *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
 2746:  *	In a non-VMIO bp, B_CACHE will be set on the next getblk() 
 2747:  *	assuming B_INVAL is clear.
 2748:  *
 2749:  *	For the VMIO case, we set B_CACHE if the op was a read and no
 2750:  *	read error occurred, or if the op was a write.  B_CACHE is never
 2751:  *	set if the buffer is invalid or otherwise uncacheable.
 2752:  *
 2753:  *	biodone does not mess with B_INVAL, allowing the I/O routine or the
 2754:  *	initiator to leave B_INVAL set to brelse the buffer out of existence
 2755:  *	in the biodone routine.
 2756:  *
 2757:  *	b_dev is required to be reinitialized prior to the top level strategy
 2758:  *	call in a device stack.  To avoid improper reuse, biodone() sets
 2759:  *	b_dev to NODEV.
 2760:  */
 2761: void
 2762: biodone(struct buf * bp)
 2763: {
 2764: 	int s, error;
 2765: 
 2766: 	s = splbio();
 2767: 
 2768: 	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
 2769: 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 2770: 
 2771: 	bp->b_flags |= B_DONE;
 2772: 	bp->b_dev = NODEV;
 2773: 	runningbufwakeup(bp);
 2774: 
 2775: 	if (bp->b_flags & B_FREEBUF) {
 2776: 		brelse(bp);
 2777: 		splx(s);
 2778: 		return;
 2779: 	}
 2780: 
 2781: 	if ((bp->b_flags & B_READ) == 0) {
 2782: 		vwakeup(bp);
 2783: 	}
 2784: 
 2785: 	/* call optional completion function if requested */
 2786: 	if (bp->b_flags & B_CALL) {
 2787: 		bp->b_flags &= ~B_CALL;
 2788: 		(*bp->b_iodone) (bp);
 2789: 		splx(s);
 2790: 		return;
 2791: 	}
 2792: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
 2793: 		(*bioops.io_complete)(bp);
 2794: 
 2795: 	if (bp->b_flags & B_VMIO) {
 2796: 		int i;
 2797: 		vm_ooffset_t foff;
 2798: 		vm_page_t m;
 2799: 		vm_object_t obj;
 2800: 		int iosize;
 2801: 		struct vnode *vp = bp->b_vp;
 2802: 
 2803: 		error = VOP_GETVOBJECT(vp, &obj);
 2804: 
 2805: #if defined(VFS_BIO_DEBUG)
 2806: 		if (vp->v_holdcnt == 0) {
 2807: 			panic("biodone: zero vnode hold count");
 2808: 		}
 2809: 
 2810: 		if (error) {
 2811: 			panic("biodone: missing VM object");
 2812: 		}
 2813: 
 2814: 		if ((vp->v_flag & VOBJBUF) == 0) {
 2815: 			panic("biodone: vnode is not setup for merged cache");
 2816: 		}
 2817: #endif
 2818: 
 2819: 		foff = bp->b_offset;
 2820: 		KASSERT(bp->b_offset != NOOFFSET,
 2821: 		    ("biodone: no buffer offset"));
 2822: 
 2823: 		if (error) {
 2824: 			panic("biodone: no object");
 2825: 		}
 2826: #if defined(VFS_BIO_DEBUG)
 2827: 		if (obj->paging_in_progress < bp->b_npages) {
 2828: 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
 2829: 			    obj->paging_in_progress, bp->b_npages);
 2830: 		}
 2831: #endif
 2832: 
 2833: 		/*
 2834: 		 * Set B_CACHE if the op was a normal read and no error
 2835: 		 * occurred.  B_CACHE is set for writes in the b*write()
 2836: 		 * routines.
 2837: 		 */
 2838: 		iosize = bp->b_bcount - bp->b_resid;
 2839: 		if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
 2840: 			bp->b_flags |= B_CACHE;
 2841: 		}
 2842: 
 2843: 		for (i = 0; i < bp->b_npages; i++) {
 2844: 			int bogusflag = 0;
 2845: 			int resid;
 2846: 
 2847: 			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 2848: 			if (resid > iosize)
 2849: 				resid = iosize;
 2850: 
 2851: 			/*
 2852: 			 * cleanup bogus pages, restoring the originals.  Since
 2853: 			 * the originals should still be wired, we don't have
 2854: 			 * to worry about interrupt/freeing races destroying
 2855: 			 * the VM object association.
 2856: 			 */
 2857: 			m = bp->b_pages[i];
 2858: 			if (m == bogus_page) {
 2859: 				bogusflag = 1;
 2860: 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 2861: 				if (m == NULL)
 2862: 					panic("biodone: page disappeared");
 2863: 				bp->b_pages[i] = m;
 2864: 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 2865: 			}
 2866: #if defined(VFS_BIO_DEBUG)
 2867: 			if (OFF_TO_IDX(foff) != m->pindex) {
 2868: 				printf(
 2869: "biodone: foff(%lu)/m->pindex(%d) mismatch\n",
 2870: 				    (unsigned long)foff, m->pindex);
 2871: 			}
 2872: #endif
 2873: 
 2874: 			/*
 2875: 			 * In the write case, the valid and clean bits are
 2876: 			 * already changed correctly ( see bdwrite() ), so we 
 2877: 			 * only need to do this here in the read case.
 2878: 			 */
 2879: 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
 2880: 				vfs_page_set_valid(bp, foff, i, m);
 2881: 			}
 2882: 			vm_page_flag_clear(m, PG_ZERO);
 2883: 
 2884: 			/*
 2885: 			 * When debugging new filesystems or buffer I/O methods, this
 2886: 			 * is the most common error that pops up.  If you see this, you
 2887: 			 * have not set the page busy flag correctly!!!
 2888: 			 */
 2889: 			if (m->busy == 0) {
 2890: 				printf("biodone: page busy < 0, "
 2891: 				    "pindex: %d, foff: 0x(%x,%x), "
 2892: 				    "resid: %d, index: %d\n",
 2893: 				    (int) m->pindex, (int)(foff >> 32),
 2894: 						(int) foff & 0xffffffff, resid, i);
 2895: 				if (!vn_isdisk(vp, NULL))
 2896: 					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
 2897: 					    bp->b_vp->v_mount->mnt_stat.f_iosize,
 2898: 					    (int) bp->b_lblkno,
 2899: 					    bp->b_flags, bp->b_npages);
 2900: 				else
 2901: 					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
 2902: 					    (int) bp->b_lblkno,
 2903: 					    bp->b_flags, bp->b_npages);
 2904: 				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
 2905: 				    m->valid, m->dirty, m->wire_count);
 2906: 				panic("biodone: page busy < 0");
 2907: 			}
 2908: 			vm_page_io_finish(m);
 2909: 			vm_object_pip_subtract(obj, 1);
 2910: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 2911: 			iosize -= resid;
 2912: 		}
 2913: 		if (obj)
 2914: 			vm_object_pip_wakeupn(obj, 0);
 2915: 	}
 2916: 
 2917: 	/*
 2918: 	 * For asynchronous completions, release the buffer now. The brelse
 2919: 	 * will do a wakeup there if necessary - so no need to do a wakeup
 2920: 	 * here in the async case. The sync case always needs to do a wakeup.
 2921: 	 */
 2922: 
 2923: 	if (bp->b_flags & B_ASYNC) {
 2924: 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
 2925: 			brelse(bp);
 2926: 		else
 2927: 			bqrelse(bp);
 2928: 	} else {
 2929: 		wakeup(bp);
 2930: 	}
 2931: 	splx(s);
 2932: }
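
/*
 * Sketch of the async completion path from an initiator's point of
 * view ('xx_done' is a hypothetical completion routine).  The callback
 * is registered before the I/O is issued; biodone() then invokes it at
 * interrupt time instead of performing the brelse()/wakeup() itself:
 *
 *	bp->b_iodone = xx_done;
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	...issue the transfer; the device interrupt calls biodone(bp)...
 */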
 2933: 
 2934: /*
 2935:  * This routine is called in lieu of iodone in the case of
 2936:  * incomplete I/O.  This keeps the busy status for pages
 2937:  * consistent.
 2938:  */
 2939: void
 2940: vfs_unbusy_pages(struct buf * bp)
 2941: {
 2942: 	int i;
 2943: 
 2944: 	runningbufwakeup(bp);
 2945: 	if (bp->b_flags & B_VMIO) {
 2946: 		struct vnode *vp = bp->b_vp;
 2947: 		vm_object_t obj;
 2948: 
 2949: 		VOP_GETVOBJECT(vp, &obj);
 2950: 
 2951: 		for (i = 0; i < bp->b_npages; i++) {
 2952: 			vm_page_t m = bp->b_pages[i];
 2953: 
 2954: 			/*
 2955: 			 * When restoring bogus changes the original pages
 2956: 			 * should still be wired, so we are in no danger of
 2957: 			 * losing the object association and do not need
 2958: 			 * spl protection particularly.
 2959: 			 */
 2960: 			if (m == bogus_page) {
 2961: 				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 2962: 				if (!m) {
 2963: 					panic("vfs_unbusy_pages: page missing");
 2964: 				}
 2965: 				bp->b_pages[i] = m;
 2966: 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 2967: 			}
 2968: 			vm_object_pip_subtract(obj, 1);
 2969: 			vm_page_flag_clear(m, PG_ZERO);
 2970: 			vm_page_io_finish(m);
 2971: 		}
 2972: 		vm_object_pip_wakeupn(obj, 0);
 2973: 	}
 2974: }
 2975: 
 2976: /*
 2977:  * vfs_page_set_valid:
 2978:  *
 2979:  *	Set the valid bits in a page based on the supplied offset.   The
 2980:  *	range is restricted to the buffer's size.
 2981:  *
 2982:  *	This routine is typically called after a read completes.
 2983:  */
 2984: static void
 2985: vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
 2986: {
 2987: 	vm_ooffset_t soff, eoff;
 2988: 
 2989: 	/*
 2990: 	 * Start and end offsets in buffer.  eoff - soff may not cross a
 2991: 	 * page boundary or cross the end of the buffer.  The end of the
 2992: 	 * buffer, in this case, is our file EOF, not the allocation size
 2993: 	 * of the buffer.
 2994: 	 */
 2995: 	soff = off;
 2996: 	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 2997: 	if (eoff > bp->b_offset + bp->b_bcount)
 2998: 		eoff = bp->b_offset + bp->b_bcount;
 2999: 
 3000: 	/*
 3001: 	 * Set valid range.  This is typically the entire buffer and thus the
 3002: 	 * entire page.
 3003: 	 */
 3004: 	if (eoff > soff) {
 3005: 		vm_page_set_validclean(
 3006: 		    m,
 3007: 		   (vm_offset_t) (soff & PAGE_MASK),
 3008: 		   (vm_offset_t) (eoff - soff)
 3009: 		);
 3010: 	}
 3011: }
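
/*
 * Example of the clamping above (assuming 4KB pages): for a buffer
 * with b_offset == 0 and b_bcount == 6144, the call for the second
 * page sees soff = 4096 and eoff clamped from 8192 down to 6144, so
 * only the first 2048 bytes of that page are marked valid and clean.
 */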
 3012: 
 3013: /*
 3014:  * This routine is called before a device strategy routine.
 3015:  * It is used to tell the VM system that paging I/O is in
 3016:  * progress, and treat the pages associated with the buffer
 3017:  * almost as being PG_BUSY.  Also the object paging_in_progress
 3018:  * flag is handled to make sure that the object doesn't become
 3019:  * inconsistent.
 3020:  *
 3021:  * Since I/O has not been initiated yet, certain buffer flags
 3022:  * such as B_ERROR or B_INVAL may be in an inconsistent state
 3023:  * and should be ignored.
 3024:  */
 3025: void
 3026: vfs_busy_pages(struct buf * bp, int clear_modify)
 3027: {
 3028: 	int i, bogus;
 3029: 
 3030: 	if (bp->b_flags & B_VMIO) {
 3031: 		struct vnode *vp = bp->b_vp;
 3032: 		vm_object_t obj;
 3033: 		vm_ooffset_t foff;
 3034: 
 3035: 		VOP_GETVOBJECT(vp, &obj);
 3036: 		foff = bp->b_offset;
 3037: 		KASSERT(bp->b_offset != NOOFFSET,
 3038: 		    ("vfs_busy_pages: no buffer offset"));
 3039: 		vfs_setdirty(bp);
 3040: 
 3041: retry:
 3042: 		for (i = 0; i < bp->b_npages; i++) {
 3043: 			vm_page_t m = bp->b_pages[i];
 3044: 			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
 3045: 				goto retry;
 3046: 		}
 3047: 
 3048: 		bogus = 0;
 3049: 		for (i = 0; i < bp->b_npages; i++) {
 3050: 			vm_page_t m = bp->b_pages[i];
 3051: 
 3052: 			vm_page_flag_clear(m, PG_ZERO);
 3053: 			if ((bp->b_flags & B_CLUSTER) == 0) {
 3054: 				vm_object_pip_add(obj, 1);
 3055: 				vm_page_io_start(m);
 3056: 			}
 3057: 
 3058: 			/*
 3059: 			 * When readying a buffer for a read ( i.e
 3060: 			 * clear_modify == 0 ), it is important to do
 3061: 			 * bogus_page replacement for valid pages in 
 3062: 			 * partially instantiated buffers.  Partially 
 3063: 			 * instantiated buffers can, in turn, occur when
 3064: 			 * reconstituting a buffer from its VM backing store
 3065: 			 * base.  We only have to do this if B_CACHE is
 3066: 			 * clear ( which causes the I/O to occur in the
 3067: 			 * first place ).  The replacement prevents the read
 3068: 			 * I/O from overwriting potentially dirty VM-backed
 3069: 			 * pages.  XXX bogus page replacement is, uh, bogus.
 3070: 			 * It may not work properly with small-block devices.
 3071: 			 * We need to find a better way.
 3072: 			 */
 3073: 
 3074: 			vm_page_protect(m, VM_PROT_NONE);
 3075: 			if (clear_modify)
 3076: 				vfs_page_set_valid(bp, foff, i, m);
 3077: 			else if (m->valid == VM_PAGE_BITS_ALL &&
 3078: 				(bp->b_flags & B_CACHE) == 0) {
 3079: 				bp->b_pages[i] = bogus_page;
 3080: 				bogus++;
 3081: 			}
 3082: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3083: 		}
 3084: 		if (bogus)
 3085: 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 3086: 	}
 3087: 
 3088: 	/*
 3089: 	 * This is the easiest place to put the process accounting for the I/O
 3090: 	 * for now.
 3091: 	 */
 3092: 	{
 3093: 		struct proc *p;
 3094: 
 3095: 		if ((p = curthread->td_proc) != NULL) {
 3096: 			if (bp->b_flags & B_READ)
 3097: 				p->p_stats->p_ru.ru_inblock++;
 3098: 			else
 3099: 				p->p_stats->p_ru.ru_oublock++;
 3100: 		}
 3101: 	}
 3102: }
 3103: 
 3104: /*
 3105:  * Tell the VM system that the pages associated with this buffer
 3106:  * are clean.  This is used for delayed writes where the data is
 3107:  * going to go to disk eventually without additional VM intervention.
 3108:  *
 3109:  * Note that while we only really need to clean through to b_bcount, we
 3110:  * just go ahead and clean through to b_bufsize.
 3111:  */
 3112: static void
 3113: vfs_clean_pages(struct buf * bp)
 3114: {
 3115: 	int i;
 3116: 
 3117: 	if (bp->b_flags & B_VMIO) {
 3118: 		vm_ooffset_t foff;
 3119: 
 3120: 		foff = bp->b_offset;
 3121: 		KASSERT(bp->b_offset != NOOFFSET,
 3122: 		    ("vfs_clean_pages: no buffer offset"));
 3123: 		for (i = 0; i < bp->b_npages; i++) {
 3124: 			vm_page_t m = bp->b_pages[i];
 3125: 			vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3126: 			vm_ooffset_t eoff = noff;
 3127: 
 3128: 			if (eoff > bp->b_offset + bp->b_bufsize)
 3129: 				eoff = bp->b_offset + bp->b_bufsize;
 3130: 			vfs_page_set_valid(bp, foff, i, m);
 3131: 			/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
 3132: 			foff = noff;
 3133: 		}
 3134: 	}
 3135: }
 3136: 
 3137: /*
 3138:  *	vfs_bio_set_validclean:
 3139:  *
 3140:  *	Set the range within the buffer to valid and clean.  The range is 
 3141:  *	relative to the beginning of the buffer, b_offset.  Note that b_offset
 3142:  *	itself may be offset from the beginning of the first page.
 3143:  */
 3144: 
 3145: void   
 3146: vfs_bio_set_validclean(struct buf *bp, int base, int size)
 3147: {
 3148: 	if (bp->b_flags & B_VMIO) {
 3149: 		int i;
 3150: 		int n;
 3151: 
 3152: 		/*
 3153: 		 * Fixup base to be relative to beginning of first page.
 3154: 		 * Set initial n to be the maximum number of bytes in the
 3155: 		 * first page that can be validated.
 3156: 		 */
 3157: 
 3158: 		base += (bp->b_offset & PAGE_MASK);
 3159: 		n = PAGE_SIZE - (base & PAGE_MASK);
 3160: 
 3161: 		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 3162: 			vm_page_t m = bp->b_pages[i];
 3163: 
 3164: 			if (n > size)
 3165: 				n = size;
 3166: 
 3167: 			vm_page_set_validclean(m, base & PAGE_MASK, n);
 3168: 			base += n;
 3169: 			size -= n;
 3170: 			n = PAGE_SIZE;
 3171: 		}
 3172: 	}
 3173: }
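
/*
 * Worked example (assuming 4KB pages): for a buffer whose b_offset is
 * 512 bytes into its first page, a call with base = 0 and size = 1024
 * adjusts base to 512 and marks bytes 512-1535 of the first page valid
 * and clean; nothing else is touched.
 */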
 3174: 
 3175: /*
 3176:  *	vfs_bio_clrbuf:
 3177:  *
 3178:  *	clear a buffer.  This routine essentially fakes an I/O, so we need
 3179:  *	to clear B_ERROR and B_INVAL.
 3180:  *
 3181:  *	Note that while we only theoretically need to clear through b_bcount,
 3182:  *	we go ahead and clear through b_bufsize.
 3183:  */
 3184: 
 3185: void
 3186: vfs_bio_clrbuf(struct buf *bp)
 3187: {
 3188: 	int i, mask = 0;
 3189: 	caddr_t sa, ea;
 3190: 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
 3191: 		bp->b_flags &= ~(B_INVAL|B_ERROR);
 3192: 		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 3193: 		    (bp->b_offset & PAGE_MASK) == 0) {
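			/*
			 * Example: a 2048 byte buffer yields a mask of
			 * (1 << 4) - 1 == 0x0f, one bit per 512 byte
			 * DEV_BSIZE chunk of the buffer.
			 */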
 3194: 			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 3195: 			if ((bp->b_pages[0]->valid & mask) == mask) {
 3196: 				bp->b_resid = 0;
 3197: 				return;
 3198: 			}
 3199: 			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
 3200: 			    ((bp->b_pages[0]->valid & mask) == 0)) {
 3201: 				bzero(bp->b_data, bp->b_bufsize);
 3202: 				bp->b_pages[0]->valid |= mask;
 3203: 				bp->b_resid = 0;
 3204: 				return;
 3205: 			}
 3206: 		}
 3207: 		ea = sa = bp->b_data;
 3208: 		for(i=0;i<bp->b_npages;i++,sa=ea) {
 3209: 			int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
 3210: 			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
 3211: 			ea = (caddr_t)(vm_offset_t)ulmin(
 3212: 			    (u_long)(vm_offset_t)ea,
 3213: 			    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
 3214: 			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 3215: 			if ((bp->b_pages[i]->valid & mask) == mask)
 3216: 				continue;
 3217: 			if ((bp->b_pages[i]->valid & mask) == 0) {
 3218: 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
 3219: 					bzero(sa, ea - sa);
 3220: 				}
 3221: 			} else {
 3222: 				for (; sa < ea; sa += DEV_BSIZE, j++) {
 3223: 					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
 3224: 						(bp->b_pages[i]->valid & (1<<j)) == 0)
 3225: 						bzero(sa, DEV_BSIZE);
 3226: 				}
 3227: 			}
 3228: 			bp->b_pages[i]->valid |= mask;
 3229: 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
 3230: 		}
 3231: 		bp->b_resid = 0;
 3232: 	} else {
 3233: 		clrbuf(bp);
 3234: 	}
 3235: }
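
/*
 * Example (illustrative sketch, not part of the original file): a
 * filesystem allocating a brand new block does not need to read stale
 * contents from disk; it can grab the buffer and clear it, which fakes
 * the read and leaves the pages valid.  This mirrors the ffs_balloc()
 * style of use.  The wrapper name is hypothetical and the getblk() call
 * assumes the (vp, blkno, size, slpflag, slptimeo) interface declared
 * in <sys/buf.h>.
 */
static struct buf *
example_get_zeroed_block(struct vnode *vp, daddr_t lbn, int size)
{
	struct buf *bp;

	bp = getblk(vp, lbn, size, 0, 0);
	vfs_bio_clrbuf(bp);		/* no read needed: zero and validate */
	return (bp);
}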
 3236: 
 3237: /*
 3238:  * vm_hold_load_pages and vm_hold_free_pages move pages into and out of
 3239:  * a buffer's address space.  The pages are anonymous and are
 3240:  * not associated with a file object.
 3241:  */
 3242: void
 3243: vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 3244: {
 3245: 	vm_offset_t pg;
 3246: 	vm_page_t p;
 3247: 	int index;
 3248: 
 3249: 	to = round_page(to);
 3250: 	from = round_page(from);
 3251: 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 3252: 
 3253: 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 3254: 
 3255: tryagain:
 3256: 
 3257: 		/*
 3258: 		 * note: we must allocate system pages since blocking here
 3259: 		 * could interfere with paging I/O, no matter which
 3260: 		 * process we are running in.
 3261: 		 */
 3262: 		p = vm_page_alloc(kernel_object,
 3263: 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
 3264: 			VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
 3265: 		if (!p) {
 3266: 			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
 3267: 			VM_WAIT;
 3268: 			goto tryagain;
 3269: 		}
 3270: 		vm_page_wire(p);
 3271: 		p->valid = VM_PAGE_BITS_ALL;
 3272: 		vm_page_flag_clear(p, PG_ZERO);
 3273: 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
 3274: 		bp->b_pages[index] = p;
 3275: 		vm_page_wakeup(p);
 3276: 	}
 3277: 	bp->b_npages = index;
 3278: }
 3279: 
 3280: void
 3281: vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 3282: {
 3283: 	vm_offset_t pg;
 3284: 	vm_page_t p;
 3285: 	int index, newnpages;
 3286: 
 3287: 	from = round_page(from);
 3288: 	to = round_page(to);
 3289: 	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 3290: 
 3291: 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 3292: 		p = bp->b_pages[index];
 3293: 		if (p && (index < bp->b_npages)) {
 3294: 			if (p->busy) {
 3295: 				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
 3296: 					bp->b_blkno, bp->b_lblkno);
 3297: 			}
 3298: 			bp->b_pages[index] = NULL;
 3299: 			pmap_kremove(pg);
 3300: 			vm_page_busy(p);
 3301: 			vm_page_unwire(p, 0);
 3302: 			vm_page_free(p);
 3303: 		}
 3304: 	}
 3305: 	bp->b_npages = newnpages;
 3306: }
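
/*
 * Example (illustrative sketch, not part of the original file): this is
 * roughly how allocbuf() drives the two routines above for non-VMIO
 * buffers.  Growing a buffer backs the newly exposed KVA with wired
 * anonymous pages; shrinking it frees the trailing pages.  The wrapper
 * is hypothetical and omits the bufspace accounting and B_MALLOC
 * handling a real caller must also perform.
 */
static void
example_resize_phys_backing(struct buf *bp, int newbsize)
{
	vm_offset_t base = (vm_offset_t)bp->b_data;

	if (newbsize > bp->b_bufsize) {
		vm_hold_load_pages(bp, base + bp->b_bufsize, base + newbsize);
	} else if (newbsize < bp->b_bufsize) {
		vm_hold_free_pages(bp, base + newbsize, base + bp->b_bufsize);
	}
	bp->b_bufsize = newbsize;
}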
 3307: 
 3308: /*
 3309:  * Map an IO request into kernel virtual address space.
 3310:  *
 3311:  * All requests are (re)mapped into kernel VA space.
 3312:  * Notice that we use b_bufsize for the size of the buffer
 3313:  * to be mapped.  b_bcount might be modified by the driver.
 3314:  */
 3315: int
 3316: vmapbuf(struct buf *bp)
 3317: {
 3318: 	caddr_t addr, v, kva;
 3319: 	vm_paddr_t pa;
 3320: 	int pidx;
 3321: 	int i;
 3322: 	struct vm_page *m;
 3323: 
 3324: 	if ((bp->b_flags & B_PHYS) == 0)
 3325: 		panic("vmapbuf");
 3326: 	if (bp->b_bufsize < 0)
 3327: 		return (-1);
 3328: 	for (v = bp->b_saveaddr,
 3329: 		     addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data),
 3330: 		     pidx = 0;
 3331: 	     addr < bp->b_data + bp->b_bufsize;
 3332: 	     addr += PAGE_SIZE, v += PAGE_SIZE, pidx++) {
 3333: 		/*
 3334: 		 * Fault the page in if needed.  For a B_READ (device to
 3335: 		 * memory) transfer, fault for write to resolve copy-on-write.
 3336: 		 */
 3337: retry:
 3338: 		i = vm_fault_quick((addr >= bp->b_data) ? addr : bp->b_data,
 3339: 			(bp->b_flags & B_READ) ? (VM_PROT_READ|VM_PROT_WRITE) : VM_PROT_READ);
 3340: 		if (i < 0) {
 3341: 			for (i = 0; i < pidx; ++i) {
 3342: 			    vm_page_unhold(bp->b_pages[i]);
 3343: 			    bp->b_pages[i] = NULL;
 3344: 			}
 3345: 			return(-1);
 3346: 		}
 3347: 
 3348: 		/*
 3349: 		 * WARNING!  If sparc support is MFCd in the future this will
 3350: 		 * have to be changed from pmap_kextract() to pmap_extract()
 3351: 		 * ala -current.
 3352: 		 */
 3353: #ifdef __sparc64__
 3354: #error "If MFCing sparc support use pmap_extract"
 3355: #endif
 3356: 		pa = pmap_kextract((vm_offset_t)addr);
 3357: 		if (pa == 0) {
 3358: 			printf("vmapbuf: warning, race against user address during I/O\n");
 3359: 			goto retry;
 3360: 		}
 3361: 		m = PHYS_TO_VM_PAGE(pa);
 3362: 		vm_page_hold(m);
 3363: 		bp->b_pages[pidx] = m;
 3364: 	}
 3365: 	if (pidx > btoc(MAXPHYS))
 3366: 		panic("vmapbuf: mapped more than MAXPHYS");
 3367: 	pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
 3368: 	
 3369: 	kva = bp->b_saveaddr;
 3370: 	bp->b_npages = pidx;
 3371: 	bp->b_saveaddr = bp->b_data;
 3372: 	bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
 3373: 	return(0);
 3374: }
 3375: 
 3376: /*
 3377:  * Free the I/O map PTEs associated with this I/O operation.
 3378:  * We also invalidate the TLB entries and restore the original b_data.
 3379:  */
 3380: void
 3381: vunmapbuf(struct buf *bp)
 3383: {
 3384: 	int pidx;
 3385: 	int npages;
 3386: 	vm_page_t *m;
 3387: 
 3388: 	if ((bp->b_flags & B_PHYS) == 0)
 3389: 		panic("vunmapbuf");
 3390: 
 3391: 	npages = bp->b_npages;
 3392: 	pmap_qremove(trunc_page((vm_offset_t)bp->b_data),
 3393: 		     npages);
 3394: 	m = bp->b_pages;
 3395: 	for (pidx = 0; pidx < npages; pidx++)
 3396: 		vm_page_unhold(*m++);
 3397: 
 3398: 	bp->b_data = bp->b_saveaddr;
 3399: }
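
/*
 * Example (illustrative sketch, not part of the original file): physio
 * style bracketing of a raw transfer with vmapbuf()/vunmapbuf().  The
 * caller is assumed to hold a pbuf whose b_data points at its own KVA;
 * that KVA is left in b_saveaddr and b_data is pointed at the user
 * buffer before mapping.  The wrapper name is hypothetical and the
 * actual device strategy call is elided.
 */
static int
example_map_user_io(struct buf *bp, caddr_t uaddr, int len)
{
	bp->b_saveaddr = bp->b_data;	/* KVA window vmapbuf() will fill */
	bp->b_data = uaddr;		/* user virtual address of the I/O */
	bp->b_bufsize = len;
	bp->b_bcount = len;
	bp->b_flags |= B_PHYS;

	if (vmapbuf(bp) < 0)
		return (EFAULT);	/* user pages could not be faulted in */

	/* ... hand bp to the device strategy routine, then biowait(bp) ... */

	vunmapbuf(bp);			/* unhold the pages, restore b_data */
	return (0);
}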
 3400: 
 3401: #include "opt_ddb.h"
 3402: #ifdef DDB
 3403: #include <ddb/ddb.h>
 3404: 
 3405: DB_SHOW_COMMAND(buffer, db_show_buffer)
 3406: {
 3407: 	/* get args */
 3408: 	struct buf *bp = (struct buf *)addr;
 3409: 
 3410: 	if (!have_addr) {
 3411: 		db_printf("usage: show buffer <addr>\n");
 3412: 		return;
 3413: 	}
 3414: 
 3415: 	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
 3416: 	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
 3417: 		  "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
 3418: 		  "b_blkno = %d, b_pblkno = %d\n",
 3419: 		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 3420: 		  major(bp->b_dev), minor(bp->b_dev),
 3421: 		  bp->b_data, bp->b_blkno, bp->b_pblkno);
 3422: 	if (bp->b_npages) {
 3423: 		int i;
 3424: 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 3425: 		for (i = 0; i < bp->b_npages; i++) {
 3426: 			vm_page_t m;
 3427: 			m = bp->b_pages[i];
 3428: 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
 3429: 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
 3430: 			if ((i + 1) < bp->b_npages)
 3431: 				db_printf(",");
 3432: 		}
 3433: 		db_printf("\n");
 3434: 	}
 3435: }
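
/*
 * Example (illustrative): with DDB compiled in, the command registered
 * above is run from the debugger prompt with a struct buf address, e.g.
 *
 *	db> show buffer 0xc12345e0
 *
 * The address shown is hypothetical; any valid struct buf pointer works.
 */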
 3436: #endif /* DDB */