File:  [DragonFly] / src / sys / kern / vfs_bio.c
Revision 1.25
Thu May 13 17:40:15 2004 UTC by dillon
Branches: MAIN
CVS tags: HEAD
Close an interrupt race between vm_page_lookup() and (typically) a
vm_page_sleep_busy() check by using the correct spl protection.
An interrupt can occur in between the two operations and unbusy/free
the page in question, causing the busy check to fail and allowing the code
to fall through and operate on a page that may have been freed
and possibly even reused.   Also note that vm_page_grab() had the same
issue between the lookup, busy check, and vm_page_busy() call.

Close an interrupt race when scanning a VM object's memq.  Interrupts
can free pages, removing them from memq, which interferes with memq scans
and can cause a page unassociated with the object to be processed as if it
were associated with the object.

Calls to vm_page_hold() and vm_page_unhold() require spl protection.

Rename the passed socket descriptor argument in sendfile() to make the
code more readable.

Fix several serious bugs in procfs_rwmem().  In particular, force it to
block if a page is busy and then retry.

Get rid of vm_pager_map_page() and vm_pager_unmap_page(); make the functions
that used to use these routines use SFBUFs instead.

Get rid of the (userland?) 4MB page mapping feature in pmap_object_init_pt()
for now.  The code appears to not track the page directory properly and
could result in a non-zero page being freed as PG_ZERO.

This commit also includes updated code comments and some additional
non-operational code cleanups.
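
A minimal sketch of the spl-protection pattern described in the first
paragraph (illustrative only, not the actual diff; the object, index, label
and wait-message names here are placeholders):

	int s = splvm();
	m = vm_page_lookup(object, pindex);
	if (m == NULL || vm_page_sleep_busy(m, TRUE, "pgwait")) {
		/*
		 * Missing, or we slept while it was busy; the page may
		 * have been freed or reused, so redo the lookup.
		 */
		splx(s);
		goto relookup;
	}
	vm_page_busy(m);	/* interrupts can no longer free/reuse it */
	splx(s);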

    1: /*
    2:  * Copyright (c) 1994,1997 John S. Dyson
    3:  * All rights reserved.
    4:  *
    5:  * Redistribution and use in source and binary forms, with or without
    6:  * modification, are permitted provided that the following conditions
    7:  * are met:
    8:  * 1. Redistributions of source code must retain the above copyright
    9:  *    notice immediately at the beginning of the file, without modification,
   10:  *    this list of conditions, and the following disclaimer.
   11:  * 2. Absolutely no warranty of function or purpose is made by the author
   12:  *		John S. Dyson.
   13:  *
   14:  * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $
   15:  * $DragonFly: src/sys/kern/vfs_bio.c,v 1.25 2004/05/13 17:40:15 dillon Exp $
   16:  */
   17: 
   18: /*
   19:  * this file contains a new buffer I/O scheme implementing a coherent
   20:  * VM object and buffer cache scheme.  Pains have been taken to make
   21:  * sure that the performance degradation associated with schemes such
   22:  * as this is not realized.
   23:  *
   24:  * Author:  John S. Dyson
   25:  * Significant help during the development and debugging phases
   26:  * had been provided by David Greenman, also of the FreeBSD core team.
   27:  *
   28:  * see man buf(9) for more info.
   29:  */
   30: 
   31: #include <sys/param.h>
   32: #include <sys/systm.h>
   33: #include <sys/buf.h>
   34: #include <sys/conf.h>
   35: #include <sys/eventhandler.h>
   36: #include <sys/lock.h>
   37: #include <sys/malloc.h>
   38: #include <sys/mount.h>
   39: #include <sys/kernel.h>
   40: #include <sys/kthread.h>
   41: #include <sys/proc.h>
   42: #include <sys/reboot.h>
   43: #include <sys/resourcevar.h>
   44: #include <sys/sysctl.h>
   45: #include <sys/vmmeter.h>
   46: #include <sys/vnode.h>
   47: #include <sys/proc.h>
   48: #include <vm/vm.h>
   49: #include <vm/vm_param.h>
   50: #include <vm/vm_kern.h>
   51: #include <vm/vm_pageout.h>
   52: #include <vm/vm_page.h>
   53: #include <vm/vm_object.h>
   54: #include <vm/vm_extern.h>
   55: #include <vm/vm_map.h>
   56: #include <sys/buf2.h>
   57: #include <vm/vm_page2.h>
   58: 
   59: static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
   60: 
   61: struct	bio_ops bioops;		/* I/O operation notification */
   62: 
   63: struct buf *buf;		/* buffer header pool */
   64: struct swqueue bswlist;
   65: 
   66: static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
   67: 		vm_offset_t to);
   68: static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
   69: 		vm_offset_t to);
   70: static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
   71: 			       int pageno, vm_page_t m);
   72: static void vfs_clean_pages(struct buf * bp);
   73: static void vfs_setdirty(struct buf *bp);
   74: static void vfs_vmio_release(struct buf *bp);
   75: static void vfs_backgroundwritedone(struct buf *bp);
   76: static int flushbufqueues(void);
   77: 
   78: static int bd_request;
   79: 
   80: static void buf_daemon (void);
   81: /*
   82:  * bogus page -- for I/O to/from partially complete buffers
   83:  * this is a temporary solution to the problem, but it is not
   84:  * really that bad.  it would be better to split the buffer
   85:  * for input in the case of buffers partially already in memory,
   86:  * but the code is intricate enough already.
   87:  */
   88: vm_page_t bogus_page;
   89: int vmiodirenable = TRUE;
   90: int runningbufspace;
   91: struct lwkt_token buftimetoken;  /* Interlock on setting prio and timo */
   92: 
   93: static vm_offset_t bogus_offset;
   94: 
   95: static int bufspace, maxbufspace,
   96: 	bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
   97: static int bufreusecnt, bufdefragcnt, buffreekvacnt;
   98: static int needsbuffer;
   99: static int lorunningspace, hirunningspace, runningbufreq;
  100: static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
  101: static int numfreebuffers, lofreebuffers, hifreebuffers;
  102: static int getnewbufcalls;
  103: static int getnewbufrestarts;
  104: 
  105: SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
  106: 	&numdirtybuffers, 0, "");
  107: SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
  108: 	&lodirtybuffers, 0, "");
  109: SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
  110: 	&hidirtybuffers, 0, "");
  111: SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
  112: 	&numfreebuffers, 0, "");
  113: SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
  114: 	&lofreebuffers, 0, "");
  115: SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
  116: 	&hifreebuffers, 0, "");
  117: SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
  118: 	&runningbufspace, 0, "");
  119: SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
  120: 	&lorunningspace, 0, "");
  121: SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
  122: 	&hirunningspace, 0, "");
  123: SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
  124: 	&maxbufspace, 0, "");
  125: SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
  126: 	&hibufspace, 0, "");
  127: SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
  128: 	&lobufspace, 0, "");
  129: SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
  130: 	&bufspace, 0, "");
  131: SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
  132: 	&maxbufmallocspace, 0, "");
  133: SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
  134: 	&bufmallocspace, 0, "");
  135: SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
  136: 	&getnewbufcalls, 0, "");
  137: SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
  138: 	&getnewbufrestarts, 0, "");
  139: SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
  140: 	&vmiodirenable, 0, "");
  141: SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW,
  142: 	&bufdefragcnt, 0, "");
  143: SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW,
  144: 	&buffreekvacnt, 0, "");
  145: SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW,
  146: 	&bufreusecnt, 0, "");
  147: 
  148: /*
  149:  * Disable background writes for now.  There appear to be races in the 
  150:  * flags tests and locking operations as well as races in the completion
  151:  * code modifying the original bp (origbp) without holding a lock, assuming
  152:  * splbio protection when there might not be splbio protection.
  153:  */
  154: static int dobkgrdwrite = 0;
  155: SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
  156: 	"Do background writes (honoring the BV_BKGRDWRITE flag)?");
  157: 
  158: static int bufhashmask;
  159: static int bufhashshift;
  160: static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
  161: struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
  162: char *buf_wmesg = BUF_WMESG;
  163: 
  164: extern int vm_swap_size;
  165: 
  166: #define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
  167: #define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
  168: #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
  169: #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
  170: 
  171: /*
  172:  * Buffer hash table code.  Note that the logical block scans linearly, which
  173:  * gives us some L1 cache locality.
  174:  */
  175: 
  176: static __inline 
  177: struct bufhashhdr *
  178: bufhash(struct vnode *vnp, daddr_t bn)
  179: {
  180: 	u_int64_t hashkey64;
  181: 	int hashkey; 
  182: 	
  183: 	/*
  184: 	 * A variation on the Fibonacci hash that Knuth credits to
  185: 	 * R. W. Floyd, see Knuth's _Art of Computer Programming,
  186: 	 * Volume 3 / Sorting and Searching_
  187: 	 *
   188: 	 * We reduce the argument to 32 bits before doing the hash to
  189: 	 * avoid the need for a slow 64x64 multiply on 32 bit platforms.
  190: 	 *
  191: 	 * sizeof(struct vnode) is 168 on i386, so toss some of the lower
  192: 	 * bits of the vnode address to reduce the key range, which
  193: 	 * improves the distribution of keys across buckets.
  194: 	 *
  195: 	 * The file system cylinder group blocks are very heavily
   196: 	 * used.  They are located at intervals of fpg, which is
  197: 	 * on the order of 89 to 94 * 2^10, depending on other
  198: 	 * filesystem parameters, for a 16k block size.  Smaller block
  199: 	 * sizes will reduce fpg approximately proportionally.  This
  200: 	 * will cause the cylinder group index to be hashed using the
  201: 	 * lower bits of the hash multiplier, which will not distribute
  202: 	 * the keys as uniformly in a classic Fibonacci hash where a
  203: 	 * relatively small number of the upper bits of the result
  204: 	 * are used.  Using 2^16 as a close-enough approximation to
  205: 	 * fpg, split the hash multiplier in half, with the upper 16
  206: 	 * bits being the inverse of the golden ratio, and the lower
  207: 	 * 16 bits being a fraction between 1/3 and 3/7 (closer to
  208: 	 * 3/7 in this case), that gives good experimental results.
  209: 	 */
  210: 	hashkey64 = ((u_int64_t)(uintptr_t)vnp >> 3) + (u_int64_t)bn;
  211: 	hashkey = (((u_int32_t)(hashkey64 + (hashkey64 >> 32)) * 0x9E376DB1u) >>
  212: 	    bufhashshift) & bufhashmask;
  213: 	return(&bufhashtbl[hashkey]);
  214: }
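
/*
 * Illustrative note on the constant used above: its upper 16 bits, 0x9E37,
 * are the top of the classic golden-ratio multiplier 2^32/phi = 0x9E3779B9,
 * and its lower 16 bits, 0x6DB1 (28081/65536, roughly 0.428), are a
 * fraction just under 3/7 -- exactly the split the comment describes.
 */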
  215: 
  216: /*
  217:  *	numdirtywakeup:
  218:  *
  219:  *	If someone is blocked due to there being too many dirty buffers,
  220:  *	and numdirtybuffers is now reasonable, wake them up.
  221:  */
  222: 
  223: static __inline void
  224: numdirtywakeup(int level)
  225: {
  226: 	if (numdirtybuffers <= level) {
  227: 		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
  228: 			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
  229: 			wakeup(&needsbuffer);
  230: 		}
  231: 	}
  232: }
  233: 
  234: /*
  235:  *	bufspacewakeup:
  236:  *
  237:  *	Called when buffer space is potentially available for recovery.
  238:  *	getnewbuf() will block on this flag when it is unable to free 
  239:  *	sufficient buffer space.  Buffer space becomes recoverable when 
  240:  *	bp's get placed back in the queues.
  241:  */
  242: 
  243: static __inline void
  244: bufspacewakeup(void)
  245: {
  246: 	/*
  247: 	 * If someone is waiting for BUF space, wake them up.  Even
  248: 	 * though we haven't freed the kva space yet, the waiting
  249: 	 * process will be able to now.
  250: 	 */
  251: 	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
  252: 		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
  253: 		wakeup(&needsbuffer);
  254: 	}
  255: }
  256: 
  257: /*
  258:  * runningbufwakeup() - in-progress I/O accounting.
  259:  *
  260:  */
  261: static __inline void
  262: runningbufwakeup(struct buf *bp)
  263: {
  264: 	if (bp->b_runningbufspace) {
  265: 		runningbufspace -= bp->b_runningbufspace;
  266: 		bp->b_runningbufspace = 0;
  267: 		if (runningbufreq && runningbufspace <= lorunningspace) {
  268: 			runningbufreq = 0;
  269: 			wakeup(&runningbufreq);
  270: 		}
  271: 	}
  272: }
  273: 
  274: /*
  275:  *	bufcountwakeup:
  276:  *
  277:  *	Called when a buffer has been added to one of the free queues to
  278:  *	account for the buffer and to wakeup anyone waiting for free buffers.
  279:  *	This typically occurs when large amounts of metadata are being handled
  280:  *	by the buffer cache ( else buffer space runs out first, usually ).
  281:  */
  282: 
  283: static __inline void
  284: bufcountwakeup(void) 
  285: {
  286: 	++numfreebuffers;
  287: 	if (needsbuffer) {
  288: 		needsbuffer &= ~VFS_BIO_NEED_ANY;
  289: 		if (numfreebuffers >= hifreebuffers)
  290: 			needsbuffer &= ~VFS_BIO_NEED_FREE;
  291: 		wakeup(&needsbuffer);
  292: 	}
  293: }
  294: 
  295: /*
  296:  *	waitrunningbufspace()
  297:  *
  298:  *	runningbufspace is a measure of the amount of I/O currently
  299:  *	running.  This routine is used in async-write situations to
  300:  *	prevent creating huge backups of pending writes to a device.
  301:  *	Only asynchronous writes are governed by this function.  
  302:  *
  303:  *	Reads will adjust runningbufspace, but will not block based on it.
  304:  *	The read load has a side effect of reducing the allowed write load.
  305:  *
  306:  *	This does NOT turn an async write into a sync write.  It waits
  307:  *	for earlier writes to complete and generally returns before the
  308:  *	caller's write has reached the device.
  309:  */
  310: static __inline void
  311: waitrunningbufspace(void)
  312: {
  313: 	while (runningbufspace > hirunningspace) {
  314: 		int s;
  315: 
  316: 		s = splbio();	/* fix race against interrupt/biodone() */
  317: 		++runningbufreq;
  318: 		tsleep(&runningbufreq, 0, "wdrain", 0);
  319: 		splx(s);
  320: 	}
  321: }
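
/*
 * For reference, the accounting that pairs with the wait above (all within
 * this file): bwrite() charges bp->b_bufsize to runningbufspace before
 * handing the buffer to the device, and runningbufwakeup() credits it back
 * on completion, waking any thread sleeping here once the total drops to
 * lorunningspace.
 */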
  322: 
  323: /*
  324:  *	vfs_buf_test_cache:
  325:  *
  326:  *	Called when a buffer is extended.  This function clears the B_CACHE
  327:  *	bit if the newly extended portion of the buffer does not contain
  328:  *	valid data.
  329:  */
  330: static __inline__
  331: void
  332: vfs_buf_test_cache(struct buf *bp,
  333: 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
  334: 		  vm_page_t m)
  335: {
  336: 	if (bp->b_flags & B_CACHE) {
  337: 		int base = (foff + off) & PAGE_MASK;
  338: 		if (vm_page_is_valid(m, base, size) == 0)
  339: 			bp->b_flags &= ~B_CACHE;
  340: 	}
  341: }
  342: 
  343: static __inline__
  344: void
  345: bd_wakeup(int dirtybuflevel)
  346: {
  347: 	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
  348: 		bd_request = 1;
  349: 		wakeup(&bd_request);
  350: 	}
  351: }
  352: 
  353: /*
  354:  * bd_speedup - speedup the buffer cache flushing code
  355:  */
  356: 
  357: static __inline__
  358: void
  359: bd_speedup(void)
  360: {
  361: 	bd_wakeup(1);
  362: }
  363: 
  364: /*
  365:  * Initialize buffer headers and related structures. 
  366:  */
  367: 
  368: caddr_t
  369: bufhashinit(caddr_t vaddr)
  370: {
  371: 	/* first, make a null hash table */
  372: 	bufhashshift = 29;
  373: 	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
  374: 		bufhashshift--;
  375: 	bufhashtbl = (void *)vaddr;
  376: 	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
  377: 	--bufhashmask;
  378: 	return(vaddr);
  379: }
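
/*
 * Worked example (illustrative only): with nbuf = 1024 the loop above runs
 * while bufhashmask < 256, exiting with bufhashmask = 256 and bufhashshift
 * = 24; the final decrement leaves bufhashmask = 255.  bufhash() then keeps
 * the top 8 bits of the 32-bit product (>> 24, & 255), giving 256 hash
 * buckets for 1024 buffers.
 */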
  380: 
  381: void
  382: bufinit(void)
  383: {
  384: 	struct buf *bp;
  385: 	int i;
  386: 
  387: 	TAILQ_INIT(&bswlist);
  388: 	LIST_INIT(&invalhash);
  389: 	lwkt_token_init(&buftimetoken);
  390: 
  391: 	for (i = 0; i <= bufhashmask; i++)
  392: 		LIST_INIT(&bufhashtbl[i]);
  393: 
  394: 	/* next, make a null set of free lists */
  395: 	for (i = 0; i < BUFFER_QUEUES; i++)
  396: 		TAILQ_INIT(&bufqueues[i]);
  397: 
  398: 	/* finally, initialize each buffer header and stick on empty q */
  399: 	for (i = 0; i < nbuf; i++) {
  400: 		bp = &buf[i];
  401: 		bzero(bp, sizeof *bp);
  402: 		bp->b_flags = B_INVAL;	/* we're just an empty header */
  403: 		bp->b_dev = NODEV;
  404: 		bp->b_qindex = QUEUE_EMPTY;
  405: 		bp->b_xflags = 0;
  406: 		LIST_INIT(&bp->b_dep);
  407: 		BUF_LOCKINIT(bp);
  408: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
  409: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
  410: 	}
  411: 
  412: 	/*
  413: 	 * maxbufspace is the absolute maximum amount of buffer space we are 
  414: 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
  415: 	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
  416: 	 * used by most other processes.  The differential is required to 
  417: 	 * ensure that buf_daemon is able to run when other processes might 
  418: 	 * be blocked waiting for buffer space.
  419: 	 *
   420: 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
  421: 	 * this may result in KVM fragmentation which is not handled optimally
  422: 	 * by the system.
  423: 	 */
  424: 	maxbufspace = nbuf * BKVASIZE;
  425: 	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
  426: 	lobufspace = hibufspace - MAXBSIZE;
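
/*
 * Illustrative sizing example (hypothetical numbers; the real values depend
 * on nbuf, BKVASIZE and MAXBSIZE for the platform): with nbuf = 4096,
 * BKVASIZE = 16KB and MAXBSIZE = 64KB, maxbufspace = 64MB, hibufspace =
 * max(48MB, 64MB - 640KB) or roughly 63.4MB, and lobufspace is one MAXBSIZE
 * less, roughly 63.3MB.
 */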
  427: 
  428: 	lorunningspace = 512 * 1024;
  429: 	hirunningspace = 1024 * 1024;
  430: 
  431: /*
  432:  * Limit the amount of malloc memory since it is wired permanently into
  433:  * the kernel space.  Even though this is accounted for in the buffer
  434:  * allocation, we don't want the malloced region to grow uncontrolled.
  435:  * The malloc scheme improves memory utilization significantly on average
  436:  * (small) directories.
  437:  */
  438: 	maxbufmallocspace = hibufspace / 20;
  439: 
  440: /*
   441:  * Reduce the chance of a deadlock occurring by limiting the number
  442:  * of delayed-write dirty buffers we allow to stack up.
  443:  */
  444: 	hidirtybuffers = nbuf / 4 + 20;
  445: 	numdirtybuffers = 0;
  446: /*
  447:  * To support extreme low-memory systems, make sure hidirtybuffers cannot
  448:  * eat up all available buffer space.  This occurs when our minimum cannot
  449:  * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
  450:  * BKVASIZE'd (8K) buffers.
  451:  */
  452: 	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
  453: 		hidirtybuffers >>= 1;
  454: 	}
  455: 	lodirtybuffers = hidirtybuffers / 2;
  456: 
  457: /*
  458:  * Try to keep the number of free buffers in the specified range,
  459:  * and give special processes (e.g. like buf_daemon) access to an 
  460:  * emergency reserve.
  461:  */
  462: 	lofreebuffers = nbuf / 18 + 5;
  463: 	hifreebuffers = 2 * lofreebuffers;
  464: 	numfreebuffers = nbuf;
  465: 
  466: /*
  467:  * Maximum number of async ops initiated per buf_daemon loop.  This is
  468:  * somewhat of a hack at the moment, we really need to limit ourselves
  469:  * based on the number of bytes of I/O in-transit that were initiated
  470:  * from buf_daemon.
  471:  */
  472: 
  473: 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
  474: 	bogus_page = vm_page_alloc(kernel_object,
  475: 			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
  476: 			VM_ALLOC_NORMAL);
  477: 	vmstats.v_wire_count++;
  478: 
  479: }
  480: 
  481: /*
  482:  * bfreekva() - free the kva allocation for a buffer.
  483:  *
  484:  *	Must be called at splbio() or higher as this is the only locking for
  485:  *	buffer_map.
  486:  *
  487:  *	Since this call frees up buffer space, we call bufspacewakeup().
  488:  */
  489: static void
  490: bfreekva(struct buf * bp)
  491: {
  492: 	int count;
  493: 
  494: 	if (bp->b_kvasize) {
  495: 		++buffreekvacnt;
  496: 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
  497: 		vm_map_lock(buffer_map);
  498: 		bufspace -= bp->b_kvasize;
  499: 		vm_map_delete(buffer_map,
  500: 		    (vm_offset_t) bp->b_kvabase,
  501: 		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize,
  502: 		    &count
  503: 		);
  504: 		vm_map_unlock(buffer_map);
  505: 		vm_map_entry_release(count);
  506: 		bp->b_kvasize = 0;
  507: 		bufspacewakeup();
  508: 	}
  509: }
  510: 
  511: /*
  512:  *	bremfree:
  513:  *
  514:  *	Remove the buffer from the appropriate free list.
  515:  */
  516: void
  517: bremfree(struct buf * bp)
  518: {
  519: 	int s = splbio();
  520: 	int old_qindex = bp->b_qindex;
  521: 
  522: 	if (bp->b_qindex != QUEUE_NONE) {
  523: 		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
  524: 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
  525: 		bp->b_qindex = QUEUE_NONE;
  526: 	} else {
  527: 		if (BUF_REFCNT(bp) <= 1)
  528: 			panic("bremfree: removing a buffer not on a queue");
  529: 	}
  530: 
  531: 	/*
  532: 	 * Fixup numfreebuffers count.  If the buffer is invalid or not
  533: 	 * delayed-write, and it was on the EMPTY, LRU, or AGE queues,
  534: 	 * the buffer was free and we must decrement numfreebuffers.
  535: 	 */
  536: 	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
  537: 		switch(old_qindex) {
  538: 		case QUEUE_DIRTY:
  539: 		case QUEUE_CLEAN:
  540: 		case QUEUE_EMPTY:
  541: 		case QUEUE_EMPTYKVA:
  542: 			--numfreebuffers;
  543: 			break;
  544: 		default:
  545: 			break;
  546: 		}
  547: 	}
  548: 	splx(s);
  549: }
  550: 
  551: 
  552: /*
  553:  * Get a buffer with the specified data.  Look in the cache first.  We
  554:  * must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  555:  * is set, the buffer is valid and we do not have to do anything ( see
  556:  * getblk() ).
  557:  */
  558: int
  559: bread(struct vnode * vp, daddr_t blkno, int size, struct buf ** bpp)
  560: {
  561: 	struct buf *bp;
  562: 
  563: 	bp = getblk(vp, blkno, size, 0, 0);
  564: 	*bpp = bp;
  565: 
  566: 	/* if not found in cache, do some I/O */
  567: 	if ((bp->b_flags & B_CACHE) == 0) {
  568: 		KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
  569: 		bp->b_flags |= B_READ;
  570: 		bp->b_flags &= ~(B_ERROR | B_INVAL);
  571: 		vfs_busy_pages(bp, 0);
  572: 		VOP_STRATEGY(vp, bp);
  573: 		return (biowait(bp));
  574: 	}
  575: 	return (0);
  576: }
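
/*
 * Typical caller pattern for bread() (illustrative only; blkno and size are
 * whatever the filesystem computed):
 *
 *	if ((error = bread(vp, blkno, size, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use bp->b_data...
 *	brelse(bp);		(or bdwrite()/bwrite() if modified)
 */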
  577: 
  578: /*
  579:  * Operates like bread, but also starts asynchronous I/O on
  580:  * read-ahead blocks.  We must clear B_ERROR and B_INVAL prior
   581:  * to initiating I/O.  If B_CACHE is set, the buffer is valid 
  582:  * and we do not have to do anything.
  583:  */
  584: int
  585: breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno,
  586: 	int *rabsize, int cnt, struct buf ** bpp)
  587: {
  588: 	struct buf *bp, *rabp;
  589: 	int i;
  590: 	int rv = 0, readwait = 0;
  591: 
  592: 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
  593: 
  594: 	/* if not found in cache, do some I/O */
  595: 	if ((bp->b_flags & B_CACHE) == 0) {
  596: 		bp->b_flags |= B_READ;
  597: 		bp->b_flags &= ~(B_ERROR | B_INVAL);
  598: 		vfs_busy_pages(bp, 0);
  599: 		VOP_STRATEGY(vp, bp);
  600: 		++readwait;
  601: 	}
  602: 
  603: 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
  604: 		if (inmem(vp, *rablkno))
  605: 			continue;
  606: 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
  607: 
  608: 		if ((rabp->b_flags & B_CACHE) == 0) {
  609: 			rabp->b_flags |= B_READ | B_ASYNC;
  610: 			rabp->b_flags &= ~(B_ERROR | B_INVAL);
  611: 			vfs_busy_pages(rabp, 0);
  612: 			BUF_KERNPROC(rabp);
  613: 			VOP_STRATEGY(vp, rabp);
  614: 		} else {
  615: 			brelse(rabp);
  616: 		}
  617: 	}
  618: 
  619: 	if (readwait) {
  620: 		rv = biowait(bp);
  621: 	}
  622: 	return (rv);
  623: }
  624: 
  625: /*
  626:  * Write, release buffer on completion.  (Done by iodone
  627:  * if async).  Do not bother writing anything if the buffer
  628:  * is invalid.
  629:  *
  630:  * Note that we set B_CACHE here, indicating that buffer is
  631:  * fully valid and thus cacheable.  This is true even of NFS
  632:  * now so we set it generally.  This could be set either here 
  633:  * or in biodone() since the I/O is synchronous.  We put it
  634:  * here.
  635:  */
  636: int
  637: bwrite(struct buf * bp)
  638: {
  639: 	int oldflags, s;
  640: 	struct buf *newbp;
  641: 
  642: 	if (bp->b_flags & B_INVAL) {
  643: 		brelse(bp);
  644: 		return (0);
  645: 	}
  646: 
  647: 	oldflags = bp->b_flags;
  648: 
  649: 	if (BUF_REFCNT(bp) == 0)
  650: 		panic("bwrite: buffer is not busy???");
  651: 	s = splbio();
  652: 	/*
  653: 	 * If a background write is already in progress, delay
  654: 	 * writing this block if it is asynchronous. Otherwise
  655: 	 * wait for the background write to complete.
  656: 	 */
  657: 	if (bp->b_xflags & BX_BKGRDINPROG) {
  658: 		if (bp->b_flags & B_ASYNC) {
  659: 			splx(s);
  660: 			bdwrite(bp);
  661: 			return (0);
  662: 		}
  663: 		bp->b_xflags |= BX_BKGRDWAIT;
  664: 		tsleep(&bp->b_xflags, 0, "biord", 0);
  665: 		if (bp->b_xflags & BX_BKGRDINPROG)
  666: 			panic("bwrite: still writing");
  667: 	}
  668: 
  669: 	/* Mark the buffer clean */
  670: 	bundirty(bp);
  671: 
  672: 	/*
  673: 	 * If this buffer is marked for background writing and we
  674: 	 * do not have to wait for it, make a copy and write the
  675: 	 * copy so as to leave this buffer ready for further use.
  676: 	 *
  677: 	 * This optimization eats a lot of memory.  If we have a page
   678: 	 * or buffer shortfall we can't do it.
  679: 	 */
  680: 	if (dobkgrdwrite &&
  681: 	    (bp->b_xflags & BX_BKGRDWRITE) &&
  682: 	    (bp->b_flags & B_ASYNC) &&
  683: 	    !vm_page_count_severe() &&
  684: 	    !buf_dirty_count_severe()) {
  685: 		if (bp->b_flags & B_CALL)
  686: 			panic("bwrite: need chained iodone");
  687: 
  688: 		/* get a new block */
  689: 		newbp = geteblk(bp->b_bufsize);
  690: 
  691: 		/* set it to be identical to the old block */
  692: 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
  693: 		bgetvp(bp->b_vp, newbp);
  694: 		newbp->b_lblkno = bp->b_lblkno;
  695: 		newbp->b_blkno = bp->b_blkno;
  696: 		newbp->b_offset = bp->b_offset;
  697: 		newbp->b_iodone = vfs_backgroundwritedone;
  698: 		newbp->b_flags |= B_ASYNC | B_CALL;
  699: 		newbp->b_flags &= ~B_INVAL;
  700: 
  701: 		/* move over the dependencies */
  702: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
  703: 			(*bioops.io_movedeps)(bp, newbp);
  704: 
  705: 		/*
  706: 		 * Initiate write on the copy, release the original to
  707: 		 * the B_LOCKED queue so that it cannot go away until
  708: 		 * the background write completes. If not locked it could go
  709: 		 * away and then be reconstituted while it was being written.
  710: 		 * If the reconstituted buffer were written, we could end up
  711: 		 * with two background copies being written at the same time.
  712: 		 */
  713: 		bp->b_xflags |= BX_BKGRDINPROG;
  714: 		bp->b_flags |= B_LOCKED;
  715: 		bqrelse(bp);
  716: 		bp = newbp;
  717: 	}
  718: 
  719: 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
  720: 	bp->b_flags |= B_WRITEINPROG | B_CACHE;
  721: 
  722: 	bp->b_vp->v_numoutput++;
  723: 	vfs_busy_pages(bp, 1);
  724: 
  725: 	/*
  726: 	 * Normal bwrites pipeline writes
  727: 	 */
  728: 	bp->b_runningbufspace = bp->b_bufsize;
  729: 	runningbufspace += bp->b_runningbufspace;
  730: 
  731: 	splx(s);
  732: 	if (oldflags & B_ASYNC)
  733: 		BUF_KERNPROC(bp);
  734: 	VOP_STRATEGY(bp->b_vp, bp);
  735: 
  736: 	if ((oldflags & B_ASYNC) == 0) {
  737: 		int rtval = biowait(bp);
  738: 		brelse(bp);
  739: 		return (rtval);
  740: 	} else if ((oldflags & B_NOWDRAIN) == 0) {
  741: 		/*
  742: 		 * don't allow the async write to saturate the I/O
  743: 		 * system.  Deadlocks can occur only if a device strategy
  744: 		 * routine (like in VN) turns around and issues another
  745: 		 * high-level write, in which case B_NOWDRAIN is expected
  746: 		 * to be set.   Otherwise we will not deadlock here because
  747: 		 * we are blocking waiting for I/O that is already in-progress
  748: 		 * to complete.
  749: 		 */
  750: 		waitrunningbufspace();
  751: 	}
  752: 
  753: 	return (0);
  754: }
  755: 
  756: /*
  757:  * Complete a background write started from bwrite.
  758:  */
  759: static void
  760: vfs_backgroundwritedone(bp)
  761: 	struct buf *bp;
  762: {
  763: 	struct buf *origbp;
  764: 
  765: 	/*
  766: 	 * Find the original buffer that we are writing.
  767: 	 */
  768: 	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
  769: 		panic("backgroundwritedone: lost buffer");
  770: 	/*
  771: 	 * Process dependencies then return any unfinished ones.
  772: 	 */
  773: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
  774: 		(*bioops.io_complete)(bp);
  775: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
  776: 		(*bioops.io_movedeps)(bp, origbp);
  777: 	/*
  778: 	 * Clear the BX_BKGRDINPROG flag in the original buffer
  779: 	 * and awaken it if it is waiting for the write to complete.
  780: 	 * If BX_BKGRDINPROG is not set in the original buffer it must
  781: 	 * have been released and re-instantiated - which is not legal.
  782: 	 */
  783: 	KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
  784: 	origbp->b_xflags &= ~BX_BKGRDINPROG;
  785: 	if (origbp->b_xflags & BX_BKGRDWAIT) {
  786: 		origbp->b_xflags &= ~BX_BKGRDWAIT;
  787: 		wakeup(&origbp->b_xflags);
  788: 	}
  789: 	/*
  790: 	 * Clear the B_LOCKED flag and remove it from the locked
  791: 	 * queue if it currently resides there.
  792: 	 */
  793: 	origbp->b_flags &= ~B_LOCKED;
  794: 	if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
  795: 		bremfree(origbp);
  796: 		bqrelse(origbp);
  797: 	}
  798: 	/*
  799: 	 * This buffer is marked B_NOCACHE, so when it is released
  800: 	 * by biodone, it will be tossed. We mark it with B_READ
  801: 	 * to avoid biodone doing a second vwakeup.
  802: 	 */
  803: 	bp->b_flags |= B_NOCACHE | B_READ;
  804: 	bp->b_flags &= ~(B_CACHE | B_CALL | B_DONE);
  805: 	bp->b_iodone = 0;
  806: 	biodone(bp);
  807: }
  808: 
  809: /*
  810:  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  811:  * anything if the buffer is marked invalid.
  812:  *
  813:  * Note that since the buffer must be completely valid, we can safely
   814:  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
  815:  * biodone() in order to prevent getblk from writing the buffer
  816:  * out synchronously.
  817:  */
  818: void
  819: bdwrite(struct buf * bp)
  820: {
  821: 	if (BUF_REFCNT(bp) == 0)
  822: 		panic("bdwrite: buffer is not busy");
  823: 
  824: 	if (bp->b_flags & B_INVAL) {
  825: 		brelse(bp);
  826: 		return;
  827: 	}
  828: 	bdirty(bp);
  829: 
  830: 	/*
  831: 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
  832: 	 * true even of NFS now.
  833: 	 */
  834: 	bp->b_flags |= B_CACHE;
  835: 
  836: 	/*
  837: 	 * This bmap keeps the system from needing to do the bmap later,
  838: 	 * perhaps when the system is attempting to do a sync.  Since it
   839: 	 * is likely that the indirect block -- or whatever other data
   840: 	 * structure the filesystem needs -- is still in memory now, it is
   841: 	 * a good thing to do this.  Note also that if the pageout daemon
   842: 	 * is requesting a sync, there might not be enough memory to do
   843: 	 * the bmap then, so this is important to do.
  844: 	 */
  845: 	if (bp->b_lblkno == bp->b_blkno) {
  846: 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
  847: 	}
  848: 
  849: 	/*
  850: 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
  851: 	 */
  852: 	vfs_setdirty(bp);
  853: 
  854: 	/*
  855: 	 * We need to do this here to satisfy the vnode_pager and the
  856: 	 * pageout daemon, so that it thinks that the pages have been
  857: 	 * "cleaned".  Note that since the pages are in a delayed write
  858: 	 * buffer -- the VFS layer "will" see that the pages get written
  859: 	 * out on the next sync, or perhaps the cluster will be completed.
  860: 	 */
  861: 	vfs_clean_pages(bp);
  862: 	bqrelse(bp);
  863: 
  864: 	/*
  865: 	 * Wakeup the buffer flushing daemon if we have a lot of dirty
  866: 	 * buffers (midpoint between our recovery point and our stall
  867: 	 * point).
  868: 	 */
  869: 	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
  870: 
  871: 	/*
  872: 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
  873: 	 * due to the softdep code.
  874: 	 */
  875: }
  876: 
  877: /*
  878:  *	bdirty:
  879:  *
  880:  *	Turn buffer into delayed write request.  We must clear B_READ and
  881:  *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to 
  882:  *	itself to properly update it in the dirty/clean lists.  We mark it
  883:  *	B_DONE to ensure that any asynchronization of the buffer properly
  884:  *	clears B_DONE ( else a panic will occur later ).  
  885:  *
  886:  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
  887:  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
  888:  *	should only be called if the buffer is known-good.
  889:  *
  890:  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  891:  *	count.
  892:  *
  893:  *	Must be called at splbio().
  894:  *	The buffer must be on QUEUE_NONE.
  895:  */
  896: void
  897: bdirty(bp)
  898: 	struct buf *bp;
  899: {
  900: 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
  901: 	bp->b_flags &= ~(B_READ|B_RELBUF);
  902: 
  903: 	if ((bp->b_flags & B_DELWRI) == 0) {
  904: 		bp->b_flags |= B_DONE | B_DELWRI;
  905: 		reassignbuf(bp, bp->b_vp);
  906: 		++numdirtybuffers;
  907: 		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
  908: 	}
  909: }
  910: 
  911: /*
  912:  *	bundirty:
  913:  *
  914:  *	Clear B_DELWRI for buffer.
  915:  *
  916:  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  917:  *	count.
  918:  *	
  919:  *	Must be called at splbio().
  920:  *	The buffer must be on QUEUE_NONE.
  921:  */
  922: 
  923: void
  924: bundirty(bp)
  925: 	struct buf *bp;
  926: {
  927: 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
  928: 
  929: 	if (bp->b_flags & B_DELWRI) {
  930: 		bp->b_flags &= ~B_DELWRI;
  931: 		reassignbuf(bp, bp->b_vp);
  932: 		--numdirtybuffers;
  933: 		numdirtywakeup(lodirtybuffers);
  934: 	}
  935: 	/*
  936: 	 * Since it is now being written, we can clear its deferred write flag.
  937: 	 */
  938: 	bp->b_flags &= ~B_DEFERRED;
  939: }
  940: 
  941: /*
  942:  *	bawrite:
  943:  *
  944:  *	Asynchronous write.  Start output on a buffer, but do not wait for
  945:  *	it to complete.  The buffer is released when the output completes.
  946:  *
  947:  *	bwrite() ( or the VOP routine anyway ) is responsible for handling 
  948:  *	B_INVAL buffers.  Not us.
  949:  */
  950: void
  951: bawrite(struct buf * bp)
  952: {
  953: 	bp->b_flags |= B_ASYNC;
  954: 	(void) VOP_BWRITE(bp->b_vp, bp);
  955: }
  956: 
  957: /*
  958:  *	bowrite:
  959:  *
  960:  *	Ordered write.  Start output on a buffer, and flag it so that the 
  961:  *	device will write it in the order it was queued.  The buffer is 
  962:  *	released when the output completes.  bwrite() ( or the VOP routine
  963:  *	anyway ) is responsible for handling B_INVAL buffers.
  964:  */
  965: int
  966: bowrite(struct buf * bp)
  967: {
  968: 	bp->b_flags |= B_ORDERED | B_ASYNC;
  969: 	return (VOP_BWRITE(bp->b_vp, bp));
  970: }
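
/*
 * Quick reference for the write entry points above, summarizing the code in
 * this file:
 *
 *	bwrite()  - starts the write now; waits for completion unless the
 *		    caller set B_ASYNC, in which case it only throttles
 *		    against runningbufspace (unless B_NOWDRAIN is set).
 *	bdwrite() - marks the buffer B_DELWRI and requeues it; no I/O is
 *		    started here.
 *	bawrite() - sets B_ASYNC and pushes the buffer through VOP_BWRITE().
 *	bowrite() - like bawrite() but also sets B_ORDERED so the device
 *		    preserves queue ordering.
 */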
  971: 
  972: /*
  973:  *	bwillwrite:
  974:  *
  975:  *	Called prior to the locking of any vnodes when we are expecting to
  976:  *	write.  We do not want to starve the buffer cache with too many
  977:  *	dirty buffers so we block here.  By blocking prior to the locking
  978:  *	of any vnodes we attempt to avoid the situation where a locked vnode
  979:  *	prevents the various system daemons from flushing related buffers.
  980:  */
  981: 
  982: void
  983: bwillwrite(void)
  984: {
  985: 	if (numdirtybuffers >= hidirtybuffers) {
  986: 		int s;
  987: 
  988: 		s = splbio();
  989: 		while (numdirtybuffers >= hidirtybuffers) {
  990: 			bd_wakeup(1);
  991: 			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
  992: 			tsleep(&needsbuffer, 0, "flswai", 0);
  993: 		}
  994: 		splx(s);
  995: 	}
  996: }
  997: 
  998: /*
  999:  * Return true if we have too many dirty buffers.
 1000:  */
 1001: int
 1002: buf_dirty_count_severe(void)
 1003: {
 1004: 	return(numdirtybuffers >= hidirtybuffers);
 1005: }
 1006: 
 1007: /*
 1008:  *	brelse:
 1009:  *
 1010:  *	Release a busy buffer and, if requested, free its resources.  The
 1011:  *	buffer will be stashed in the appropriate bufqueue[] allowing it
 1012:  *	to be accessed later as a cache entity or reused for other purposes.
 1013:  */
 1014: void
 1015: brelse(struct buf * bp)
 1016: {
 1017: 	int s;
 1018: 
 1019: 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1020: 
 1021: 	s = splbio();
 1022: 
 1023: 	if (bp->b_flags & B_LOCKED)
 1024: 		bp->b_flags &= ~B_ERROR;
 1025: 
 1026: 	if ((bp->b_flags & (B_READ | B_ERROR | B_INVAL)) == B_ERROR) {
 1027: 		/*
 1028: 		 * Failed write, redirty.  Must clear B_ERROR to prevent
 1029: 		 * pages from being scrapped.  If B_INVAL is set then
 1030: 		 * this case is not run and the next case is run to 
 1031: 		 * destroy the buffer.  B_INVAL can occur if the buffer
 1032: 		 * is outside the range supported by the underlying device.
 1033: 		 */
 1034: 		bp->b_flags &= ~B_ERROR;
 1035: 		bdirty(bp);
 1036: 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
 1037: 	    (bp->b_bufsize <= 0)) {
 1038: 		/*
 1039: 		 * Either a failed I/O or we were asked to free or not
 1040: 		 * cache the buffer.
 1041: 		 */
 1042: 		bp->b_flags |= B_INVAL;
 1043: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 1044: 			(*bioops.io_deallocate)(bp);
 1045: 		if (bp->b_flags & B_DELWRI) {
 1046: 			--numdirtybuffers;
 1047: 			numdirtywakeup(lodirtybuffers);
 1048: 		}
 1049: 		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
 1050: 		if ((bp->b_flags & B_VMIO) == 0) {
 1051: 			if (bp->b_bufsize)
 1052: 				allocbuf(bp, 0);
 1053: 			if (bp->b_vp)
 1054: 				brelvp(bp);
 1055: 		}
 1056: 	}
 1057: 
 1058: 	/*
 1059: 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release() 
 1060: 	 * is called with B_DELWRI set, the underlying pages may wind up
 1061: 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 1062: 	 * because pages associated with a B_DELWRI bp are marked clean.
 1063: 	 * 
 1064: 	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
 1065: 	 * if B_DELWRI is set.
 1066: 	 *
 1067: 	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
 1068: 	 * on pages to return pages to the VM page queues.
 1069: 	 */
 1070: 	if (bp->b_flags & B_DELWRI)
 1071: 		bp->b_flags &= ~B_RELBUF;
 1072: 	else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
 1073: 		bp->b_flags |= B_RELBUF;
 1074: 
 1075: 	/*
 1076: 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
  1077: 	 * constituted, not even NFS buffers now.  Two flags affect this.  If
 1078: 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 1079: 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 1080: 	 *
 1081: 	 * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
 1082: 	 * invalidated.  B_ERROR cannot be set for a failed write unless the
 1083: 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 1084: 	 *
 1085: 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 1086: 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 1087: 	 * the commit state and we cannot afford to lose the buffer. If the
 1088: 	 * buffer has a background write in progress, we need to keep it
 1089: 	 * around to prevent it from being reconstituted and starting a second
 1090: 	 * background write.
 1091: 	 */
 1092: 	if ((bp->b_flags & B_VMIO)
 1093: 	    && !(bp->b_vp->v_tag == VT_NFS &&
 1094: 		 !vn_isdisk(bp->b_vp, NULL) &&
 1095: 		 (bp->b_flags & B_DELWRI))
 1096: 	    ) {
 1097: 
 1098: 		int i, j, resid;
 1099: 		vm_page_t m;
 1100: 		off_t foff;
 1101: 		vm_pindex_t poff;
 1102: 		vm_object_t obj;
 1103: 		struct vnode *vp;
 1104: 
 1105: 		vp = bp->b_vp;
 1106: 
 1107: 		/*
 1108: 		 * Get the base offset and length of the buffer.  Note that 
 1109: 		 * in the VMIO case if the buffer block size is not
 1110: 		 * page-aligned then b_data pointer may not be page-aligned.
  1111: 		 * page-aligned then the b_data pointer may not be page-aligned.
 1112: 		 *
  1113: 		 * block sizes less than DEV_BSIZE (usually 512) are not 
 1114: 		 * supported due to the page granularity bits (m->valid,
 1115: 		 * m->dirty, etc...). 
 1116: 		 *
 1117: 		 * See man buf(9) for more information
 1118: 		 */
 1119: 
 1120: 		resid = bp->b_bufsize;
 1121: 		foff = bp->b_offset;
 1122: 
 1123: 		for (i = 0; i < bp->b_npages; i++) {
 1124: 			m = bp->b_pages[i];
 1125: 			vm_page_flag_clear(m, PG_ZERO);
 1126: 			/*
 1127: 			 * If we hit a bogus page, fixup *all* of them
 1128: 			 * now.  Note that we left these pages wired
 1129: 			 * when we removed them so they had better exist,
 1130: 			 * and they cannot be ripped out from under us so
 1131: 			 * no splvm() protection is necessary.
 1132: 			 */
 1133: 			if (m == bogus_page) {
 1134: 				VOP_GETVOBJECT(vp, &obj);
 1135: 				poff = OFF_TO_IDX(bp->b_offset);
 1136: 
 1137: 				for (j = i; j < bp->b_npages; j++) {
 1138: 					vm_page_t mtmp;
 1139: 
 1140: 					mtmp = bp->b_pages[j];
 1141: 					if (mtmp == bogus_page) {
 1142: 						mtmp = vm_page_lookup(obj, poff + j);
 1143: 						if (!mtmp) {
 1144: 							panic("brelse: page missing");
 1145: 						}
 1146: 						bp->b_pages[j] = mtmp;
 1147: 					}
 1148: 				}
 1149: 
 1150: 				if ((bp->b_flags & B_INVAL) == 0) {
 1151: 					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 1152: 				}
 1153: 				m = bp->b_pages[i];
 1154: 			}
 1155: 
 1156: 			/*
 1157: 			 * Invalidate the backing store if B_NOCACHE is set
 1158: 			 * (e.g. used with vinvalbuf()).  If this is NFS
 1159: 			 * we impose a requirement that the block size be
 1160: 			 * a multiple of PAGE_SIZE and create a temporary
 1161: 			 * hack to basically invalidate the whole page.  The
 1162: 			 * problem is that NFS uses really odd buffer sizes
 1163: 			 * especially when tracking piecemeal writes and
 1164: 			 * it also vinvalbuf()'s a lot, which would result
 1165: 			 * in only partial page validation and invalidation
 1166: 			 * here.  If the file page is mmap()'d, however,
 1167: 			 * all the valid bits get set so after we invalidate
 1168: 			 * here we would end up with weird m->valid values
 1169: 			 * like 0xfc.  nfs_getpages() can't handle this so
 1170: 			 * we clear all the valid bits for the NFS case
 1171: 			 * instead of just some of them.
 1172: 			 *
 1173: 			 * The real bug is the VM system having to set m->valid
 1174: 			 * to VM_PAGE_BITS_ALL for faulted-in pages, which
 1175: 			 * itself is an artifact of the whole 512-byte
 1176: 			 * granular mess that exists to support odd block 
 1177: 			 * sizes and UFS meta-data block sizes (e.g. 6144).
 1178: 			 * A complete rewrite is required.
 1179: 			 */
 1180: 			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
 1181: 				int poffset = foff & PAGE_MASK;
 1182: 				int presid;
 1183: 
 1184: 				presid = PAGE_SIZE - poffset;
 1185: 				if (bp->b_vp->v_tag == VT_NFS &&
 1186: 				    bp->b_vp->v_type == VREG) {
 1187: 					; /* entire page */
 1188: 				} else if (presid > resid) {
 1189: 					presid = resid;
 1190: 				}
 1191: 				KASSERT(presid >= 0, ("brelse: extra page"));
 1192: 				vm_page_set_invalid(m, poffset, presid);
 1193: 			}
 1194: 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
 1195: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 1196: 		}
 1197: 
 1198: 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 1199: 			vfs_vmio_release(bp);
 1200: 
 1201: 	} else if (bp->b_flags & B_VMIO) {
 1202: 
 1203: 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 1204: 			vfs_vmio_release(bp);
 1205: 
 1206: 	}
 1207: 			
 1208: 	if (bp->b_qindex != QUEUE_NONE)
 1209: 		panic("brelse: free buffer onto another queue???");
 1210: 	if (BUF_REFCNT(bp) > 1) {
 1211: 		/* Temporary panic to verify exclusive locking */
 1212: 		/* This panic goes away when we allow shared refs */
 1213: 		panic("brelse: multiple refs");
 1214: 		/* do not release to free list */
 1215: 		BUF_UNLOCK(bp);
 1216: 		splx(s);
 1217: 		return;
 1218: 	}
 1219: 
 1220: 	/* enqueue */
 1221: 
 1222: 	/* buffers with no memory */
 1223: 	if (bp->b_bufsize == 0) {
 1224: 		bp->b_flags |= B_INVAL;
 1225: 		bp->b_xflags &= ~BX_BKGRDWRITE;
 1226: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1227: 			panic("losing buffer 1");
 1228: 		if (bp->b_kvasize) {
 1229: 			bp->b_qindex = QUEUE_EMPTYKVA;
 1230: 		} else {
 1231: 			bp->b_qindex = QUEUE_EMPTY;
 1232: 		}
 1233: 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 1234: 		LIST_REMOVE(bp, b_hash);
 1235: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1236: 		bp->b_dev = NODEV;
 1237: 	/* buffers with junk contents */
 1238: 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
 1239: 		bp->b_flags |= B_INVAL;
 1240: 		bp->b_xflags &= ~BX_BKGRDWRITE;
 1241: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1242: 			panic("losing buffer 2");
 1243: 		bp->b_qindex = QUEUE_CLEAN;
 1244: 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1245: 		LIST_REMOVE(bp, b_hash);
 1246: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1247: 		bp->b_dev = NODEV;
 1248: 
 1249: 	/* buffers that are locked */
 1250: 	} else if (bp->b_flags & B_LOCKED) {
 1251: 		bp->b_qindex = QUEUE_LOCKED;
 1252: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 1253: 
 1254: 	/* remaining buffers */
 1255: 	} else {
 1256: 		switch(bp->b_flags & (B_DELWRI|B_AGE)) {
 1257: 		case B_DELWRI | B_AGE:
 1258: 		    bp->b_qindex = QUEUE_DIRTY;
 1259: 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1260: 		    break;
 1261: 		case B_DELWRI:
 1262: 		    bp->b_qindex = QUEUE_DIRTY;
 1263: 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1264: 		    break;
 1265: 		case B_AGE:
 1266: 		    bp->b_qindex = QUEUE_CLEAN;
 1267: 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1268: 		    break;
 1269: 		default:
 1270: 		    bp->b_qindex = QUEUE_CLEAN;
 1271: 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1272: 		    break;
 1273: 		}
 1274: 	}
 1275: 
 1276: 	/*
 1277: 	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
 1278: 	 * on the correct queue.
 1279: 	 */
 1280: 	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
 1281: 		bundirty(bp);
 1282: 
 1283: 	/*
 1284: 	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
 1285: 	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
 1286: 	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
 1287: 	 * if B_INVAL is set ).
 1288: 	 */
 1289: 
 1290: 	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
 1291: 		bufcountwakeup();
 1292: 
 1293: 	/*
 1294: 	 * Something we can maybe free or reuse
 1295: 	 */
 1296: 	if (bp->b_bufsize || bp->b_kvasize)
 1297: 		bufspacewakeup();
 1298: 
 1299: 	/* unlock */
 1300: 	BUF_UNLOCK(bp);
 1301: 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
 1302: 			B_DIRECT | B_NOWDRAIN);
 1303: 	splx(s);
 1304: }
 1305: 
 1306: /*
 1307:  * Release a buffer back to the appropriate queue but do not try to free
 1308:  * it.  The buffer is expected to be used again soon.
 1309:  *
 1310:  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 1311:  * biodone() to requeue an async I/O on completion.  It is also used when
 1312:  * known good buffers need to be requeued but we think we may need the data
 1313:  * again soon.
 1314:  *
 1315:  * XXX we should be able to leave the B_RELBUF hint set on completion.
 1316:  */
 1317: void
 1318: bqrelse(struct buf * bp)
 1319: {
 1320: 	int s;
 1321: 
 1322: 	s = splbio();
 1323: 
 1324: 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1325: 
 1326: 	if (bp->b_qindex != QUEUE_NONE)
 1327: 		panic("bqrelse: free buffer onto another queue???");
 1328: 	if (BUF_REFCNT(bp) > 1) {
 1329: 		/* do not release to free list */
 1330: 		panic("bqrelse: multiple refs");
 1331: 		BUF_UNLOCK(bp);
 1332: 		splx(s);
 1333: 		return;
 1334: 	}
 1335: 	if (bp->b_flags & B_LOCKED) {
 1336: 		bp->b_flags &= ~B_ERROR;
 1337: 		bp->b_qindex = QUEUE_LOCKED;
 1338: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 1339: 		/* buffers with stale but valid contents */
 1340: 	} else if (bp->b_flags & B_DELWRI) {
 1341: 		bp->b_qindex = QUEUE_DIRTY;
 1342: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1343: 	} else if (vm_page_count_severe()) {
 1344: 		/*
 1345: 		 * We are too low on memory, we have to try to free the
 1346: 		 * buffer (most importantly: the wired pages making up its
 1347: 		 * backing store) *now*.
 1348: 		 */
 1349: 		splx(s);
 1350: 		brelse(bp);
 1351: 		return;
 1352: 	} else {
 1353: 		bp->b_qindex = QUEUE_CLEAN;
 1354: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1355: 	}
 1356: 
 1357: 	if ((bp->b_flags & B_LOCKED) == 0 &&
 1358: 	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
 1359: 		bufcountwakeup();
 1360: 	}
 1361: 
 1362: 	/*
 1363: 	 * Something we can maybe free or reuse.
 1364: 	 */
 1365: 	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
 1366: 		bufspacewakeup();
 1367: 
 1368: 	/* unlock */
 1369: 	BUF_UNLOCK(bp);
 1370: 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 1371: 	splx(s);
 1372: }
 1373: 
 1374: static void
 1375: vfs_vmio_release(bp)
 1376: 	struct buf *bp;
 1377: {
 1378: 	int i, s;
 1379: 	vm_page_t m;
 1380: 
 1381: 	s = splvm();
 1382: 	for (i = 0; i < bp->b_npages; i++) {
 1383: 		m = bp->b_pages[i];
 1384: 		bp->b_pages[i] = NULL;
 1385: 		/*
 1386: 		 * In order to keep page LRU ordering consistent, put
 1387: 		 * everything on the inactive queue.
 1388: 		 */
 1389: 		vm_page_unwire(m, 0);
 1390: 		/*
 1391: 		 * We don't mess with busy pages, it is
 1392: 		 * the responsibility of the process that
 1393: 		 * busied the pages to deal with them.
 1394: 		 */
 1395: 		if ((m->flags & PG_BUSY) || (m->busy != 0))
 1396: 			continue;
 1397: 			
 1398: 		if (m->wire_count == 0) {
 1399: 			vm_page_flag_clear(m, PG_ZERO);
 1400: 			/*
 1401: 			 * Might as well free the page if we can and it has
 1402: 			 * no valid data.  We also free the page if the
 1403: 			 * buffer was used for direct I/O.
 1404: 			 */
 1405: 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
 1406: 				vm_page_busy(m);
 1407: 				vm_page_protect(m, VM_PROT_NONE);
 1408: 				vm_page_free(m);
 1409: 			} else if (bp->b_flags & B_DIRECT) {
 1410: 				vm_page_try_to_free(m);
 1411: 			} else if (vm_page_count_severe()) {
 1412: 				vm_page_try_to_cache(m);
 1413: 			}
 1414: 		}
 1415: 	}
 1416: 	splx(s);
 1417: 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
 1418: 	if (bp->b_bufsize) {
 1419: 		bufspacewakeup();
 1420: 		bp->b_bufsize = 0;
 1421: 	}
 1422: 	bp->b_npages = 0;
 1423: 	bp->b_flags &= ~B_VMIO;
 1424: 	if (bp->b_vp)
 1425: 		brelvp(bp);
 1426: }
 1427: 
 1428: /*
 1429:  * Check to see if a block is currently memory resident.
 1430:  */
 1431: struct buf *
 1432: gbincore(struct vnode * vp, daddr_t blkno)
 1433: {
 1434: 	struct buf *bp;
 1435: 	struct bufhashhdr *bh;
 1436: 
 1437: 	bh = bufhash(vp, blkno);
 1438: 
 1439: 	/* Search hash chain */
 1440: 	LIST_FOREACH(bp, bh, b_hash) {
 1441: 		/* hit */
 1442: 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
 1443: 		    (bp->b_flags & B_INVAL) == 0) {
 1444: 			break;
 1445: 		}
 1446: 	}
 1447: 	return (bp);
 1448: }
 1449: 
 1450: /*
 1451:  *	vfs_bio_awrite:
 1452:  *
 1453:  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  1454:  *	This is much better than the old way of writing only one buffer at
 1455:  *	a time.  Note that we may not be presented with the buffers in the 
 1456:  *	correct order, so we search for the cluster in both directions.
 1457:  */
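/*
 * Worked example (illustrative only): if the buffers at lblkno-2 through
 * lblkno+3 are all dirty, clusterable and contiguous on disk, the forward
 * scan below stops with i = 4 and the backward scan with j = 2, so ncl = 6
 * and cluster_wbuild() is asked to write the six blocks starting at
 * lblkno - 2.
 */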
 1458: int
 1459: vfs_bio_awrite(struct buf * bp)
 1460: {
 1461: 	int i;
 1462: 	int j;
 1463: 	daddr_t lblkno = bp->b_lblkno;
 1464: 	struct vnode *vp = bp->b_vp;
 1465: 	int s;
 1466: 	int ncl;
 1467: 	struct buf *bpa;
 1468: 	int nwritten;
 1469: 	int size;
 1470: 	int maxcl;
 1471: 
 1472: 	s = splbio();
 1473: 	/*
 1474: 	 * right now we support clustered writing only to regular files.  If
 1475: 	 * we find a clusterable block we could be in the middle of a cluster
  1476: 	 * rather than at the beginning.
 1477: 	 */
 1478: 	if ((vp->v_type == VREG) && 
 1479: 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 1480: 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 1481: 
 1482: 		size = vp->v_mount->mnt_stat.f_iosize;
 1483: 		maxcl = MAXPHYS / size;
 1484: 
 1485: 		for (i = 1; i < maxcl; i++) {
 1486: 			if ((bpa = gbincore(vp, lblkno + i)) &&
 1487: 			    BUF_REFCNT(bpa) == 0 &&
 1488: 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 1489: 			    (B_DELWRI | B_CLUSTEROK)) &&
 1490: 			    (bpa->b_bufsize == size)) {
 1491: 				if ((bpa->b_blkno == bpa->b_lblkno) ||
 1492: 				    (bpa->b_blkno !=
 1493: 				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
 1494: 					break;
 1495: 			} else {
 1496: 				break;
 1497: 			}
 1498: 		}
 1499: 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
 1500: 			if ((bpa = gbincore(vp, lblkno - j)) &&
 1501: 			    BUF_REFCNT(bpa) == 0 &&
 1502: 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 1503: 			    (B_DELWRI | B_CLUSTEROK)) &&
 1504: 			    (bpa->b_bufsize == size)) {
 1505: 				if ((bpa->b_blkno == bpa->b_lblkno) ||
 1506: 				    (bpa->b_blkno !=
 1507: 				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
 1508: 					break;
 1509: 			} else {
 1510: 				break;
 1511: 			}
 1512: 		}
 1513: 		--j;
 1514: 		ncl = i + j;
 1515: 		/*
 1516: 		 * this is a possible cluster write
 1517: 		 */
 1518: 		if (ncl != 1) {
 1519: 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
 1520: 			splx(s);
 1521: 			return nwritten;
 1522: 		}
 1523: 	}
 1524: 
 1525: 	BUF_LOCK(bp, LK_EXCLUSIVE);
 1526: 	bremfree(bp);
 1527: 	bp->b_flags |= B_ASYNC;
 1528: 
 1529: 	splx(s);
 1530: 	/*
 1531: 	 * default (old) behavior, writing out only one block
 1532: 	 *
 1533: 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 1534: 	 */
 1535: 	nwritten = bp->b_bufsize;
 1536: 	(void) VOP_BWRITE(bp->b_vp, bp);
 1537: 
 1538: 	return nwritten;
 1539: }
 1540: 
 1541: /*
 1542:  *	getnewbuf:
 1543:  *
 1544:  *	Find and initialize a new buffer header, freeing up existing buffers 
 1545:  *	in the bufqueues as necessary.  The new buffer is returned locked.
 1546:  *
 1547:  *	Important:  B_INVAL is not set.  If the caller wishes to throw the
 1548:  *	buffer away, the caller must set B_INVAL prior to calling brelse().
 1549:  *
 1550:  *	We block if:
 1551:  *		We have insufficient buffer headers
 1552:  *		We have insufficient buffer space
 1553:  *		buffer_map is too fragmented ( space reservation fails )
 1554:  *		If we have to flush dirty buffers ( but we try to avoid this )
 1555:  *
 1556:  *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 1557:  *	Instead we ask the buf daemon to do it for us.  We attempt to
 1558:  *	avoid piecemeal wakeups of the pageout daemon.
 1559:  */
 1560: 
 1561: static struct buf *
 1562: getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
 1563: {
 1564: 	struct buf *bp;
 1565: 	struct buf *nbp;
 1566: 	int defrag = 0;
 1567: 	int nqindex;
 1568: 	static int flushingbufs;
 1569: 
 1570: 	/*
 1571: 	 * We can't afford to block since we might be holding a vnode lock,
 1572: 	 * which may prevent system daemons from running.  We deal with
 1573: 	 * low-memory situations by proactively returning memory and running
 1574: 	 * async I/O rather than sync I/O.
 1575: 	 */
 1576: 	
 1577: 	++getnewbufcalls;
 1578: 	--getnewbufrestarts;
 1579: restart:
 1580: 	++getnewbufrestarts;
 1581: 
 1582: 	/*
 1583: 	 * Setup for scan.  If we do not have enough free buffers,
 1584: 	 * we set up a degenerate case that immediately fails.  Note
 1585: 	 * that if we are a specially marked process, we are allowed to
 1586: 	 * dip into our reserves.
 1587: 	 *
 1588: 	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
 1589: 	 *
 1590: 	 * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
 1591: 	 * However, there are a number of cases (defragging, reusing, ...)
 1592: 	 * where we cannot back up.
 1593: 	 */
 1594: 	nqindex = QUEUE_EMPTYKVA;
 1595: 	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
 1596: 
 1597: 	if (nbp == NULL) {
 1598: 		/*
 1599: 		 * If no EMPTYKVA buffers and we are either
 1600: 		 * defragging or reusing, locate a CLEAN buffer
 1601: 		 * to free or reuse.  If bufspace usage is low
 1602: 		 * skip this step so we can allocate a new buffer.
 1603: 		 */
 1604: 		if (defrag || bufspace >= lobufspace) {
 1605: 			nqindex = QUEUE_CLEAN;
 1606: 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 1607: 		}
 1608: 
 1609: 		/*
 1610: 		 * If we could not find or were not allowed to reuse a
 1611: 		 * CLEAN buffer, check to see if it is ok to use an EMPTY
 1612: 		 * buffer.  We can only use an EMPTY buffer if allocating
 1613: 		 * its KVA would not otherwise run us out of buffer space.
 1614: 		 */
 1615: 		if (nbp == NULL && defrag == 0 &&
 1616: 		    bufspace + maxsize < hibufspace) {
 1617: 			nqindex = QUEUE_EMPTY;
 1618: 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
 1619: 		}
 1620: 	}
 1621: 
 1622: 	/*
 1623: 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 1624: 	 * depending.
 1625: 	 */
 1626: 
 1627: 	while ((bp = nbp) != NULL) {
 1628: 		int qindex = nqindex;
 1629: 
 1630: 		/*
 1631: 		 * Calculate next bp ( we can only use it if we do not block
 1632: 		 * or do other fancy things ).
 1633: 		 */
 1634: 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
 1635: 			switch(qindex) {
 1636: 			case QUEUE_EMPTY:
 1637: 				nqindex = QUEUE_EMPTYKVA;
 1638: 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
 1639: 					break;
 1640: 				/* fall through */
 1641: 			case QUEUE_EMPTYKVA:
 1642: 				nqindex = QUEUE_CLEAN;
 1643: 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
 1644: 					break;
 1645: 				/* fall through */
 1646: 			case QUEUE_CLEAN:
 1647: 				/*
 1648: 				 * nbp is NULL. 
 1649: 				 */
 1650: 				break;
 1651: 			}
 1652: 		}
 1653: 
 1654: 		/*
 1655: 		 * Sanity Checks
 1656: 		 */
 1657: 		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 1658: 
 1659: 		/*
 1660: 		 * Note: we no longer distinguish between VMIO and non-VMIO
 1661: 		 * buffers.
 1662: 		 */
 1663: 
 1664: 		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
 1665: 
 1666: 		/*
 1667: 		 * If we are defragging then we need a buffer with 
 1668: 		 * b_kvasize != 0.  XXX this situation should no longer
 1669: 		 * occur, if defrag is non-zero the buffer's b_kvasize
 1670: 		 * should also be non-zero at this point.  XXX
 1671: 		 */
 1672: 		if (defrag && bp->b_kvasize == 0) {
 1673: 			printf("Warning: defrag empty buffer %p\n", bp);
 1674: 			continue;
 1675: 		}
 1676: 
 1677: 		/*
 1678: 		 * Start freeing the bp.  This is somewhat involved.  nbp
 1679: 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
 1680: 		 */
 1681: 
 1682: 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 1683: 			panic("getnewbuf: locked buf");
 1684: 		bremfree(bp);
 1685: 
 1686: 		if (qindex == QUEUE_CLEAN) {
 1687: 			if (bp->b_flags & B_VMIO) {
 1688: 				bp->b_flags &= ~B_ASYNC;
 1689: 				vfs_vmio_release(bp);
 1690: 			}
 1691: 			if (bp->b_vp)
 1692: 				brelvp(bp);
 1693: 		}
 1694: 
 1695: 		/*
 1696: 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 1697: 		 * the scan from this point on.
 1698: 		 *
 1699: 		 * Get the rest of the buffer freed up.  b_kva* is still
 1700: 		 * valid after this operation.
 1701: 		 */
 1702: 
 1703: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 1704: 			(*bioops.io_deallocate)(bp);
 1705: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1706: 			panic("losing buffer 3");
 1707: 		LIST_REMOVE(bp, b_hash);
 1708: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1709: 
 1710: 		/*
 1711: 		 * spl protection not required when scrapping a buffer's
 1712: 		 * contents because it is already wired.
 1713: 		 */
 1714: 		if (bp->b_bufsize)
 1715: 			allocbuf(bp, 0);
 1716: 
 1717: 		bp->b_flags = 0;
 1718: 		bp->b_xflags = 0;
 1719: 		bp->b_dev = NODEV;
 1720: 		bp->b_vp = NULL;
 1721: 		bp->b_blkno = bp->b_lblkno = 0;
 1722: 		bp->b_offset = NOOFFSET;
 1723: 		bp->b_iodone = 0;
 1724: 		bp->b_error = 0;
 1725: 		bp->b_resid = 0;
 1726: 		bp->b_bcount = 0;
 1727: 		bp->b_npages = 0;
 1728: 		bp->b_dirtyoff = bp->b_dirtyend = 0;
 1729: 
 1730: 		LIST_INIT(&bp->b_dep);
 1731: 
 1732: 		/*
 1733: 		 * If we are defragging then free the buffer.
 1734: 		 */
 1735: 		if (defrag) {
 1736: 			bp->b_flags |= B_INVAL;
 1737: 			bfreekva(bp);
 1738: 			brelse(bp);
 1739: 			defrag = 0;
 1740: 			goto restart;
 1741: 		}
 1742: 
 1743: 		/*
 1744: 		 * If we are overcommitted then recover the buffer and its
 1745: 		 * KVM space.  This occurs in rare situations when multiple
 1746: 		 * processes are blocked in getnewbuf() or allocbuf().
 1747: 		 */
 1748: 		if (bufspace >= hibufspace)
 1749: 			flushingbufs = 1;
 1750: 		if (flushingbufs && bp->b_kvasize != 0) {
 1751: 			bp->b_flags |= B_INVAL;
 1752: 			bfreekva(bp);
 1753: 			brelse(bp);
 1754: 			goto restart;
 1755: 		}
 1756: 		if (bufspace < lobufspace)
 1757: 			flushingbufs = 0;
 1758: 		break;
 1759: 	}
 1760: 
 1761: 	/*
 1762: 	 * If we exhausted our list, sleep as appropriate.  We may have to
 1763: 	 * wakeup various daemons and write out some dirty buffers.
 1764: 	 *
 1765: 	 * Generally we are sleeping due to insufficient buffer space.
 1766: 	 */
 1767: 
 1768: 	if (bp == NULL) {
 1769: 		int flags;
 1770: 		char *waitmsg;
 1771: 
 1772: 		if (defrag) {
 1773: 			flags = VFS_BIO_NEED_BUFSPACE;
 1774: 			waitmsg = "nbufkv";
 1775: 		} else if (bufspace >= hibufspace) {
 1776: 			waitmsg = "nbufbs";
 1777: 			flags = VFS_BIO_NEED_BUFSPACE;
 1778: 		} else {
 1779: 			waitmsg = "newbuf";
 1780: 			flags = VFS_BIO_NEED_ANY;
 1781: 		}
 1782: 
 1783: 		bd_speedup();	/* heeeelp */
 1784: 
 1785: 		needsbuffer |= flags;
 1786: 		while (needsbuffer & flags) {
 1787: 			if (tsleep(&needsbuffer, slpflag, waitmsg, slptimeo))
 1788: 				return (NULL);
 1789: 		}
 1790: 	} else {
 1791: 		/*
 1792: 		 * We finally have a valid bp.  We aren't quite out of the
 1793: 		 * woods, we still have to reserve kva space.  In order
 1794: 		 * to keep fragmentation sane we only allocate kva in
 1795: 		 * BKVASIZE chunks.
 1796: 		 */
 1797: 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 1798: 
 1799: 		if (maxsize != bp->b_kvasize) {
 1800: 			vm_offset_t addr = 0;
 1801: 			int count;
 1802: 
 1803: 			bfreekva(bp);
 1804: 
 1805: 			count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 1806: 			vm_map_lock(buffer_map);
 1807: 
 1808: 			if (vm_map_findspace(buffer_map,
 1809: 				    vm_map_min(buffer_map), maxsize,
 1810: 				    maxsize, &addr)) {
 1811: 				/*
 1812: 				 * Uh oh.  Buffer map is too fragmented.  We
 1813: 				 * must defragment the map.
 1814: 				 */
 1815: 				vm_map_unlock(buffer_map);
 1816: 				vm_map_entry_release(count);
 1817: 				++bufdefragcnt;
 1818: 				defrag = 1;
 1819: 				bp->b_flags |= B_INVAL;
 1820: 				brelse(bp);
 1821: 				goto restart;
 1822: 			}
 1823: 			if (addr) {
 1824: 				vm_map_insert(buffer_map, &count,
 1825: 					NULL, 0,
 1826: 					addr, addr + maxsize,
 1827: 					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 1828: 
 1829: 				bp->b_kvabase = (caddr_t) addr;
 1830: 				bp->b_kvasize = maxsize;
 1831: 				bufspace += bp->b_kvasize;
 1832: 				++bufreusecnt;
 1833: 			}
 1834: 			vm_map_unlock(buffer_map);
 1835: 			vm_map_entry_release(count);
 1836: 		}
 1837: 		bp->b_data = bp->b_kvabase;
 1838: 	}
 1839: 	return(bp);
 1840: }
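/*
 * Illustrative sketch, not part of the original file: the comment above
 * notes that getnewbuf() does not set B_INVAL, so a caller that decides
 * to discard the buffer must mark it invalid itself before brelse().
 * This mirrors the retry loop used by geteblk() below; the function
 * name is hypothetical.
 */
#if 0
static void
example_discard_newbuf(int size)
{
	struct buf *bp;
	int s;

	s = splbio();
	while ((bp = getnewbuf(0, 0, size, size)) == NULL)
		;
	splx(s);
	/* ... the caller changes its mind about needing the buffer ... */
	bp->b_flags |= B_INVAL;		/* required before throwing it away */
	brelse(bp);
}
#endif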
 1841: 
 1842: /*
 1843:  *	buf_daemon:
 1844:  *
 1845:  *	buffer flushing daemon.  Buffers are normally flushed by the
 1846:  *	update daemon but if it cannot keep up this process starts to
 1847:  *	take the load in an attempt to prevent getnewbuf() from blocking.
 1848:  */
 1849: 
 1850: static struct thread *bufdaemonthread;
 1851: 
 1852: static struct kproc_desc buf_kp = {
 1853: 	"bufdaemon",
 1854: 	buf_daemon,
 1855: 	&bufdaemonthread
 1856: };
 1857: SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
 1858: 
 1859: static void
 1860: buf_daemon()
 1861: {
 1862: 	int s;
 1863: 
 1864: 	/*
 1865: 	 * This process needs to be suspended prior to shutdown sync.
 1866: 	 */
 1867: 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
 1868: 	    bufdaemonthread, SHUTDOWN_PRI_LAST);
 1869: 
 1870: 	/*
 1871: 	 * This process is allowed to take the buffer cache to the limit
 1872: 	 */
 1873: 	s = splbio();
 1874: 
 1875: 	for (;;) {
 1876: 		kproc_suspend_loop();
 1877: 
 1878: 		/*
 1879: 		 * Do the flush.  Limit the amount of in-transit I/O we
 1880: 		 * allow to build up, otherwise we would completely saturate
 1881: 		 * the I/O system.  Wakeup any waiting processes before we
 1882: 		 * normally would so they can run in parallel with our drain.
 1883: 		 */
 1884: 		while (numdirtybuffers > lodirtybuffers) {
 1885: 			if (flushbufqueues() == 0)
 1886: 				break;
 1887: 			waitrunningbufspace();
 1888: 			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 1889: 		}
 1890: 
 1891: 		/*
 1892: 		 * Only clear bd_request if we have reached our low water
 1893: 		 * mark.  The buf_daemon normally waits 5 seconds and
 1894: 		 * then incrementally flushes any dirty buffers that have
 1895: 		 * built up, within reason.
 1896: 		 *
 1897: 		 * If we were unable to hit our low water mark and couldn't
 1898: 		 * find any flushable buffers, we sleep half a second. 
 1899: 		 * Otherwise we loop immediately.
 1900: 		 */
 1901: 		if (numdirtybuffers <= lodirtybuffers) {
 1902: 			/*
 1903: 			 * We reached our low water mark, reset the
 1904: 			 * request and sleep until we are needed again.
 1905: 			 * The sleep is just so the suspend code works.
 1906: 			 */
 1907: 			bd_request = 0;
 1908: 			tsleep(&bd_request, 0, "psleep", hz);
 1909: 		} else {
 1910: 			/*
 1911: 			 * We couldn't find any flushable dirty buffers but
 1912: 			 * still have too many dirty buffers, we
 1913: 			 * have to sleep and try again.  (rare)
 1914: 			 */
 1915: 			tsleep(&bd_request, 0, "qsleep", hz / 2);
 1916: 		}
 1917: 	}
 1918: }
 1919: 
 1920: /*
 1921:  *	flushbufqueues:
 1922:  *
 1923:  *	Try to flush a buffer in the dirty queue.  We must be careful to
 1924:  *	free up B_INVAL buffers instead of writing them, which NFS is 
 1925:  *	particularly sensitive to.
 1926:  */
 1927: 
 1928: static int
 1929: flushbufqueues(void)
 1930: {
 1931: 	struct buf *bp;
 1932: 	int r = 0;
 1933: 
 1934: 	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 1935: 
 1936: 	while (bp) {
 1937: 		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
 1938: 		if ((bp->b_flags & B_DELWRI) != 0 &&
 1939: 		    (bp->b_xflags & BX_BKGRDINPROG) == 0) {
 1940: 			if (bp->b_flags & B_INVAL) {
 1941: 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 1942: 					panic("flushbufqueues: locked buf");
 1943: 				bremfree(bp);
 1944: 				brelse(bp);
 1945: 				++r;
 1946: 				break;
 1947: 			}
 1948: 			if (LIST_FIRST(&bp->b_dep) != NULL &&
 1949: 			    bioops.io_countdeps &&
 1950: 			    (bp->b_flags & B_DEFERRED) == 0 &&
 1951: 			    (*bioops.io_countdeps)(bp, 0)) {
 1952: 				TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
 1953: 				    bp, b_freelist);
 1954: 				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
 1955: 				    bp, b_freelist);
 1956: 				bp->b_flags |= B_DEFERRED;
 1957: 				bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 1958: 				continue;
 1959: 			}
 1960: 			vfs_bio_awrite(bp);
 1961: 			++r;
 1962: 			break;
 1963: 		}
 1964: 		bp = TAILQ_NEXT(bp, b_freelist);
 1965: 	}
 1966: 	return (r);
 1967: }
 1968: 
 1969: /*
 1970:  * Check to see if a block is currently memory resident.
 1971:  */
 1972: struct buf *
 1973: incore(struct vnode * vp, daddr_t blkno)
 1974: {
 1975: 	struct buf *bp;
 1976: 
 1977: 	int s = splbio();
 1978: 	bp = gbincore(vp, blkno);
 1979: 	splx(s);
 1980: 	return (bp);
 1981: }
 1982: 
 1983: /*
 1984:  * Returns true if no I/O is needed to access the associated VM object.
 1985:  * This is like incore except it also hunts around in the VM system for
 1986:  * the data.
 1987:  *
 1988:  * Note that we ignore vm_page_free() races from interrupts against our
 1989:  * lookup, since if the caller is not protected our return value will not
 1990:  * be any more valid than otherwise once we splx().
 1991:  */
 1992: int
 1993: inmem(struct vnode * vp, daddr_t blkno)
 1994: {
 1995: 	vm_object_t obj;
 1996: 	vm_offset_t toff, tinc, size;
 1997: 	vm_page_t m;
 1998: 	vm_ooffset_t off;
 1999: 
 2000: 	if (incore(vp, blkno))
 2001: 		return 1;
 2002: 	if (vp->v_mount == NULL)
 2003: 		return 0;
 2004: 	if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
 2005:  		return 0;
 2006: 
 2007: 	size = PAGE_SIZE;
 2008: 	if (size > vp->v_mount->mnt_stat.f_iosize)
 2009: 		size = vp->v_mount->mnt_stat.f_iosize;
 2010: 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 2011: 
 2012: 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 2013: 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 2014: 		if (!m)
 2015: 			return 0;
 2016: 		tinc = size;
 2017: 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
 2018: 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 2019: 		if (vm_page_is_valid(m,
 2020: 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
 2021: 			return 0;
 2022: 	}
 2023: 	return 1;
 2024: }
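/*
 * Sketch (not from the original source): a read-ahead policy can use
 * inmem() to skip blocks whose data is already resident, either as an
 * in-core buffer or in the backing VM object.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_want_readahead(struct vnode *vp, daddr_t blkno)
{
	return (inmem(vp, blkno) == 0);
}
#endif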
 2025: 
 2026: /*
 2027:  *	vfs_setdirty:
 2028:  *
 2029:  *	Sets the dirty range for a buffer based on the status of the dirty
 2030:  *	bits in the pages comprising the buffer.
 2031:  *
 2032:  *	The range is limited to the size of the buffer.
 2033:  *
 2034:  *	This routine is primarily used by NFS, but is generalized for the
 2035:  *	B_VMIO case.
 2036:  */
 2037: static void
 2038: vfs_setdirty(struct buf *bp) 
 2039: {
 2040: 	int i;
 2041: 	vm_object_t object;
 2042: 
 2043: 	/*
 2044: 	 * Degenerate case - empty buffer
 2045: 	 */
 2046: 
 2047: 	if (bp->b_bufsize == 0)
 2048: 		return;
 2049: 
 2050: 	/*
 2051: 	 * We qualify the scan for modified pages on whether the
 2052: 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
 2053: 	 * is not cleared simply by protecting pages off.
 2054: 	 */
 2055: 
 2056: 	if ((bp->b_flags & B_VMIO) == 0)
 2057: 		return;
 2058: 
 2059: 	object = bp->b_pages[0]->object;
 2060: 
 2061: 	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
 2062: 		printf("Warning: object %p writeable but not mightbedirty\n", object);
 2063: 	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
 2064: 		printf("Warning: object %p mightbedirty but not writeable\n", object);
 2065: 
 2066: 	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
 2067: 		vm_offset_t boffset;
 2068: 		vm_offset_t eoffset;
 2069: 
 2070: 		/*
 2071: 		 * test the pages to see if they have been modified directly
 2072: 		 * by users through the VM system.
 2073: 		 */
 2074: 		for (i = 0; i < bp->b_npages; i++) {
 2075: 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
 2076: 			vm_page_test_dirty(bp->b_pages[i]);
 2077: 		}
 2078: 
 2079: 		/*
 2080: 		 * Calculate the encompassing dirty range, boffset and eoffset,
 2081: 		 * (eoffset - boffset) bytes.
 2082: 		 */
 2083: 
 2084: 		for (i = 0; i < bp->b_npages; i++) {
 2085: 			if (bp->b_pages[i]->dirty)
 2086: 				break;
 2087: 		}
 2088: 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 2089: 
 2090: 		for (i = bp->b_npages - 1; i >= 0; --i) {
 2091: 			if (bp->b_pages[i]->dirty) {
 2092: 				break;
 2093: 			}
 2094: 		}
 2095: 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 2096: 
 2097: 		/*
 2098: 		 * Fit it to the buffer.
 2099: 		 */
 2100: 
 2101: 		if (eoffset > bp->b_bcount)
 2102: 			eoffset = bp->b_bcount;
 2103: 
 2104: 		/*
 2105: 		 * If we have a good dirty range, merge with the existing
 2106: 		 * dirty range.
 2107: 		 */
 2108: 
 2109: 		if (boffset < eoffset) {
 2110: 			if (bp->b_dirtyoff > boffset)
 2111: 				bp->b_dirtyoff = boffset;
 2112: 			if (bp->b_dirtyend < eoffset)
 2113: 				bp->b_dirtyend = eoffset;
 2114: 		}
 2115: 	}
 2116: }
 2117: 
 2118: /*
 2119:  *	getblk:
 2120:  *
 2121:  *	Get a block given a specified block and offset into a file/device.
 2122:  *	The buffers B_DONE bit will be cleared on return, making it almost
 2123:  *	The buffer's B_DONE bit will be cleared on return, making it almost
 2124:  *	return.  The caller should clear B_INVAL prior to initiating a
 2125:  *	READ.
 2126:  *
 2127:  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 2128:  *	an existing buffer.
 2129:  *
 2130:  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
 2131:  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 2132:  *	and then cleared based on the backing VM.  If the previous buffer is
 2133:  *	non-0-sized but invalid, B_CACHE will be cleared.
 2134:  *
 2135:  *	If getblk() must create a new buffer, the new buffer is returned with
 2136:  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 2137:  *	case it is returned with B_INVAL clear and B_CACHE set based on the
 2138:  *	backing VM.
 2139:  *
 2140:  *	getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
 2141:  *	B_CACHE bit is clear.
 2142:  *	
 2143:  *	What this means, basically, is that the caller should use B_CACHE to
 2144:  *	determine whether the buffer is fully valid or not and should clear
 2145:  *	B_INVAL prior to issuing a read.  If the caller intends to validate
 2146:  *	the buffer by loading its data area with something, the caller needs
 2147:  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
 2148:  *	the caller should set B_CACHE ( as an optimization ), else the caller
 2149:  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
 2150:  *	a write attempt or if it was a successful read.  If the caller 
 2151:  *	intends to issue a READ, the caller must clear B_INVAL and B_ERROR
 2152:  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 2153:  */
 2154: struct buf *
 2155: getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
 2156: {
 2157: 	struct buf *bp;
 2158: 	int s;
 2159: 	struct bufhashhdr *bh;
 2160: 
 2161: 	if (size > MAXBSIZE)
 2162: 		panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
 2163: 
 2164: 	s = splbio();
 2165: loop:
 2166: 	/*
 2167: 	 * Block if we are low on buffers.   Certain processes are allowed
 2168: 	 * to completely exhaust the buffer cache.
 2169:          *
 2170:          * If this check ever becomes a bottleneck it may be better to
 2171:          * move it into the else, when gbincore() fails.  At the moment
 2172:          * it isn't a problem.
 2173: 	 *
 2174: 	 * XXX remove, we cannot afford to block anywhere if holding a vnode
 2175: 	 * lock in a low-memory situation, so take it to the max.
 2176:          */
 2177: 	if (numfreebuffers == 0) {
 2178: 		if (!curproc)
 2179: 			return NULL;
 2180: 		needsbuffer |= VFS_BIO_NEED_ANY;
 2181: 		tsleep(&needsbuffer, slpflag, "newbuf", slptimeo);
 2182: 	}
 2183: 
 2184: 	if ((bp = gbincore(vp, blkno))) {
 2185: 		/*
 2186: 		 * Buffer is in-core.  If the buffer is not busy, it must
 2187: 		 * be on a queue.
 2188: 		 */
 2189: 
 2190: 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 2191: 			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
 2192: 			    "getblk", slpflag, slptimeo) == ENOLCK)
 2193: 				goto loop;
 2194: 			splx(s);
 2195: 			return (struct buf *) NULL;
 2196: 		}
 2197: 
 2198: 		/*
 2199: 		 * The buffer is locked.  B_CACHE is cleared if the buffer is 
 2200: 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 2201: 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 2202: 		 * backing VM cache.
 2203: 		 */
 2204: 		if (bp->b_flags & B_INVAL)
 2205: 			bp->b_flags &= ~B_CACHE;
 2206: 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 2207: 			bp->b_flags |= B_CACHE;
 2208: 		bremfree(bp);
 2209: 
 2210: 		/*
 2211: 		 * check for size inconsistencies for the non-VMIO case.
 2212: 		 */
 2213: 
 2214: 		if (bp->b_bcount != size) {
 2215: 			if ((bp->b_flags & B_VMIO) == 0 ||
 2216: 			    (size > bp->b_kvasize)) {
 2217: 				if (bp->b_flags & B_DELWRI) {
 2218: 					bp->b_flags |= B_NOCACHE;
 2219: 					VOP_BWRITE(bp->b_vp, bp);
 2220: 				} else {
 2221: 					if ((bp->b_flags & B_VMIO) &&
 2222: 					   (LIST_FIRST(&bp->b_dep) == NULL)) {
 2223: 						bp->b_flags |= B_RELBUF;
 2224: 						brelse(bp);
 2225: 					} else {
 2226: 						bp->b_flags |= B_NOCACHE;
 2227: 						VOP_BWRITE(bp->b_vp, bp);
 2228: 					}
 2229: 				}
 2230: 				goto loop;
 2231: 			}
 2232: 		}
 2233: 
 2234: 		/*
 2235: 		 * If the size is inconsistent in the VMIO case, we can resize
 2236: 		 * the buffer.  This might lead to B_CACHE getting set or
 2237: 		 * cleared.  If the size has not changed, B_CACHE remains
 2238: 		 * unchanged from its previous state.
 2239: 		 */
 2240: 
 2241: 		if (bp->b_bcount != size)
 2242: 			allocbuf(bp, size);
 2243: 
 2244: 		KASSERT(bp->b_offset != NOOFFSET, 
 2245: 		    ("getblk: no buffer offset"));
 2246: 
 2247: 		/*
 2248: 		 * A buffer with B_DELWRI set and B_CACHE clear must
 2249: 		 * be committed before we can return the buffer in
 2250: 		 * order to prevent the caller from issuing a read
 2251: 		 * ( due to B_CACHE not being set ) and overwriting
 2252: 		 * it.
 2253: 		 *
 2254: 		 * Most callers, including NFS and FFS, need this to
 2255: 		 * operate properly either because they assume they
 2256: 		 * can issue a read if B_CACHE is not set, or because
 2257: 		 * ( for example ) an uncached B_DELWRI might loop due 
 2258: 		 * to softupdates re-dirtying the buffer.  In the latter
 2259: 		 * case, B_CACHE is set after the first write completes,
 2260: 		 * preventing further loops.
 2261: 		 *
 2262: 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 2263: 		 * above while extending the buffer, we cannot allow the
 2264: 		 * buffer to remain with B_CACHE set after the write
 2265: 		 * completes or it will represent a corrupt state.  To
 2266: 		 * deal with this we set B_NOCACHE to scrap the buffer
 2267: 		 * after the write.
 2268: 		 *
 2269: 		 * We might be able to do something fancy, like setting
 2270: 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 2271: 		 * so the below call doesn't set B_CACHE, but that gets real
 2272: 		 * confusing.  This is much easier.
 2273: 		 */
 2274: 
 2275: 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 2276: 			bp->b_flags |= B_NOCACHE;
 2277: 			VOP_BWRITE(bp->b_vp, bp);
 2278: 			goto loop;
 2279: 		}
 2280: 
 2281: 		splx(s);
 2282: 		bp->b_flags &= ~B_DONE;
 2283: 	} else {
 2284: 		/*
 2285: 		 * Buffer is not in-core, create new buffer.  The buffer
 2286: 		 * returned by getnewbuf() is locked.  Note that the returned
 2287: 		 * buffer is also considered valid (not marked B_INVAL).
 2288: 		 */
 2289: 		int bsize, maxsize, vmio;
 2290: 		off_t offset;
 2291: 
 2292: 		if (vn_isdisk(vp, NULL))
 2293: 			bsize = DEV_BSIZE;
 2294: 		else if (vp->v_mountedhere)
 2295: 			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
 2296: 		else if (vp->v_mount)
 2297: 			bsize = vp->v_mount->mnt_stat.f_iosize;
 2298: 		else
 2299: 			bsize = size;
 2300: 
 2301: 		offset = (off_t)blkno * bsize;
 2302: 		vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
 2303: 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
 2304: 		maxsize = imax(maxsize, bsize);
 2305: 
 2306: 		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
 2307: 			if (slpflag || slptimeo) {
 2308: 				splx(s);
 2309: 				return NULL;
 2310: 			}
 2311: 			goto loop;
 2312: 		}
 2313: 
 2314: 		/*
 2315: 		 * This code is used to make sure that a buffer is not
 2316: 		 * created while the getnewbuf routine is blocked.
 2317: 		 * This can be a problem whether the vnode is locked or not.
 2318: 		 * If the buffer is created out from under us, we have to
 2319: 		 * throw away the one we just created.  There is no window
 2320: 		 * for a race because we are safely running at splbio() from the
 2321: 		 * point of the duplicate buffer creation through to here,
 2322: 		 * and we've locked the buffer.
 2323: 		 */
 2324: 		if (gbincore(vp, blkno)) {
 2325: 			bp->b_flags |= B_INVAL;
 2326: 			brelse(bp);
 2327: 			goto loop;
 2328: 		}
 2329: 
 2330: 		/*
 2331: 		 * Insert the buffer into the hash, so that it can
 2332: 		 * be found by incore.
 2333: 		 */
 2334: 		bp->b_blkno = bp->b_lblkno = blkno;
 2335: 		bp->b_offset = offset;
 2336: 
 2337: 		bgetvp(vp, bp);
 2338: 		LIST_REMOVE(bp, b_hash);
 2339: 		bh = bufhash(vp, blkno);
 2340: 		LIST_INSERT_HEAD(bh, bp, b_hash);
 2341: 
 2342: 		/*
 2343: 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 2344: 		 * buffer size starts out as 0, B_CACHE will be set by
 2345: 		 * allocbuf() for the VMIO case prior to it testing the
 2346: 		 * backing store for validity.
 2347: 		 */
 2348: 
 2349: 		if (vmio) {
 2350: 			bp->b_flags |= B_VMIO;
 2351: #if defined(VFS_BIO_DEBUG)
 2352: 			if (vn_canvmio(vp) != TRUE)
 2353: 				printf("getblk: vmioing file type %d???\n", vp->v_type);
 2354: #endif
 2355: 		} else {
 2356: 			bp->b_flags &= ~B_VMIO;
 2357: 		}
 2358: 
 2359: 		allocbuf(bp, size);
 2360: 
 2361: 		splx(s);
 2362: 		bp->b_flags &= ~B_DONE;
 2363: 	}
 2364: 	return (bp);
 2365: }
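/*
 * Illustrative sketch, not part of the original file: the caller contract
 * described above boils down to the classic bread()-style pattern -- use
 * B_CACHE to decide whether I/O is needed, clear B_INVAL and B_ERROR
 * before issuing the read, and let the strategy routine plus biowait()
 * finish the job.  The function name is hypothetical and error handling
 * is abbreviated.
 */
#if 0
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_INVAL | B_ERROR);
		vfs_busy_pages(bp, 0);		/* VMIO pages busied for the read */
		VOP_STRATEGY(vp, bp);
		return (biowait(bp));
	}
	return (0);
}
#endif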
 2366: 
 2367: /*
 2368:  * Get an empty, disassociated buffer of given size.  The buffer is initially
 2369:  * set to B_INVAL.
 2370:  *
 2371:  * spl protection is not required for the allocbuf() call because races are
 2372:  * impossible here.
 2373:  */
 2374: struct buf *
 2375: geteblk(int size)
 2376: {
 2377: 	struct buf *bp;
 2378: 	int s;
 2379: 	int maxsize;
 2380: 
 2381: 	maxsize = (size + BKVAMASK) & ~BKVAMASK;
 2382: 
 2383: 	s = splbio();
 2384: 	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
 2385: 	splx(s);
 2386: 	allocbuf(bp, size);
 2387: 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 2388: 	return (bp);
 2389: }
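/*
 * Sketch (not from the original source): a typical geteblk() consumer
 * just wants temporary, disassociated storage.  Because the buffer comes
 * back with B_INVAL set, a plain brelse() throws it away again.  The
 * function name is hypothetical.
 */
#if 0
static void
example_scratch_buffer(void)
{
	struct buf *bp;

	bp = geteblk(PAGE_SIZE);
	bzero(bp->b_data, bp->b_bufsize);
	/* ... use bp->b_data as scratch space ... */
	brelse(bp);		/* B_INVAL is still set, buffer is discarded */
}
#endif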
 2390: 
 2391: 
 2392: /*
 2393:  * This code constitutes the buffer memory from either anonymous system
 2394:  * memory (in the case of non-VMIO operations) or from an associated
 2395:  * VM object (in the case of VMIO operations).  This code is able to
 2396:  * resize a buffer up or down.
 2397:  *
 2398:  * Note that this code is tricky, and has many complications to resolve
 2399:  * deadlock or inconsistent data situations.  Tread lightly!!! 
 2400:  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
 2401:  * the caller.  Calling this code willy nilly can result in the loss of data.
 2402:  *
 2403:  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 2404:  * B_CACHE for the non-VMIO case.
 2405:  *
 2406:  * This routine does not need to be called at splbio() but you must own the
 2407:  * buffer.
 2408:  */
 2409: int
 2410: allocbuf(struct buf *bp, int size)
 2411: {
 2412: 	int newbsize, mbsize;
 2413: 	int i;
 2414: 
 2415: 	if (BUF_REFCNT(bp) == 0)
 2416: 		panic("allocbuf: buffer not busy");
 2417: 
 2418: 	if (bp->b_kvasize < size)
 2419: 		panic("allocbuf: buffer too small");
 2420: 
 2421: 	if ((bp->b_flags & B_VMIO) == 0) {
 2422: 		caddr_t origbuf;
 2423: 		int origbufsize;
 2424: 		/*
 2425: 		 * Just get anonymous memory from the kernel.  Don't
 2426: 		 * mess with B_CACHE.
 2427: 		 */
 2428: 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 2429: #if !defined(NO_B_MALLOC)
 2430: 		if (bp->b_flags & B_MALLOC)
 2431: 			newbsize = mbsize;
 2432: 		else
 2433: #endif
 2434: 			newbsize = round_page(size);
 2435: 
 2436: 		if (newbsize < bp->b_bufsize) {
 2437: #if !defined(NO_B_MALLOC)
 2438: 			/*
 2439: 			 * malloced buffers are not shrunk
 2440: 			 */
 2441: 			if (bp->b_flags & B_MALLOC) {
 2442: 				if (newbsize) {
 2443: 					bp->b_bcount = size;
 2444: 				} else {
 2445: 					free(bp->b_data, M_BIOBUF);
 2446: 					if (bp->b_bufsize) {
 2447: 						bufmallocspace -= bp->b_bufsize;
 2448: 						bufspacewakeup();
 2449: 						bp->b_bufsize = 0;
 2450: 					}
 2451: 					bp->b_data = bp->b_kvabase;
 2452: 					bp->b_bcount = 0;
 2453: 					bp->b_flags &= ~B_MALLOC;
 2454: 				}
 2455: 				return 1;
 2456: 			}		
 2457: #endif
 2458: 			vm_hold_free_pages(
 2459: 			    bp,
 2460: 			    (vm_offset_t) bp->b_data + newbsize,
 2461: 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
 2462: 		} else if (newbsize > bp->b_bufsize) {
 2463: #if !defined(NO_B_MALLOC)
 2464: 			/*
 2465: 			 * We only use malloced memory on the first allocation,
 2466: 			 * and revert to page-allocated memory when the buffer
 2467: 			 * grows.
 2468: 			 */
 2469: 			if ( (bufmallocspace < maxbufmallocspace) &&
 2470: 				(bp->b_bufsize == 0) &&
 2471: 				(mbsize <= PAGE_SIZE/2)) {
 2472: 
 2473: 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
 2474: 				bp->b_bufsize = mbsize;
 2475: 				bp->b_bcount = size;
 2476: 				bp->b_flags |= B_MALLOC;
 2477: 				bufmallocspace += mbsize;
 2478: 				return 1;
 2479: 			}
 2480: #endif
 2481: 			origbuf = NULL;
 2482: 			origbufsize = 0;
 2483: #if !defined(NO_B_MALLOC)
 2484: 			/*
 2485: 			 * If the buffer is growing on its other-than-first allocation,
 2486: 			 * then we revert to the page-allocation scheme.
 2487: 			 */
 2488: 			if (bp->b_flags & B_MALLOC) {
 2489: 				origbuf = bp->b_data;
 2490: 				origbufsize = bp->b_bufsize;
 2491: 				bp->b_data = bp->b_kvabase;
 2492: 				if (bp->b_bufsize) {
 2493: 					bufmallocspace -= bp->b_bufsize;
 2494: 					bufspacewakeup();
 2495: 					bp->b_bufsize = 0;
 2496: 				}
 2497: 				bp->b_flags &= ~B_MALLOC;
 2498: 				newbsize = round_page(newbsize);
 2499: 			}
 2500: #endif
 2501: 			vm_hold_load_pages(
 2502: 			    bp,
 2503: 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
 2504: 			    (vm_offset_t) bp->b_data + newbsize);
 2505: #if !defined(NO_B_MALLOC)
 2506: 			if (origbuf) {
 2507: 				bcopy(origbuf, bp->b_data, origbufsize);
 2508: 				free(origbuf, M_BIOBUF);
 2509: 			}
 2510: #endif
 2511: 		}
 2512: 	} else {
 2513: 		vm_page_t m;
 2514: 		int desiredpages;
 2515: 
 2516: 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 2517: 		desiredpages = (size == 0) ? 0 :
 2518: 			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 2519: 
 2520: #if !defined(NO_B_MALLOC)
 2521: 		if (bp->b_flags & B_MALLOC)
 2522: 			panic("allocbuf: VMIO buffer can't be malloced");
 2523: #endif
 2524: 		/*
 2525: 		 * Set B_CACHE initially if buffer is 0 length or will become
 2526: 		 * 0-length.
 2527: 		 */
 2528: 		if (size == 0 || bp->b_bufsize == 0)
 2529: 			bp->b_flags |= B_CACHE;
 2530: 
 2531: 		if (newbsize < bp->b_bufsize) {
 2532: 			/*
 2533: 			 * DEV_BSIZE aligned new buffer size is less than the
 2534: 			 * DEV_BSIZE aligned existing buffer size.  Figure out
 2535: 			 * if we have to remove any pages.
 2536: 			 */
 2537: 			if (desiredpages < bp->b_npages) {
 2538: 				for (i = desiredpages; i < bp->b_npages; i++) {
 2539: 					/*
 2540: 					 * the page is not freed here -- it
 2541: 					 * is the responsibility of 
 2542: 					 * vnode_pager_setsize
 2543: 					 */
 2544: 					m = bp->b_pages[i];
 2545: 					KASSERT(m != bogus_page,
 2546: 					    ("allocbuf: bogus page found"));
 2547: 					while (vm_page_sleep_busy(m, TRUE, "biodep"))
 2548: 						;
 2549: 
 2550: 					bp->b_pages[i] = NULL;
 2551: 					vm_page_unwire(m, 0);
 2552: 				}
 2553: 				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
 2554: 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
 2555: 				bp->b_npages = desiredpages;
 2556: 			}
 2557: 		} else if (size > bp->b_bcount) {
 2558: 			/*
 2559: 			 * We are growing the buffer, possibly in a 
 2560: 			 * byte-granular fashion.
 2561: 			 */
 2562: 			struct vnode *vp;
 2563: 			vm_object_t obj;
 2564: 			vm_offset_t toff;
 2565: 			vm_offset_t tinc;
 2566: 			int s;
 2567: 
 2568: 			/*
 2569: 			 * Step 1, bring in the VM pages from the object, 
 2570: 			 * allocating them if necessary.  We must clear
 2571: 			 * B_CACHE if these pages are not valid for the 
 2572: 			 * range covered by the buffer.
 2573: 			 *
 2574: 			 * spl protection is required to protect against
 2575: 			 * interrupts unbusying and freeing pages between
 2576: 			 * our vm_page_lookup() and our busycheck/wiring
 2577: 			 * call.
 2578: 			 */
 2579: 			vp = bp->b_vp;
 2580: 			VOP_GETVOBJECT(vp, &obj);
 2581: 
 2582: 			s = splbio();
 2583: 			while (bp->b_npages < desiredpages) {
 2584: 				vm_page_t m;
 2585: 				vm_pindex_t pi;
 2586: 
 2587: 				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
 2588: 				if ((m = vm_page_lookup(obj, pi)) == NULL) {
 2589: 					/*
 2590: 					 * note: must allocate system pages
 2591: 					 * since blocking here could interfere
 2592: 					 * with paging I/O, no matter which
 2593: 					 * process we are.
 2594: 					 */
 2595: 					m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
 2596: 					if (m == NULL) {
 2597: 						VM_WAIT;
 2598: 						vm_pageout_deficit += desiredpages - bp->b_npages;
 2599: 					} else {
 2600: 						vm_page_wire(m);
 2601: 						vm_page_wakeup(m);
 2602: 						bp->b_flags &= ~B_CACHE;
 2603: 						bp->b_pages[bp->b_npages] = m;
 2604: 						++bp->b_npages;
 2605: 					}
 2606: 					continue;
 2607: 				}
 2608: 
 2609: 				/*
 2610: 				 * We found a page.  If we have to sleep on it,
 2611: 				 * retry because it might have gotten freed out
 2612: 				 * from under us.
 2613: 				 *
 2614: 				 * We can only test PG_BUSY here.  Blocking on
 2615: 				 * m->busy might lead to a deadlock:
 2616: 				 *
 2617: 				 *  vm_fault->getpages->cluster_read->allocbuf
 2618: 				 *
 2619: 				 */
 2620: 
 2621: 				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
 2622: 					continue;
 2623: 
 2624: 				/*
 2625: 				 * We have a good page.  Should we wakeup the
 2626: 				 * page daemon?
 2627: 				 */
 2628: 				if ((curthread != pagethread) &&
 2629: 				    ((m->queue - m->pc) == PQ_CACHE) &&
 2630: 				    ((vmstats.v_free_count + vmstats.v_cache_count) <
 2631: 					(vmstats.v_free_min + vmstats.v_cache_min))) {
 2632: 					pagedaemon_wakeup();
 2633: 				}
 2634: 				vm_page_flag_clear(m, PG_ZERO);
 2635: 				vm_page_wire(m);
 2636: 				bp->b_pages[bp->b_npages] = m;
 2637: 				++bp->b_npages;
 2638: 			}
 2639: 			splx(s);
 2640: 
 2641: 			/*
 2642: 			 * Step 2.  We've loaded the pages into the buffer,
 2643: 			 * we have to figure out if we can still have B_CACHE
 2644: 			 * set.  Note that B_CACHE is set according to the
 2645: 			 * byte-granular range ( bcount and size ), not the
 2646: 			 * aligned range ( newbsize ).
 2647: 			 *
 2648: 			 * The VM test is against m->valid, which is DEV_BSIZE
 2649: 			 * aligned.  Needless to say, the validity of the data
 2650: 			 * needs to also be DEV_BSIZE aligned.  Note that this
 2651: 			 * fails with NFS if the server or some other client
 2652: 			 * extends the file's EOF.  If our buffer is resized, 
 2653: 			 * B_CACHE may remain set! XXX
 2654: 			 */
 2655: 
 2656: 			toff = bp->b_bcount;
 2657: 			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 2658: 
 2659: 			while ((bp->b_flags & B_CACHE) && toff < size) {
 2660: 				vm_pindex_t pi;
 2661: 
 2662: 				if (tinc > (size - toff))
 2663: 					tinc = size - toff;
 2664: 
 2665: 				pi = ((bp->b_offset & PAGE_MASK) + toff) >> 
 2666: 				    PAGE_SHIFT;
 2667: 
 2668: 				vfs_buf_test_cache(
 2669: 				    bp, 
 2670: 				    bp->b_offset,
 2671: 				    toff, 
 2672: 				    tinc, 
 2673: 				    bp->b_pages[pi]
 2674: 				);
 2675: 				toff += tinc;
 2676: 				tinc = PAGE_SIZE;
 2677: 			}
 2678: 
 2679: 			/*
 2680: 			 * Step 3, fixup the KVM pmap.  Remember that
 2681: 			 * bp->b_data is relative to bp->b_offset, but 
 2682: 			 * bp->b_offset may be offset into the first page.
 2683: 			 */
 2684: 
 2685: 			bp->b_data = (caddr_t)
 2686: 			    trunc_page((vm_offset_t)bp->b_data);
 2687: 			pmap_qenter(
 2688: 			    (vm_offset_t)bp->b_data,
 2689: 			    bp->b_pages, 
 2690: 			    bp->b_npages
 2691: 			);
 2692: 			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
 2693: 			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 2694: 		}
 2695: 	}
 2696: 	if (newbsize < bp->b_bufsize)
 2697: 		bufspacewakeup();
 2698: 	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
 2699: 	bp->b_bcount = size;		/* requested buffer size	*/
 2700: 	return 1;
 2701: }
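/*
 * Worked example for the VMIO sizing above (an added illustration, not
 * original text; it assumes 4K pages): with bp->b_offset = 0x1200 and
 * size = 8192, newbsize is 8192 and desiredpages becomes
 * num_pages(0x200 + 0x2000) = num_pages(8704) = 3, because the
 * byte-granular range straddles three pages once the in-page offset is
 * accounted for.
 */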
 2702: 
 2703: /*
 2704:  *	biowait:
 2705:  *
 2706:  *	Wait for buffer I/O completion, returning error status.  The buffer
 2707:  *	is left locked and B_DONE on return.  B_EINTR is converted into a EINTR
 2708:  *	error and cleared.
 2709:  */
 2710: int
 2711: biowait(struct buf * bp)
 2712: {
 2713: 	int s;
 2714: 
 2715: 	s = splbio();
 2716: 	while ((bp->b_flags & B_DONE) == 0) {
 2717: #if defined(NO_SCHEDULE_MODS)
 2718: 		tsleep(bp, 0, "biowait", 0);
 2719: #else
 2720: 		if (bp->b_flags & B_READ)
 2721: 			tsleep(bp, 0, "biord", 0);
 2722: 		else
 2723: 			tsleep(bp, 0, "biowr", 0);
 2724: #endif
 2725: 	}
 2726: 	splx(s);
 2727: 	if (bp->b_flags & B_EINTR) {
 2728: 		bp->b_flags &= ~B_EINTR;
 2729: 		return (EINTR);
 2730: 	}
 2731: 	if (bp->b_flags & B_ERROR) {
 2732: 		return (bp->b_error ? bp->b_error : EIO);
 2733: 	} else {
 2734: 		return (0);
 2735: 	}
 2736: }
 2737: 
 2738: /*
 2739:  *	biodone:
 2740:  *
 2741:  *	Finish I/O on a buffer, optionally calling a completion function.
 2742:  *	This is usually called from an interrupt so process blocking is
 2743:  *	not allowed.
 2744:  *
 2745:  *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
 2746:  *	In a non-VMIO bp, B_CACHE will be set on the next getblk() 
 2747:  *	assuming B_INVAL is clear.
 2748:  *
 2749:  *	For the VMIO case, we set B_CACHE if the op was a read and no
 2750:  *	read error occurred, or if the op was a write.  B_CACHE is never
 2751:  *	set if the buffer is invalid or otherwise uncacheable.
 2752:  *
 2753:  *	biodone does not mess with B_INVAL, allowing the I/O routine or the
 2754:  *	initiator to leave B_INVAL set to brelse the buffer out of existence
 2755:  *	in the biodone routine.
 2756:  */
 2757: void
 2758: biodone(struct buf * bp)
 2759: {
 2760: 	int s, error;
 2761: 
 2762: 	s = splbio();
 2763: 
 2764: 	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
 2765: 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 2766: 
 2767: 	bp->b_flags |= B_DONE;
 2768: 	runningbufwakeup(bp);
 2769: 
 2770: 	if (bp->b_flags & B_FREEBUF) {
 2771: 		brelse(bp);
 2772: 		splx(s);
 2773: 		return;
 2774: 	}
 2775: 
 2776: 	if ((bp->b_flags & B_READ) == 0) {
 2777: 		vwakeup(bp);
 2778: 	}
 2779: 
 2780: 	/* call optional completion function if requested */
 2781: 	if (bp->b_flags & B_CALL) {
 2782: 		bp->b_flags &= ~B_CALL;
 2783: 		(*bp->b_iodone) (bp);
 2784: 		splx(s);
 2785: 		return;
 2786: 	}
 2787: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
 2788: 		(*bioops.io_complete)(bp);
 2789: 
 2790: 	if (bp->b_flags & B_VMIO) {
 2791: 		int i;
 2792: 		vm_ooffset_t foff;
 2793: 		vm_page_t m;
 2794: 		vm_object_t obj;
 2795: 		int iosize;
 2796: 		struct vnode *vp = bp->b_vp;
 2797: 
 2798: 		error = VOP_GETVOBJECT(vp, &obj);
 2799: 
 2800: #if defined(VFS_BIO_DEBUG)
 2801: 		if (vp->v_holdcnt == 0) {
 2802: 			panic("biodone: zero vnode hold count");
 2803: 		}
 2804: 
 2805: 		if (error) {
 2806: 			panic("biodone: missing VM object");
 2807: 		}
 2808: 
 2809: 		if ((vp->v_flag & VOBJBUF) == 0) {
 2810: 			panic("biodone: vnode is not setup for merged cache");
 2811: 		}
 2812: #endif
 2813: 
 2814: 		foff = bp->b_offset;
 2815: 		KASSERT(bp->b_offset != NOOFFSET,
 2816: 		    ("biodone: no buffer offset"));
 2817: 
 2818: 		if (error) {
 2819: 			panic("biodone: no object");
 2820: 		}
 2821: #if defined(VFS_BIO_DEBUG)
 2822: 		if (obj->paging_in_progress < bp->b_npages) {
 2823: 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
 2824: 			    obj->paging_in_progress, bp->b_npages);
 2825: 		}
 2826: #endif
 2827: 
 2828: 		/*
 2829: 		 * Set B_CACHE if the op was a normal read and no error
 2830: 		 * occurred.  B_CACHE is set for writes in the b*write()
 2831: 		 * routines.
 2832: 		 */
 2833: 		iosize = bp->b_bcount - bp->b_resid;
 2834: 		if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
 2835: 			bp->b_flags |= B_CACHE;
 2836: 		}
 2837: 
 2838: 		for (i = 0; i < bp->b_npages; i++) {
 2839: 			int bogusflag = 0;
 2840: 			int resid;
 2841: 
 2842: 			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 2843: 			if (resid > iosize)
 2844: 				resid = iosize;
 2845: 
 2846: 			/*
 2847: 			 * cleanup bogus pages, restoring the originals.  Since
 2848: 			 * the originals should still be wired, we don't have
 2849: 			 * to worry about interrupt/freeing races destroying
 2850: 			 * the VM object association.
 2851: 			 */
 2852: 			m = bp->b_pages[i];
 2853: 			if (m == bogus_page) {
 2854: 				bogusflag = 1;
 2855: 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 2856: 				if (m == NULL)
 2857: 					panic("biodone: page disappeared");
 2858: 				bp->b_pages[i] = m;
 2859: 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 2860: 			}
 2861: #if defined(VFS_BIO_DEBUG)
 2862: 			if (OFF_TO_IDX(foff) != m->pindex) {
 2863: 				printf(
 2864: "biodone: foff(%lu)/m->pindex(%d) mismatch\n",
 2865: 				    (unsigned long)foff, m->pindex);
 2866: 			}
 2867: #endif
 2868: 
 2869: 			/*
 2870: 			 * In the write case, the valid and clean bits are
 2871: 			 * already changed correctly ( see bdwrite() ), so we 
 2872: 			 * only need to do this here in the read case.
 2873: 			 */
 2874: 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
 2875: 				vfs_page_set_valid(bp, foff, i, m);
 2876: 			}
 2877: 			vm_page_flag_clear(m, PG_ZERO);
 2878: 
 2879: 			/*
 2880: 			 * when debugging new filesystems or buffer I/O methods, this
 2881: 			 * is the most common error that pops up.  if you see this, you
 2882: 			 * have not set the page busy flag correctly!!!
 2883: 			 */
 2884: 			if (m->busy == 0) {
 2885: 				printf("biodone: page busy < 0, "
 2886: 				    "pindex: %d, foff: 0x(%x,%x), "
 2887: 				    "resid: %d, index: %d\n",
 2888: 				    (int) m->pindex, (int)(foff >> 32),
 2889: 						(int) foff & 0xffffffff, resid, i);
 2890: 				if (!vn_isdisk(vp, NULL))
 2891: 					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
 2892: 					    bp->b_vp->v_mount->mnt_stat.f_iosize,
 2893: 					    (int) bp->b_lblkno,
 2894: 					    bp->b_flags, bp->b_npages);
 2895: 				else
 2896: 					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
 2897: 					    (int) bp->b_lblkno,
 2898: 					    bp->b_flags, bp->b_npages);
 2899: 				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
 2900: 				    m->valid, m->dirty, m->wire_count);
 2901: 				panic("biodone: page busy < 0");
 2902: 			}
 2903: 			vm_page_io_finish(m);
 2904: 			vm_object_pip_subtract(obj, 1);
 2905: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 2906: 			iosize -= resid;
 2907: 		}
 2908: 		if (obj)
 2909: 			vm_object_pip_wakeupn(obj, 0);
 2910: 	}
 2911: 
 2912: 	/*
 2913: 	 * For asynchronous completions, release the buffer now. The brelse
 2914: 	 * will do a wakeup there if necessary - so no need to do a wakeup
 2915: 	 * here in the async case. The sync case always needs to do a wakeup.
 2916: 	 */
 2917: 
 2918: 	if (bp->b_flags & B_ASYNC) {
 2919: 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
 2920: 			brelse(bp);
 2921: 		else
 2922: 			bqrelse(bp);
 2923: 	} else {
 2924: 		wakeup(bp);
 2925: 	}
 2926: 	splx(s);
 2927: }
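/*
 * Illustrative sketch, not part of the original file: a driver's
 * completion path (typically at interrupt time) records the residual,
 * latches any error into the buffer, and hands it back via biodone().
 * The function name is hypothetical.
 */
#if 0
static void
example_io_complete(struct buf *bp, int error, long resid)
{
	bp->b_resid = resid;
	if (error) {
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
	}
	biodone(bp);
}
#endif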
 2928: 
 2929: /*
 2930:  * This routine is called in lieu of iodone in the case of
 2931:  * incomplete I/O.  This keeps the busy status for pages
 2932:  * consistant.
 2933:  * consistent.
 2934: void
 2935: vfs_unbusy_pages(struct buf * bp)
 2936: {
 2937: 	int i;
 2938: 
 2939: 	runningbufwakeup(bp);
 2940: 	if (bp->b_flags & B_VMIO) {
 2941: 		struct vnode *vp = bp->b_vp;
 2942: 		vm_object_t obj;
 2943: 
 2944: 		VOP_GETVOBJECT(vp, &obj);
 2945: 
 2946: 		for (i = 0; i < bp->b_npages; i++) {
 2947: 			vm_page_t m = bp->b_pages[i];
 2948: 
 2949: 			/*
 2950: 			 * When restoring bogus changes the original pages
 2951: 			 * should still be wired, so we are in no danger of
 2952: 			 * losing the object association and do not need
 2953: 			 * spl protection particularly.
 2954: 			 */
 2955: 			if (m == bogus_page) {
 2956: 				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 2957: 				if (!m) {
 2958: 					panic("vfs_unbusy_pages: page missing");
 2959: 				}
 2960: 				bp->b_pages[i] = m;
 2961: 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 2962: 			}
 2963: 			vm_object_pip_subtract(obj, 1);
 2964: 			vm_page_flag_clear(m, PG_ZERO);
 2965: 			vm_page_io_finish(m);
 2966: 		}
 2967: 		vm_object_pip_wakeupn(obj, 0);
 2968: 	}
 2969: }
 2970: 
 2971: /*
 2972:  * vfs_page_set_valid:
 2973:  *
 2974:  *	Set the valid bits in a page based on the supplied offset.   The
 2975:  *	range is restricted to the buffer's size.
 2976:  *
 2977:  *	This routine is typically called after a read completes.
 2978:  */
 2979: static void
 2980: vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
 2981: {
 2982: 	vm_ooffset_t soff, eoff;
 2983: 
 2984: 	/*
 2985: 	 * Start and end offsets in buffer.  eoff - soff may not cross a
 2986: 	 * page boundary or cross the end of the buffer.  The end of the
 2987: 	 * buffer, in this case, is our file EOF, not the allocation size
 2988: 	 * of the buffer.
 2989: 	 */
 2990: 	soff = off;
 2991: 	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 2992: 	if (eoff > bp->b_offset + bp->b_bcount)
 2993: 		eoff = bp->b_offset + bp->b_bcount;
 2994: 
 2995: 	/*
 2996: 	 * Set valid range.  This is typically the entire buffer and thus the
 2997: 	 * entire page.
 2998: 	 */
 2999: 	if (eoff > soff) {
 3000: 		vm_page_set_validclean(
 3001: 		    m,
 3002: 		   (vm_offset_t) (soff & PAGE_MASK),
 3003: 		   (vm_offset_t) (eoff - soff)
 3004: 		);
 3005: 	}
 3006: }
 3007: 
 3008: /*
 3009:  * This routine is called before a device strategy routine.
 3010:  * It is used to tell the VM system that paging I/O is in
 3011:  * progress, and treat the pages associated with the buffer
 3012:  * almost as being PG_BUSY.  Also the object paging_in_progress
 3013:  * flag is handled to make sure that the object doesn't become
 3014:  * inconsistent.
 3015:  *
 3016:  * Since I/O has not been initiated yet, certain buffer flags
 3017:  * such as B_ERROR or B_INVAL may be in an inconsistent state
 3018:  * and should be ignored.
 3019:  */
 3020: void
 3021: vfs_busy_pages(struct buf * bp, int clear_modify)
 3022: {
 3023: 	int i, bogus;
 3024: 
 3025: 	if (bp->b_flags & B_VMIO) {
 3026: 		struct vnode *vp = bp->b_vp;
 3027: 		vm_object_t obj;
 3028: 		vm_ooffset_t foff;
 3029: 
 3030: 		VOP_GETVOBJECT(vp, &obj);
 3031: 		foff = bp->b_offset;
 3032: 		KASSERT(bp->b_offset != NOOFFSET,
 3033: 		    ("vfs_busy_pages: no buffer offset"));
 3034: 		vfs_setdirty(bp);
 3035: 
 3036: retry:
 3037: 		for (i = 0; i < bp->b_npages; i++) {
 3038: 			vm_page_t m = bp->b_pages[i];
 3039: 			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
 3040: 				goto retry;
 3041: 		}
 3042: 
 3043: 		bogus = 0;
 3044: 		for (i = 0; i < bp->b_npages; i++) {
 3045: 			vm_page_t m = bp->b_pages[i];
 3046: 
 3047: 			vm_page_flag_clear(m, PG_ZERO);
 3048: 			if ((bp->b_flags & B_CLUSTER) == 0) {
 3049: 				vm_object_pip_add(obj, 1);
 3050: 				vm_page_io_start(m);
 3051: 			}
 3052: 
 3053: 			/*
 3054: 			 * When readying a buffer for a read ( i.e
 3055: 			 * clear_modify == 0 ), it is important to do
 3056: 			 * bogus_page replacement for valid pages in 
 3057: 			 * partially instantiated buffers.  Partially 
 3058: 			 * instantiated buffers can, in turn, occur when
 3059: 			 * reconstituting a buffer from its VM backing store
 3060: 			 * base.  We only have to do this if B_CACHE is
 3061: 			 * clear ( which causes the I/O to occur in the
 3062: 			 * first place ).  The replacement prevents the read
 3063: 			 * I/O from overwriting potentially dirty VM-backed
 3064: 			 * pages.  XXX bogus page replacement is, uh, bogus.
 3065: 			 * It may not work properly with small-block devices.
 3066: 			 * We need to find a better way.
 3067: 			 */
 3068: 
 3069: 			vm_page_protect(m, VM_PROT_NONE);
 3070: 			if (clear_modify)
 3071: 				vfs_page_set_valid(bp, foff, i, m);
 3072: 			else if (m->valid == VM_PAGE_BITS_ALL &&
 3073: 				(bp->b_flags & B_CACHE) == 0) {
 3074: 				bp->b_pages[i] = bogus_page;
 3075: 				bogus++;
 3076: 			}
 3077: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3078: 		}
 3079: 		if (bogus)
 3080: 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 3081: 	}
 3082: 
 3083: 	/*
 3084: 	 * This is the easiest place to put the process accounting for the I/O
 3085: 	 * for now.
 3086: 	 */
 3087: 	{
 3088: 		struct proc *p;
 3089: 
 3090: 		if ((p = curthread->td_proc) != NULL) {
 3091: 			if (bp->b_flags & B_READ)
 3092: 				p->p_stats->p_ru.ru_inblock++;
 3093: 			else
 3094: 				p->p_stats->p_ru.ru_oublock++;
 3095: 		}
 3096: 	}
 3097: }
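/*
 * Sketch (not from the original source) of the usual pairing: the write
 * path busies the pages just before handing the buffer to the device,
 * much as bwrite() does, and biodone()/vfs_unbusy_pages() undo it once
 * the transfer finishes or is aborted.  The function name is
 * hypothetical.
 */
#if 0
static void
example_start_write(struct buf *bp)
{
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
	vfs_busy_pages(bp, 1);		/* clear_modify: pages become valid/clean */
	VOP_STRATEGY(bp->b_vp, bp);
}
#endif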
 3098: 
 3099: /*
 3100:  * Tell the VM system that the pages associated with this buffer
 3101:  * are clean.  This is used for delayed writes where the data is
 3102:  * going to go to disk eventually without additional VM intervention.
 3103:  *
 3104:  * Note that while we only really need to clean through to b_bcount, we
 3105:  * just go ahead and clean through to b_bufsize.
 3106:  */
 3107: static void
 3108: vfs_clean_pages(struct buf * bp)
 3109: {
 3110: 	int i;
 3111: 
 3112: 	if (bp->b_flags & B_VMIO) {
 3113: 		vm_ooffset_t foff;
 3114: 
 3115: 		foff = bp->b_offset;
 3116: 		KASSERT(bp->b_offset != NOOFFSET,
 3117: 		    ("vfs_clean_pages: no buffer offset"));
 3118: 		for (i = 0; i < bp->b_npages; i++) {
 3119: 			vm_page_t m = bp->b_pages[i];
 3120: 			vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3121: 			vm_ooffset_t eoff = noff;
 3122: 
 3123: 			if (eoff > bp->b_offset + bp->b_bufsize)
 3124: 				eoff = bp->b_offset + bp->b_bufsize;
 3125: 			vfs_page_set_valid(bp, foff, i, m);
 3126: 			/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
 3127: 			foff = noff;
 3128: 		}
 3129: 	}
 3130: }
 3131: 
 3132: /*
 3133:  *	vfs_bio_set_validclean:
 3134:  *
 3135:  *	Set the range within the buffer to valid and clean.  The range is 
 3136:  *	relative to the beginning of the buffer, b_offset.  Note that b_offset
 3137:  *	itself may be offset from the beginning of the first page.
 3138:  */
 3139: 
 3140: void   
 3141: vfs_bio_set_validclean(struct buf *bp, int base, int size)
 3142: {
 3143: 	if (bp->b_flags & B_VMIO) {
 3144: 		int i;
 3145: 		int n;
 3146: 
 3147: 		/*
 3148: 		 * Fixup base to be relative to beginning of first page.
 3149: 		 * Set initial n to be the maximum number of bytes in the
 3150: 		 * first page that can be validated.
 3151: 		 */
 3152: 
 3153: 		base += (bp->b_offset & PAGE_MASK);
 3154: 		n = PAGE_SIZE - (base & PAGE_MASK);
 3155: 
 3156: 		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 3157: 			vm_page_t m = bp->b_pages[i];
 3158: 
 3159: 			if (n > size)
 3160: 				n = size;
 3161: 
 3162: 			vm_page_set_validclean(m, base & PAGE_MASK, n);
 3163: 			base += n;
 3164: 			size -= n;
 3165: 			n = PAGE_SIZE;
 3166: 		}
 3167: 	}
 3168: }
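/*
 * Worked example for the offset fixup above (an added illustration, not
 * original text; it assumes 4K pages): with bp->b_offset = 0x3e00 and
 * base = 0x400, base becomes 0x400 + 0xe00 = 0x1200, so the scan starts
 * at page index 1 and the first page can absorb at most
 * n = PAGE_SIZE - 0x200 = 0xe00 bytes before the loop moves on to whole
 * pages.
 */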
 3169: 
 3170: /*
 3171:  *	vfs_bio_clrbuf:
 3172:  *
 3173:  *	clear a buffer.  This routine essentially fakes an I/O, so we need
 3174:  *	to clear B_ERROR and B_INVAL.
 3175:  *
 3176:  *	Note that while we only theoretically need to clear through b_bcount,
 3177:  *	we go ahead and clear through b_bufsize.
 3178:  */
 3179: 
 3180: void
 3181: vfs_bio_clrbuf(struct buf *bp)
 3182: {
 3183: 	int i, mask = 0;
 3184: 	caddr_t sa, ea;
 3185: 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
 3186: 		bp->b_flags &= ~(B_INVAL|B_ERROR);
 3187: 		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 3188: 		    (bp->b_offset & PAGE_MASK) == 0) {
 3189: 			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 3190: 			if ((bp->b_pages[0]->valid & mask) == mask) {
 3191: 				bp->b_resid = 0;
 3192: 				return;
 3193: 			}
 3194: 			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
 3195: 			    ((bp->b_pages[0]->valid & mask) == 0)) {
 3196: 				bzero(bp->b_data, bp->b_bufsize);
 3197: 				bp->b_pages[0]->valid |= mask;
 3198: 				bp->b_resid = 0;
 3199: 				return;
 3200: 			}
 3201: 		}
 3202: 		ea = sa = bp->b_data;
 3203: 		for(i=0;i<bp->b_npages;i++,sa=ea) {
 3204: 			int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
 3205: 			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
 3206: 			ea = (caddr_t)(vm_offset_t)ulmin(
 3207: 			    (u_long)(vm_offset_t)ea,
 3208: 			    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
 3209: 			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 3210: 			if ((bp->b_pages[i]->valid & mask) == mask)
 3211: 				continue;
 3212: 			if ((bp->b_pages[i]->valid & mask) == 0) {
 3213: 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
 3214: 					bzero(sa, ea - sa);
 3215: 				}
 3216: 			} else {
 3217: 				for (; sa < ea; sa += DEV_BSIZE, j++) {
 3218: 					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
 3219: 						(bp->b_pages[i]->valid & (1<<j)) == 0)
 3220: 						bzero(sa, DEV_BSIZE);
 3221: 				}
 3222: 			}
 3223: 			bp->b_pages[i]->valid |= mask;
 3224: 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
 3225: 		}
 3226: 		bp->b_resid = 0;
 3227: 	} else {
 3228: 		clrbuf(bp);
 3229: 	}
 3230: }
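/*
 * Worked example for the single-page fast path above (an added
 * illustration, not original text): a page-aligned 2048-byte buffer with
 * DEV_BSIZE 512 yields mask = (1 << 4) - 1 = 0x0f, i.e. only the first
 * four 512-byte chunks of the page need to be valid (or zeroed) for the
 * buffer to be considered clear.
 */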
 3231: 
 3232: /*
 3233:  * vm_hold_load_pages and vm_hold_free_pages get pages into
 3234:  * a buffer's address space.  The pages are anonymous and are
 3235:  * not associated with a file object.
 3236:  */
 3237: void
 3238: vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
 3239: {
 3240: 	vm_offset_t pg;
 3241: 	vm_page_t p;
 3242: 	int index;
 3243: 
 3244: 	to = round_page(to);
 3245: 	from = round_page(from);
 3246: 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 3247: 
 3248: 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 3249: 
 3250: tryagain:
 3251: 
 3252: 		/*
 3253: 		 * note: must allocate system pages since blocking here
 3254: 		 * could interfere with paging I/O, no matter which
 3255: 		 * process we are.
 3256: 		 */
 3257: 		p = vm_page_alloc(kernel_object,
 3258: 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
 3259: 			VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
 3260: 		if (!p) {
 3261: 			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
 3262: 			VM_WAIT;
 3263: 			goto tryagain;
 3264: 		}
 3265: 		vm_page_wire(p);
 3266: 		p->valid = VM_PAGE_BITS_ALL;
 3267: 		vm_page_flag_clear(p, PG_ZERO);
 3268: 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
 3269: 		bp->b_pages[index] = p;
 3270: 		vm_page_wakeup(p);
 3271: 	}
 3272: 	bp->b_npages = index;
 3273: }
 3274: 
 3275: void
 3276: vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
 3277: {
 3278: 	vm_offset_t pg;
 3279: 	vm_page_t p;
 3280: 	int index, newnpages;
 3281: 
 3282: 	from = round_page(from);
 3283: 	to = round_page(to);
 3284: 	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 3285: 
 3286: 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 3287: 		p = bp->b_pages[index];
 3288: 		if (p && (index < bp->b_npages)) {
 3289: 			if (p->busy) {
 3290: 				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
 3291: 					bp->b_blkno, bp->b_lblkno);
 3292: 			}
 3293: 			bp->b_pages[index] = NULL;
 3294: 			pmap_kremove(pg);
 3295: 			vm_page_busy(p);
 3296: 			vm_page_unwire(p, 0);
 3297: 			vm_page_free(p);
 3298: 		}
 3299: 	}
 3300: 	bp->b_npages = newnpages;
 3301: }
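
/*
 * Illustrative sketch, not part of the original file and compiled out:
 * the calling convention for the two routines above.  allocbuf() is the
 * real consumer; the helper name and the fixed two-page range below are
 * made up for demonstration only.
 */
#if 0
static void
example_hold_and_release(struct buf *bp)
{
	vm_offset_t base = trunc_page((vm_offset_t)bp->b_data);

	/* back two pages of the buffer's KVA with anonymous wired pages */
	vm_hold_load_pages(bp, base, base + 2 * PAGE_SIZE);

	/* ... the buffer may now be used for I/O via bp->b_data ... */

	/* unmap and free the same range again */
	vm_hold_free_pages(bp, base, base + 2 * PAGE_SIZE);
}
#endif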
 3302: 
 3303: /*
 3304:  * Map an I/O request into kernel virtual address space.
 3305:  *
 3306:  * All requests are (re)mapped into kernel VA space.  Note that we use
 3307:  * b_bufsize for the size of the buffer to be mapped; b_bcount might be
 3308:  * modified by the driver.  See the illustrative sketch after vunmapbuf().
 3309:  */
 3310: int
 3311: vmapbuf(struct buf *bp)
 3312: {
 3313: 	caddr_t addr, v, kva;
 3314: 	vm_paddr_t pa;
 3315: 	int pidx;
 3316: 	int i;
 3317: 	struct vm_page *m;
 3318: 
 3319: 	if ((bp->b_flags & B_PHYS) == 0)
 3320: 		panic("vmapbuf");
 3321: 	if (bp->b_bufsize < 0)
 3322: 		return (-1);
 3323: 	for (v = bp->b_saveaddr,
 3324: 		     addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data),
 3325: 		     pidx = 0;
 3326: 	     addr < bp->b_data + bp->b_bufsize;
 3327: 	     addr += PAGE_SIZE, v += PAGE_SIZE, pidx++) {
 3328: 		/*
 3329: 		 * Do the vm_fault if needed; do the copy-on-write thing when
 3330: 		 * reading from the device into memory (B_READ needs VM_PROT_WRITE).
 3331: 		 */
 3332: retry:
 3333: 		i = vm_fault_quick((addr >= bp->b_data) ? addr : bp->b_data,
 3334: 			(bp->b_flags & B_READ) ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ);
 3335: 		if (i < 0) {
 3336: 			for (i = 0; i < pidx; ++i) {
 3337: 			    vm_page_unhold(bp->b_pages[i]);
 3338: 			    bp->b_pages[i] = NULL;
 3339: 			}
 3340: 			return(-1);
 3341: 		}
 3342: 
 3343: 		/*
 3344: 		 * WARNING!  If sparc support is MFCd in the future this will
 3345: 		 * have to be changed from pmap_kextract() to pmap_extract()
 3346: 		 * ala -current.
 3347: 		 */
 3348: #ifdef __sparc64__
 3349: #error "If MFCing sparc support use pmap_extract"
 3350: #endif
 3351: 		pa = pmap_kextract((vm_offset_t)addr);
 3352: 		if (pa == 0) {
 3353: 			printf("vmapbuf: warning, race against user address during I/O\n");
 3354: 			goto retry;
 3355: 		}
 3356: 		m = PHYS_TO_VM_PAGE(pa);
 3357: 		vm_page_hold(m);
 3358: 		bp->b_pages[pidx] = m;
 3359: 	}
 3360: 	if (pidx > btoc(MAXPHYS))
 3361: 		panic("vmapbuf: mapped more than MAXPHYS");
 3362: 	pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
 3363: 
 3364: 	kva = bp->b_saveaddr;
 3365: 	bp->b_npages = pidx;
 3366: 	bp->b_saveaddr = bp->b_data;
 3367: 	bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
 3368: 	return(0);
 3369: }
 3370: 
 3371: /*
 3372:  * Free the I/O map PTEs associated with this I/O operation.
 3373:  * We also invalidate the TLB entries and restore the original b_data.
 3374:  */
 3375: void
 3376: vunmapbuf(struct buf *bp)
 3378: {
 3379: 	int pidx;
 3380: 	int npages;
 3381: 	vm_page_t *m;
 3382: 
 3383: 	if ((bp->b_flags & B_PHYS) == 0)
 3384: 		panic("vunmapbuf");
 3385: 
 3386: 	npages = bp->b_npages;
 3387: 	pmap_qremove(trunc_page((vm_offset_t)bp->b_data),
 3388: 		     npages);
 3389: 	m = bp->b_pages;
 3390: 	for (pidx = 0; pidx < npages; pidx++)
 3391: 		vm_page_unhold(*m++);
 3392: 
 3393: 	bp->b_data = bp->b_saveaddr;
 3394: }
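
/*
 * Illustrative sketch, not part of the original file and compiled out:
 * the physio()-style pattern that vmapbuf()/vunmapbuf() are intended for.
 * The helper name, the error handling, and the assumption that bp came
 * from getpbuf() (so b_data already points at a MAXPHYS-sized KVA window)
 * are all made up for demonstration.
 */
#if 0
static int
example_user_io(struct buf *bp, caddr_t uaddr, int len, int rw_flags)
{
	bp->b_flags = B_PHYS | rw_flags;	/* rw_flags may include B_READ */
	bp->b_bcount = bp->b_bufsize = len;
	bp->b_saveaddr = bp->b_data;		/* remember the KVA window */
	bp->b_data = uaddr;			/* user virtual address */

	if (vmapbuf(bp) < 0)			/* wire + map the user pages */
		return (EFAULT);

	/* ... hand the buffer to the driver and wait, e.g. biowait(bp) ... */

	vunmapbuf(bp);				/* unmap and unhold the pages */
	return (0);
}
#endif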
 3395: 
 3396: #include "opt_ddb.h"
 3397: #ifdef DDB
 3398: #include <ddb/ddb.h>
 3399: 
 3400: DB_SHOW_COMMAND(buffer, db_show_buffer)
 3401: {
 3402: 	/* get args */
 3403: 	struct buf *bp = (struct buf *)addr;
 3404: 
 3405: 	if (!have_addr) {
 3406: 		db_printf("usage: show buffer <addr>\n");
 3407: 		return;
 3408: 	}
 3409: 
 3410: 	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
 3411: 	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
 3412: 		  "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
 3413: 		  "b_blkno = %d, b_pblkno = %d\n",
 3414: 		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 3415: 		  major(bp->b_dev), minor(bp->b_dev),
 3416: 		  bp->b_data, bp->b_blkno, bp->b_pblkno);
 3417: 	if (bp->b_npages) {
 3418: 		int i;
 3419: 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 3420: 		for (i = 0; i < bp->b_npages; i++) {
 3421: 			vm_page_t m;
 3422: 			m = bp->b_pages[i];
 3423: 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
 3424: 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
 3425: 			if ((i + 1) < bp->b_npages)
 3426: 				db_printf(",");
 3427: 		}
 3428: 		db_printf("\n");
 3429: 	}
 3430: }
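
/*
 * Example usage from the DDB prompt (address made up):
 *
 *	db> show buffer 0xc2f4b000
 *
 * This prints b_flags, b_error, b_bufsize, b_bcount, b_resid, b_dev,
 * b_data, b_blkno, b_pblkno and the buffer's page list as formatted by
 * the db_printf() calls above.
 */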
 3431: #endif /* DDB */