File:  [DragonFly] / src / sys / kern / vfs_bio.c
Revision 1.28
Tue Jun 1 22:19:30 2004 UTC by dillon
Branches: MAIN
CVS tags: HEAD, DragonFly_1_0_REL, DragonFly_1_0_RC1, DragonFly_1_0A_REL
ANSIfication.  No operational changes.

Submitted-by: Tim Wickberg <me@k9mach3.org>

    1: /*
    2:  * Copyright (c) 1994,1997 John S. Dyson
    3:  * All rights reserved.
    4:  *
    5:  * Redistribution and use in source and binary forms, with or without
    6:  * modification, are permitted provided that the following conditions
    7:  * are met:
    8:  * 1. Redistributions of source code must retain the above copyright
    9:  *    notice immediately at the beginning of the file, without modification,
   10:  *    this list of conditions, and the following disclaimer.
   11:  * 2. Absolutely no warranty of function or purpose is made by the author
   12:  *		John S. Dyson.
   13:  *
   14:  * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $
   15:  * $DragonFly: src/sys/kern/vfs_bio.c,v 1.28 2004/06/01 22:19:30 dillon Exp $
   16:  */
   17: 
   18: /*
   19:  * this file contains a new buffer I/O scheme implementing a coherent
   20:  * VM object and buffer cache scheme.  Pains have been taken to make
   21:  * sure that the performance degradation associated with schemes such
   22:  * as this is not realized.
   23:  *
   24:  * Author:  John S. Dyson
   25:  * Significant help during the development and debugging phases
    26:  * was provided by David Greenman, also of the FreeBSD core team.
   27:  *
   28:  * see man buf(9) for more info.
   29:  */
   30: 
   31: #include <sys/param.h>
   32: #include <sys/systm.h>
   33: #include <sys/buf.h>
   34: #include <sys/conf.h>
   35: #include <sys/eventhandler.h>
   36: #include <sys/lock.h>
   37: #include <sys/malloc.h>
   38: #include <sys/mount.h>
   39: #include <sys/kernel.h>
   40: #include <sys/kthread.h>
   41: #include <sys/proc.h>
   42: #include <sys/reboot.h>
   43: #include <sys/resourcevar.h>
   44: #include <sys/sysctl.h>
   45: #include <sys/vmmeter.h>
   46: #include <sys/vnode.h>
   47: #include <sys/proc.h>
   48: #include <vm/vm.h>
   49: #include <vm/vm_param.h>
   50: #include <vm/vm_kern.h>
   51: #include <vm/vm_pageout.h>
   52: #include <vm/vm_page.h>
   53: #include <vm/vm_object.h>
   54: #include <vm/vm_extern.h>
   55: #include <vm/vm_map.h>
   56: #include <sys/buf2.h>
   57: #include <vm/vm_page2.h>
   58: 
   59: static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
   60: 
   61: struct	bio_ops bioops;		/* I/O operation notification */
   62: 
   63: struct buf *buf;		/* buffer header pool */
   64: struct swqueue bswlist;
   65: 
   66: static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
   67: 		vm_offset_t to);
   68: static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
   69: 		vm_offset_t to);
   70: static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
   71: 			       int pageno, vm_page_t m);
   72: static void vfs_clean_pages(struct buf * bp);
   73: static void vfs_setdirty(struct buf *bp);
   74: static void vfs_vmio_release(struct buf *bp);
   75: static void vfs_backgroundwritedone(struct buf *bp);
   76: static int flushbufqueues(void);
   77: 
   78: static int bd_request;
   79: 
   80: static void buf_daemon (void);
   81: /*
   82:  * bogus page -- for I/O to/from partially complete buffers
   83:  * this is a temporary solution to the problem, but it is not
   84:  * really that bad.  it would be better to split the buffer
   85:  * for input in the case of buffers partially already in memory,
   86:  * but the code is intricate enough already.
   87:  */
   88: vm_page_t bogus_page;
   89: int vmiodirenable = TRUE;
   90: int runningbufspace;
   91: struct lwkt_token buftimetoken;  /* Interlock on setting prio and timo */
   92: 
   93: static vm_offset_t bogus_offset;
   94: 
   95: static int bufspace, maxbufspace,
   96: 	bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
   97: static int bufreusecnt, bufdefragcnt, buffreekvacnt;
   98: static int needsbuffer;
   99: static int lorunningspace, hirunningspace, runningbufreq;
  100: static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
  101: static int numfreebuffers, lofreebuffers, hifreebuffers;
  102: static int getnewbufcalls;
  103: static int getnewbufrestarts;
  104: 
  105: SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
  106: 	&numdirtybuffers, 0, "");
  107: SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
  108: 	&lodirtybuffers, 0, "");
  109: SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
  110: 	&hidirtybuffers, 0, "");
  111: SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
  112: 	&numfreebuffers, 0, "");
  113: SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
  114: 	&lofreebuffers, 0, "");
  115: SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
  116: 	&hifreebuffers, 0, "");
  117: SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
  118: 	&runningbufspace, 0, "");
  119: SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
  120: 	&lorunningspace, 0, "");
  121: SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
  122: 	&hirunningspace, 0, "");
  123: SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
  124: 	&maxbufspace, 0, "");
  125: SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
  126: 	&hibufspace, 0, "");
  127: SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
  128: 	&lobufspace, 0, "");
  129: SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
  130: 	&bufspace, 0, "");
  131: SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
  132: 	&maxbufmallocspace, 0, "");
  133: SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
  134: 	&bufmallocspace, 0, "");
  135: SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
  136: 	&getnewbufcalls, 0, "");
  137: SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
  138: 	&getnewbufrestarts, 0, "");
  139: SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
  140: 	&vmiodirenable, 0, "");
  141: SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW,
  142: 	&bufdefragcnt, 0, "");
  143: SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW,
  144: 	&buffreekvacnt, 0, "");
  145: SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW,
  146: 	&bufreusecnt, 0, "");
  147: 
  148: /*
  149:  * Disable background writes for now.  There appear to be races in the 
  150:  * flags tests and locking operations as well as races in the completion
  151:  * code modifying the original bp (origbp) without holding a lock, assuming
  152:  * splbio protection when there might not be splbio protection.
  153:  */
  154: static int dobkgrdwrite = 0;
  155: SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
  156: 	"Do background writes (honoring the BV_BKGRDWRITE flag)?");
  157: 
  158: static int bufhashmask;
  159: static int bufhashshift;
  160: static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
  161: struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
  162: char *buf_wmesg = BUF_WMESG;
  163: 
  164: extern int vm_swap_size;
  165: 
  166: #define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
  167: #define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
  168: #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
  169: #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
  170: 
  171: /*
  172:  * Buffer hash table code.  Note that the logical block scans linearly, which
  173:  * gives us some L1 cache locality.
  174:  */
  175: 
  176: static __inline 
  177: struct bufhashhdr *
  178: bufhash(struct vnode *vnp, daddr_t bn)
  179: {
  180: 	u_int64_t hashkey64;
  181: 	int hashkey; 
  182: 	
  183: 	/*
  184: 	 * A variation on the Fibonacci hash that Knuth credits to
  185: 	 * R. W. Floyd, see Knuth's _Art of Computer Programming,
  186: 	 * Volume 3 / Sorting and Searching_
  187: 	 *
   188: 	 * We reduce the argument to 32 bits before doing the hash to
  189: 	 * avoid the need for a slow 64x64 multiply on 32 bit platforms.
  190: 	 *
  191: 	 * sizeof(struct vnode) is 168 on i386, so toss some of the lower
  192: 	 * bits of the vnode address to reduce the key range, which
  193: 	 * improves the distribution of keys across buckets.
  194: 	 *
  195: 	 * The file system cylinder group blocks are very heavily
   196: 	 * used.  They are located at intervals of fpg, which is
  197: 	 * on the order of 89 to 94 * 2^10, depending on other
  198: 	 * filesystem parameters, for a 16k block size.  Smaller block
  199: 	 * sizes will reduce fpg approximately proportionally.  This
  200: 	 * will cause the cylinder group index to be hashed using the
  201: 	 * lower bits of the hash multiplier, which will not distribute
  202: 	 * the keys as uniformly in a classic Fibonacci hash where a
  203: 	 * relatively small number of the upper bits of the result
  204: 	 * are used.  Using 2^16 as a close-enough approximation to
  205: 	 * fpg, split the hash multiplier in half, with the upper 16
  206: 	 * bits being the inverse of the golden ratio, and the lower
  207: 	 * 16 bits being a fraction between 1/3 and 3/7 (closer to
   208: 	 * 3/7 in this case), which gives good experimental results.
  209: 	 */
  210: 	hashkey64 = ((u_int64_t)(uintptr_t)vnp >> 3) + (u_int64_t)bn;
  211: 	hashkey = (((u_int32_t)(hashkey64 + (hashkey64 >> 32)) * 0x9E376DB1u) >>
  212: 	    bufhashshift) & bufhashmask;
  213: 	return(&bufhashtbl[hashkey]);
  214: }
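
/*
 * A minimal sketch of how a lookup uses the hash: pick the chain for
 * (vp, blkno) and walk it, exactly as gbincore() does later in this file.
 * "vp" and "blkno" stand for the caller's vnode and logical block number.
 */
#if 0	/* illustrative sketch only */
	struct bufhashhdr *bh;
	struct buf *bp;

	bh = bufhash(vp, blkno);
	LIST_FOREACH(bp, bh, b_hash) {
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0)
			break;		/* found a valid cached buffer */
	}
#endif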
  215: 
  216: /*
  217:  *	numdirtywakeup:
  218:  *
  219:  *	If someone is blocked due to there being too many dirty buffers,
  220:  *	and numdirtybuffers is now reasonable, wake them up.
  221:  */
  222: 
  223: static __inline void
  224: numdirtywakeup(int level)
  225: {
  226: 	if (numdirtybuffers <= level) {
  227: 		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
  228: 			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
  229: 			wakeup(&needsbuffer);
  230: 		}
  231: 	}
  232: }
  233: 
  234: /*
  235:  *	bufspacewakeup:
  236:  *
  237:  *	Called when buffer space is potentially available for recovery.
  238:  *	getnewbuf() will block on this flag when it is unable to free 
  239:  *	sufficient buffer space.  Buffer space becomes recoverable when 
  240:  *	bp's get placed back in the queues.
  241:  */
  242: 
  243: static __inline void
  244: bufspacewakeup(void)
  245: {
  246: 	/*
  247: 	 * If someone is waiting for BUF space, wake them up.  Even
  248: 	 * though we haven't freed the kva space yet, the waiting
  249: 	 * process will be able to now.
  250: 	 */
  251: 	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
  252: 		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
  253: 		wakeup(&needsbuffer);
  254: 	}
  255: }
  256: 
  257: /*
  258:  * runningbufwakeup() - in-progress I/O accounting.
  259:  *
  260:  */
  261: static __inline void
  262: runningbufwakeup(struct buf *bp)
  263: {
  264: 	if (bp->b_runningbufspace) {
  265: 		runningbufspace -= bp->b_runningbufspace;
  266: 		bp->b_runningbufspace = 0;
  267: 		if (runningbufreq && runningbufspace <= lorunningspace) {
  268: 			runningbufreq = 0;
  269: 			wakeup(&runningbufreq);
  270: 		}
  271: 	}
  272: }
  273: 
  274: /*
  275:  *	bufcountwakeup:
  276:  *
  277:  *	Called when a buffer has been added to one of the free queues to
  278:  *	account for the buffer and to wakeup anyone waiting for free buffers.
  279:  *	This typically occurs when large amounts of metadata are being handled
  280:  *	by the buffer cache ( else buffer space runs out first, usually ).
  281:  */
  282: 
  283: static __inline void
  284: bufcountwakeup(void) 
  285: {
  286: 	++numfreebuffers;
  287: 	if (needsbuffer) {
  288: 		needsbuffer &= ~VFS_BIO_NEED_ANY;
  289: 		if (numfreebuffers >= hifreebuffers)
  290: 			needsbuffer &= ~VFS_BIO_NEED_FREE;
  291: 		wakeup(&needsbuffer);
  292: 	}
  293: }
  294: 
  295: /*
  296:  *	waitrunningbufspace()
  297:  *
  298:  *	runningbufspace is a measure of the amount of I/O currently
  299:  *	running.  This routine is used in async-write situations to
  300:  *	prevent creating huge backups of pending writes to a device.
  301:  *	Only asynchronous writes are governed by this function.  
  302:  *
  303:  *	Reads will adjust runningbufspace, but will not block based on it.
  304:  *	The read load has a side effect of reducing the allowed write load.
  305:  *
  306:  *	This does NOT turn an async write into a sync write.  It waits
  307:  *	for earlier writes to complete and generally returns before the
  308:  *	caller's write has reached the device.
  309:  */
  310: static __inline void
  311: waitrunningbufspace(void)
  312: {
  313: 	while (runningbufspace > hirunningspace) {
  314: 		int s;
  315: 
  316: 		s = splbio();	/* fix race against interrupt/biodone() */
  317: 		++runningbufreq;
  318: 		tsleep(&runningbufreq, 0, "wdrain", 0);
  319: 		splx(s);
  320: 	}
  321: }
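
/*
 * A minimal sketch of the accounting this throttle pairs with, as done in
 * bwrite() below: the write is charged against runningbufspace before it
 * is issued, and an async write (without B_NOWDRAIN) then waits here if
 * too much I/O is already in flight.  runningbufwakeup() above returns
 * the space on completion and wakes the sleeper.
 */
#if 0	/* illustrative sketch only */
	int oldflags = bp->b_flags;

	bp->b_runningbufspace = bp->b_bufsize;
	runningbufspace += bp->b_runningbufspace;
	VOP_STRATEGY(bp->b_vp, bp);
	if ((oldflags & B_ASYNC) && (oldflags & B_NOWDRAIN) == 0)
		waitrunningbufspace();
#endif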
  322: 
  323: /*
  324:  *	vfs_buf_test_cache:
  325:  *
  326:  *	Called when a buffer is extended.  This function clears the B_CACHE
  327:  *	bit if the newly extended portion of the buffer does not contain
  328:  *	valid data.
  329:  */
  330: static __inline__
  331: void
  332: vfs_buf_test_cache(struct buf *bp,
  333: 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
  334: 		  vm_page_t m)
  335: {
  336: 	if (bp->b_flags & B_CACHE) {
  337: 		int base = (foff + off) & PAGE_MASK;
  338: 		if (vm_page_is_valid(m, base, size) == 0)
  339: 			bp->b_flags &= ~B_CACHE;
  340: 	}
  341: }
  342: 
  343: static __inline__
  344: void
  345: bd_wakeup(int dirtybuflevel)
  346: {
  347: 	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
  348: 		bd_request = 1;
  349: 		wakeup(&bd_request);
  350: 	}
  351: }
  352: 
  353: /*
  354:  * bd_speedup - speedup the buffer cache flushing code
  355:  */
  356: 
  357: static __inline__
  358: void
  359: bd_speedup(void)
  360: {
  361: 	bd_wakeup(1);
  362: }
  363: 
  364: /*
  365:  * Initialize buffer headers and related structures. 
  366:  */
  367: 
  368: caddr_t
  369: bufhashinit(caddr_t vaddr)
  370: {
  371: 	/* first, make a null hash table */
  372: 	bufhashshift = 29;
  373: 	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
  374: 		bufhashshift--;
  375: 	bufhashtbl = (void *)vaddr;
  376: 	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
  377: 	--bufhashmask;
  378: 	return(vaddr);
  379: }
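
/*
 * Worked example with a hypothetical nbuf of 1024: the loop above doubles
 * bufhashmask from 8 until it reaches nbuf/4 = 256, dropping bufhashshift
 * from 29 to 24 along the way.  The table therefore has 256 buckets,
 * bufhashmask ends up as 0xff, and bufhash() above uses the top 8 bits of
 * the 32-bit multiplicative hash ((x * 0x9E376DB1) >> 24) as the bucket
 * index.
 */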
  380: 
  381: void
  382: bufinit(void)
  383: {
  384: 	struct buf *bp;
  385: 	int i;
  386: 
  387: 	TAILQ_INIT(&bswlist);
  388: 	LIST_INIT(&invalhash);
  389: 	lwkt_token_init(&buftimetoken);
  390: 
  391: 	for (i = 0; i <= bufhashmask; i++)
  392: 		LIST_INIT(&bufhashtbl[i]);
  393: 
  394: 	/* next, make a null set of free lists */
  395: 	for (i = 0; i < BUFFER_QUEUES; i++)
  396: 		TAILQ_INIT(&bufqueues[i]);
  397: 
  398: 	/* finally, initialize each buffer header and stick on empty q */
  399: 	for (i = 0; i < nbuf; i++) {
  400: 		bp = &buf[i];
  401: 		bzero(bp, sizeof *bp);
  402: 		bp->b_flags = B_INVAL;	/* we're just an empty header */
  403: 		bp->b_dev = NODEV;
  404: 		bp->b_qindex = QUEUE_EMPTY;
  405: 		bp->b_xflags = 0;
  406: 		LIST_INIT(&bp->b_dep);
  407: 		BUF_LOCKINIT(bp);
  408: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
  409: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
  410: 	}
  411: 
  412: 	/*
  413: 	 * maxbufspace is the absolute maximum amount of buffer space we are 
  414: 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
  415: 	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
  416: 	 * used by most other processes.  The differential is required to 
  417: 	 * ensure that buf_daemon is able to run when other processes might 
  418: 	 * be blocked waiting for buffer space.
  419: 	 *
   420: 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
  421: 	 * this may result in KVM fragmentation which is not handled optimally
  422: 	 * by the system.
  423: 	 */
  424: 	maxbufspace = nbuf * BKVASIZE;
  425: 	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
  426: 	lobufspace = hibufspace - MAXBSIZE;
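
	/*
	 * Worked example with hypothetical values nbuf = 2048, BKVASIZE = 8K
	 * and MAXBSIZE = 64K: maxbufspace = 2048 * 8K = 16MB, hibufspace =
	 * max(12MB, 16MB - 640K) = roughly 15.4MB, and lobufspace sits one
	 * MAXBSIZE (64K) below that.  Only buf_daemon is expected to use the
	 * headroom between hibufspace and maxbufspace.
	 */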
  427: 
  428: 	lorunningspace = 512 * 1024;
  429: 	hirunningspace = 1024 * 1024;
  430: 
  431: /*
  432:  * Limit the amount of malloc memory since it is wired permanently into
  433:  * the kernel space.  Even though this is accounted for in the buffer
  434:  * allocation, we don't want the malloced region to grow uncontrolled.
   435:  * The malloc scheme improves memory utilization significantly for average
  436:  * (small) directories.
  437:  */
  438: 	maxbufmallocspace = hibufspace / 20;
  439: 
  440: /*
   441:  * Reduce the chance of a deadlock occurring by limiting the number
  442:  * of delayed-write dirty buffers we allow to stack up.
  443:  */
  444: 	hidirtybuffers = nbuf / 4 + 20;
  445: 	numdirtybuffers = 0;
  446: /*
  447:  * To support extreme low-memory systems, make sure hidirtybuffers cannot
  448:  * eat up all available buffer space.  This occurs when our minimum cannot
  449:  * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
  450:  * BKVASIZE'd (8K) buffers.
  451:  */
  452: 	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
  453: 		hidirtybuffers >>= 1;
  454: 	}
  455: 	lodirtybuffers = hidirtybuffers / 2;
  456: 
  457: /*
  458:  * Try to keep the number of free buffers in the specified range,
  459:  * and give special processes (e.g. like buf_daemon) access to an 
  460:  * emergency reserve.
  461:  */
  462: 	lofreebuffers = nbuf / 18 + 5;
  463: 	hifreebuffers = 2 * lofreebuffers;
  464: 	numfreebuffers = nbuf;
  465: 
  466: /*
  467:  * Maximum number of async ops initiated per buf_daemon loop.  This is
   468:  * somewhat of a hack at the moment; we really need to limit ourselves
  469:  * based on the number of bytes of I/O in-transit that were initiated
  470:  * from buf_daemon.
  471:  */
  472: 
  473: 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
  474: 	bogus_page = vm_page_alloc(kernel_object,
  475: 			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
  476: 			VM_ALLOC_NORMAL);
  477: 	vmstats.v_wire_count++;
  478: 
  479: }
  480: 
  481: /*
  482:  * bfreekva() - free the kva allocation for a buffer.
  483:  *
  484:  *	Must be called at splbio() or higher as this is the only locking for
  485:  *	buffer_map.
  486:  *
  487:  *	Since this call frees up buffer space, we call bufspacewakeup().
  488:  */
  489: static void
  490: bfreekva(struct buf * bp)
  491: {
  492: 	int count;
  493: 
  494: 	if (bp->b_kvasize) {
  495: 		++buffreekvacnt;
  496: 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
  497: 		vm_map_lock(buffer_map);
  498: 		bufspace -= bp->b_kvasize;
  499: 		vm_map_delete(buffer_map,
  500: 		    (vm_offset_t) bp->b_kvabase,
  501: 		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize,
  502: 		    &count
  503: 		);
  504: 		vm_map_unlock(buffer_map);
  505: 		vm_map_entry_release(count);
  506: 		bp->b_kvasize = 0;
  507: 		bufspacewakeup();
  508: 	}
  509: }
  510: 
  511: /*
  512:  *	bremfree:
  513:  *
  514:  *	Remove the buffer from the appropriate free list.
  515:  */
  516: void
  517: bremfree(struct buf * bp)
  518: {
  519: 	int s = splbio();
  520: 	int old_qindex = bp->b_qindex;
  521: 
  522: 	if (bp->b_qindex != QUEUE_NONE) {
  523: 		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
  524: 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
  525: 		bp->b_qindex = QUEUE_NONE;
  526: 	} else {
  527: 		if (BUF_REFCNT(bp) <= 1)
  528: 			panic("bremfree: removing a buffer not on a queue");
  529: 	}
  530: 
  531: 	/*
  532: 	 * Fixup numfreebuffers count.  If the buffer is invalid or not
   533: 	 * delayed-write, and it was on the DIRTY, CLEAN, EMPTY, or EMPTYKVA queues,
  534: 	 * the buffer was free and we must decrement numfreebuffers.
  535: 	 */
  536: 	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
  537: 		switch(old_qindex) {
  538: 		case QUEUE_DIRTY:
  539: 		case QUEUE_CLEAN:
  540: 		case QUEUE_EMPTY:
  541: 		case QUEUE_EMPTYKVA:
  542: 			--numfreebuffers;
  543: 			break;
  544: 		default:
  545: 			break;
  546: 		}
  547: 	}
  548: 	splx(s);
  549: }
  550: 
  551: 
  552: /*
  553:  * Get a buffer with the specified data.  Look in the cache first.  We
  554:  * must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  555:  * is set, the buffer is valid and we do not have to do anything ( see
  556:  * getblk() ).
  557:  */
  558: int
  559: bread(struct vnode * vp, daddr_t blkno, int size, struct buf ** bpp)
  560: {
  561: 	struct buf *bp;
  562: 
  563: 	bp = getblk(vp, blkno, size, 0, 0);
  564: 	*bpp = bp;
  565: 
  566: 	/* if not found in cache, do some I/O */
  567: 	if ((bp->b_flags & B_CACHE) == 0) {
  568: 		KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
  569: 		bp->b_flags |= B_READ;
  570: 		bp->b_flags &= ~(B_ERROR | B_INVAL);
  571: 		vfs_busy_pages(bp, 0);
  572: 		VOP_STRATEGY(vp, bp);
  573: 		return (biowait(bp));
  574: 	}
  575: 	return (0);
  576: }
  577: 
  578: /*
  579:  * Operates like bread, but also starts asynchronous I/O on
  580:  * read-ahead blocks.  We must clear B_ERROR and B_INVAL prior
   581:  * to initiating I/O.  If B_CACHE is set, the buffer is valid
  582:  * and we do not have to do anything.
  583:  */
  584: int
  585: breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno,
  586: 	int *rabsize, int cnt, struct buf ** bpp)
  587: {
  588: 	struct buf *bp, *rabp;
  589: 	int i;
  590: 	int rv = 0, readwait = 0;
  591: 
  592: 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
  593: 
  594: 	/* if not found in cache, do some I/O */
  595: 	if ((bp->b_flags & B_CACHE) == 0) {
  596: 		bp->b_flags |= B_READ;
  597: 		bp->b_flags &= ~(B_ERROR | B_INVAL);
  598: 		vfs_busy_pages(bp, 0);
  599: 		VOP_STRATEGY(vp, bp);
  600: 		++readwait;
  601: 	}
  602: 
  603: 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
  604: 		if (inmem(vp, *rablkno))
  605: 			continue;
  606: 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
  607: 
  608: 		if ((rabp->b_flags & B_CACHE) == 0) {
  609: 			rabp->b_flags |= B_READ | B_ASYNC;
  610: 			rabp->b_flags &= ~(B_ERROR | B_INVAL);
  611: 			vfs_busy_pages(rabp, 0);
  612: 			BUF_KERNPROC(rabp);
  613: 			VOP_STRATEGY(vp, rabp);
  614: 		} else {
  615: 			brelse(rabp);
  616: 		}
  617: 	}
  618: 
  619: 	if (readwait) {
  620: 		rv = biowait(bp);
  621: 	}
  622: 	return (rv);
  623: }
  624: 
  625: /*
  626:  * Write, release buffer on completion.  (Done by iodone
  627:  * if async).  Do not bother writing anything if the buffer
  628:  * is invalid.
  629:  *
  630:  * Note that we set B_CACHE here, indicating that buffer is
  631:  * fully valid and thus cacheable.  This is true even of NFS
  632:  * now so we set it generally.  This could be set either here 
  633:  * or in biodone() since the I/O is synchronous.  We put it
  634:  * here.
  635:  */
  636: int
  637: bwrite(struct buf * bp)
  638: {
  639: 	int oldflags, s;
  640: 	struct buf *newbp;
  641: 
  642: 	if (bp->b_flags & B_INVAL) {
  643: 		brelse(bp);
  644: 		return (0);
  645: 	}
  646: 
  647: 	oldflags = bp->b_flags;
  648: 
  649: 	if (BUF_REFCNT(bp) == 0)
  650: 		panic("bwrite: buffer is not busy???");
  651: 	s = splbio();
  652: 	/*
  653: 	 * If a background write is already in progress, delay
  654: 	 * writing this block if it is asynchronous. Otherwise
  655: 	 * wait for the background write to complete.
  656: 	 */
  657: 	if (bp->b_xflags & BX_BKGRDINPROG) {
  658: 		if (bp->b_flags & B_ASYNC) {
  659: 			splx(s);
  660: 			bdwrite(bp);
  661: 			return (0);
  662: 		}
  663: 		bp->b_xflags |= BX_BKGRDWAIT;
  664: 		tsleep(&bp->b_xflags, 0, "biord", 0);
  665: 		if (bp->b_xflags & BX_BKGRDINPROG)
  666: 			panic("bwrite: still writing");
  667: 	}
  668: 
  669: 	/* Mark the buffer clean */
  670: 	bundirty(bp);
  671: 
  672: 	/*
  673: 	 * If this buffer is marked for background writing and we
  674: 	 * do not have to wait for it, make a copy and write the
  675: 	 * copy so as to leave this buffer ready for further use.
  676: 	 *
  677: 	 * This optimization eats a lot of memory.  If we have a page
   678: 	 * or buffer shortfall we can't do it.
  679: 	 */
  680: 	if (dobkgrdwrite &&
  681: 	    (bp->b_xflags & BX_BKGRDWRITE) &&
  682: 	    (bp->b_flags & B_ASYNC) &&
  683: 	    !vm_page_count_severe() &&
  684: 	    !buf_dirty_count_severe()) {
  685: 		if (bp->b_flags & B_CALL)
  686: 			panic("bwrite: need chained iodone");
  687: 
  688: 		/* get a new block */
  689: 		newbp = geteblk(bp->b_bufsize);
  690: 
  691: 		/* set it to be identical to the old block */
  692: 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
  693: 		bgetvp(bp->b_vp, newbp);
  694: 		newbp->b_lblkno = bp->b_lblkno;
  695: 		newbp->b_blkno = bp->b_blkno;
  696: 		newbp->b_offset = bp->b_offset;
  697: 		newbp->b_iodone = vfs_backgroundwritedone;
  698: 		newbp->b_flags |= B_ASYNC | B_CALL;
  699: 		newbp->b_flags &= ~B_INVAL;
  700: 
  701: 		/* move over the dependencies */
  702: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
  703: 			(*bioops.io_movedeps)(bp, newbp);
  704: 
  705: 		/*
  706: 		 * Initiate write on the copy, release the original to
  707: 		 * the B_LOCKED queue so that it cannot go away until
  708: 		 * the background write completes. If not locked it could go
  709: 		 * away and then be reconstituted while it was being written.
  710: 		 * If the reconstituted buffer were written, we could end up
  711: 		 * with two background copies being written at the same time.
  712: 		 */
  713: 		bp->b_xflags |= BX_BKGRDINPROG;
  714: 		bp->b_flags |= B_LOCKED;
  715: 		bqrelse(bp);
  716: 		bp = newbp;
  717: 	}
  718: 
  719: 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
  720: 	bp->b_flags |= B_WRITEINPROG | B_CACHE;
  721: 
  722: 	bp->b_vp->v_numoutput++;
  723: 	vfs_busy_pages(bp, 1);
  724: 
  725: 	/*
  726: 	 * Normal bwrites pipeline writes
  727: 	 */
  728: 	bp->b_runningbufspace = bp->b_bufsize;
  729: 	runningbufspace += bp->b_runningbufspace;
  730: 
  731: 	splx(s);
  732: 	if (oldflags & B_ASYNC)
  733: 		BUF_KERNPROC(bp);
  734: 	VOP_STRATEGY(bp->b_vp, bp);
  735: 
  736: 	if ((oldflags & B_ASYNC) == 0) {
  737: 		int rtval = biowait(bp);
  738: 		brelse(bp);
  739: 		return (rtval);
  740: 	} else if ((oldflags & B_NOWDRAIN) == 0) {
  741: 		/*
  742: 		 * don't allow the async write to saturate the I/O
  743: 		 * system.  Deadlocks can occur only if a device strategy
  744: 		 * routine (like in VN) turns around and issues another
  745: 		 * high-level write, in which case B_NOWDRAIN is expected
  746: 		 * to be set.   Otherwise we will not deadlock here because
  747: 		 * we are blocking waiting for I/O that is already in-progress
  748: 		 * to complete.
  749: 		 */
  750: 		waitrunningbufspace();
  751: 	}
  752: 
  753: 	return (0);
  754: }
  755: 
  756: /*
  757:  * Complete a background write started from bwrite.
  758:  */
  759: static void
  760: vfs_backgroundwritedone(struct buf *bp)
  761: {
  762: 	struct buf *origbp;
  763: 
  764: 	/*
  765: 	 * Find the original buffer that we are writing.
  766: 	 */
  767: 	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
  768: 		panic("backgroundwritedone: lost buffer");
  769: 	/*
  770: 	 * Process dependencies then return any unfinished ones.
  771: 	 */
  772: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
  773: 		(*bioops.io_complete)(bp);
  774: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
  775: 		(*bioops.io_movedeps)(bp, origbp);
  776: 	/*
  777: 	 * Clear the BX_BKGRDINPROG flag in the original buffer
  778: 	 * and awaken it if it is waiting for the write to complete.
  779: 	 * If BX_BKGRDINPROG is not set in the original buffer it must
  780: 	 * have been released and re-instantiated - which is not legal.
  781: 	 */
  782: 	KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
  783: 	origbp->b_xflags &= ~BX_BKGRDINPROG;
  784: 	if (origbp->b_xflags & BX_BKGRDWAIT) {
  785: 		origbp->b_xflags &= ~BX_BKGRDWAIT;
  786: 		wakeup(&origbp->b_xflags);
  787: 	}
  788: 	/*
  789: 	 * Clear the B_LOCKED flag and remove it from the locked
  790: 	 * queue if it currently resides there.
  791: 	 */
  792: 	origbp->b_flags &= ~B_LOCKED;
  793: 	if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
  794: 		bremfree(origbp);
  795: 		bqrelse(origbp);
  796: 	}
  797: 	/*
  798: 	 * This buffer is marked B_NOCACHE, so when it is released
  799: 	 * by biodone, it will be tossed. We mark it with B_READ
  800: 	 * to avoid biodone doing a second vwakeup.
  801: 	 */
  802: 	bp->b_flags |= B_NOCACHE | B_READ;
  803: 	bp->b_flags &= ~(B_CACHE | B_CALL | B_DONE);
  804: 	bp->b_iodone = 0;
  805: 	biodone(bp);
  806: }
  807: 
  808: /*
  809:  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  810:  * anything if the buffer is marked invalid.
  811:  *
  812:  * Note that since the buffer must be completely valid, we can safely
   813:  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
  814:  * biodone() in order to prevent getblk from writing the buffer
  815:  * out synchronously.
  816:  */
  817: void
  818: bdwrite(struct buf *bp)
  819: {
  820: 	if (BUF_REFCNT(bp) == 0)
  821: 		panic("bdwrite: buffer is not busy");
  822: 
  823: 	if (bp->b_flags & B_INVAL) {
  824: 		brelse(bp);
  825: 		return;
  826: 	}
  827: 	bdirty(bp);
  828: 
  829: 	/*
  830: 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
  831: 	 * true even of NFS now.
  832: 	 */
  833: 	bp->b_flags |= B_CACHE;
  834: 
  835: 	/*
  836: 	 * This bmap keeps the system from needing to do the bmap later,
  837: 	 * perhaps when the system is attempting to do a sync.  Since it
   838: 	 * is likely that the indirect block -- or whatever other data structure
   839: 	 * the filesystem needs -- is still in memory now, it is a good
  840: 	 * thing to do this.  Note also, that if the pageout daemon is
  841: 	 * requesting a sync -- there might not be enough memory to do
  842: 	 * the bmap then...  So, this is important to do.
  843: 	 */
  844: 	if (bp->b_lblkno == bp->b_blkno) {
  845: 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
  846: 	}
  847: 
  848: 	/*
  849: 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
  850: 	 */
  851: 	vfs_setdirty(bp);
  852: 
  853: 	/*
  854: 	 * We need to do this here to satisfy the vnode_pager and the
  855: 	 * pageout daemon, so that it thinks that the pages have been
  856: 	 * "cleaned".  Note that since the pages are in a delayed write
  857: 	 * buffer -- the VFS layer "will" see that the pages get written
  858: 	 * out on the next sync, or perhaps the cluster will be completed.
  859: 	 */
  860: 	vfs_clean_pages(bp);
  861: 	bqrelse(bp);
  862: 
  863: 	/*
  864: 	 * Wakeup the buffer flushing daemon if we have a lot of dirty
  865: 	 * buffers (midpoint between our recovery point and our stall
  866: 	 * point).
  867: 	 */
  868: 	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
  869: 
  870: 	/*
  871: 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
  872: 	 * due to the softdep code.
  873: 	 */
  874: }
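
/*
 * A minimal sketch of the typical filesystem-side pattern these routines
 * support: read a block with bread(), modify it, then either bdwrite() it
 * (mark dirty, write later) or bwrite() it (write now).  "vp", "blkno"
 * and "size" stand for the caller's vnode, logical block number and
 * block size.
 */
#if 0	/* illustrative sketch only */
	struct buf *bp;
	int error;

	if ((error = bread(vp, blkno, size, &bp)) != 0) {
		brelse(bp);		/* release the failed buffer */
		return (error);
	}
	/* ... modify bp->b_data ... */
	bdwrite(bp);			/* delayed write; or bwrite(bp) for sync */
#endif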
  875: 
  876: /*
  877:  *	bdirty:
  878:  *
  879:  *	Turn buffer into delayed write request.  We must clear B_READ and
  880:  *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to 
  881:  *	itself to properly update it in the dirty/clean lists.  We mark it
  882:  *	B_DONE to ensure that any asynchronization of the buffer properly
  883:  *	clears B_DONE ( else a panic will occur later ).  
  884:  *
  885:  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
  886:  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
  887:  *	should only be called if the buffer is known-good.
  888:  *
  889:  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  890:  *	count.
  891:  *
  892:  *	Must be called at splbio().
  893:  *	The buffer must be on QUEUE_NONE.
  894:  */
  895: void
  896: bdirty(struct buf *bp)
  897: {
  898: 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
  899: 	bp->b_flags &= ~(B_READ|B_RELBUF);
  900: 
  901: 	if ((bp->b_flags & B_DELWRI) == 0) {
  902: 		bp->b_flags |= B_DONE | B_DELWRI;
  903: 		reassignbuf(bp, bp->b_vp);
  904: 		++numdirtybuffers;
  905: 		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
  906: 	}
  907: }
  908: 
  909: /*
  910:  *	bundirty:
  911:  *
  912:  *	Clear B_DELWRI for buffer.
  913:  *
  914:  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  915:  *	count.
  916:  *	
  917:  *	Must be called at splbio().
  918:  *	The buffer must be on QUEUE_NONE.
  919:  */
  920: 
  921: void
  922: bundirty(struct buf *bp)
  923: {
  924: 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
  925: 
  926: 	if (bp->b_flags & B_DELWRI) {
  927: 		bp->b_flags &= ~B_DELWRI;
  928: 		reassignbuf(bp, bp->b_vp);
  929: 		--numdirtybuffers;
  930: 		numdirtywakeup(lodirtybuffers);
  931: 	}
  932: 	/*
  933: 	 * Since it is now being written, we can clear its deferred write flag.
  934: 	 */
  935: 	bp->b_flags &= ~B_DEFERRED;
  936: }
  937: 
  938: /*
  939:  *	bawrite:
  940:  *
  941:  *	Asynchronous write.  Start output on a buffer, but do not wait for
  942:  *	it to complete.  The buffer is released when the output completes.
  943:  *
  944:  *	bwrite() ( or the VOP routine anyway ) is responsible for handling 
  945:  *	B_INVAL buffers.  Not us.
  946:  */
  947: void
  948: bawrite(struct buf * bp)
  949: {
  950: 	bp->b_flags |= B_ASYNC;
  951: 	(void) VOP_BWRITE(bp->b_vp, bp);
  952: }
  953: 
  954: /*
  955:  *	bowrite:
  956:  *
  957:  *	Ordered write.  Start output on a buffer, and flag it so that the 
  958:  *	device will write it in the order it was queued.  The buffer is 
  959:  *	released when the output completes.  bwrite() ( or the VOP routine
  960:  *	anyway ) is responsible for handling B_INVAL buffers.
  961:  */
  962: int
  963: bowrite(struct buf * bp)
  964: {
  965: 	bp->b_flags |= B_ORDERED | B_ASYNC;
  966: 	return (VOP_BWRITE(bp->b_vp, bp));
  967: }
  968: 
  969: /*
  970:  *	bwillwrite:
  971:  *
  972:  *	Called prior to the locking of any vnodes when we are expecting to
  973:  *	write.  We do not want to starve the buffer cache with too many
  974:  *	dirty buffers so we block here.  By blocking prior to the locking
  975:  *	of any vnodes we attempt to avoid the situation where a locked vnode
  976:  *	prevents the various system daemons from flushing related buffers.
  977:  */
  978: 
  979: void
  980: bwillwrite(void)
  981: {
  982: 	if (numdirtybuffers >= hidirtybuffers) {
  983: 		int s;
  984: 
  985: 		s = splbio();
  986: 		while (numdirtybuffers >= hidirtybuffers) {
  987: 			bd_wakeup(1);
  988: 			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
  989: 			tsleep(&needsbuffer, 0, "flswai", 0);
  990: 		}
  991: 		splx(s);
  992: 	}
  993: }
  994: 
  995: /*
  996:  * Return true if we have too many dirty buffers.
  997:  */
  998: int
  999: buf_dirty_count_severe(void)
 1000: {
 1001: 	return(numdirtybuffers >= hidirtybuffers);
 1002: }
 1003: 
 1004: /*
 1005:  *	brelse:
 1006:  *
 1007:  *	Release a busy buffer and, if requested, free its resources.  The
 1008:  *	buffer will be stashed in the appropriate bufqueue[] allowing it
 1009:  *	to be accessed later as a cache entity or reused for other purposes.
 1010:  */
 1011: void
 1012: brelse(struct buf * bp)
 1013: {
 1014: 	int s;
 1015: 
 1016: 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1017: 
 1018: 	s = splbio();
 1019: 
 1020: 	if (bp->b_flags & B_LOCKED)
 1021: 		bp->b_flags &= ~B_ERROR;
 1022: 
 1023: 	if ((bp->b_flags & (B_READ | B_ERROR | B_INVAL)) == B_ERROR) {
 1024: 		/*
 1025: 		 * Failed write, redirty.  Must clear B_ERROR to prevent
 1026: 		 * pages from being scrapped.  If B_INVAL is set then
 1027: 		 * this case is not run and the next case is run to 
 1028: 		 * destroy the buffer.  B_INVAL can occur if the buffer
 1029: 		 * is outside the range supported by the underlying device.
 1030: 		 */
 1031: 		bp->b_flags &= ~B_ERROR;
 1032: 		bdirty(bp);
 1033: 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
 1034: 	    (bp->b_bufsize <= 0)) {
 1035: 		/*
 1036: 		 * Either a failed I/O or we were asked to free or not
 1037: 		 * cache the buffer.
 1038: 		 */
 1039: 		bp->b_flags |= B_INVAL;
 1040: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 1041: 			(*bioops.io_deallocate)(bp);
 1042: 		if (bp->b_flags & B_DELWRI) {
 1043: 			--numdirtybuffers;
 1044: 			numdirtywakeup(lodirtybuffers);
 1045: 		}
 1046: 		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
 1047: 		if ((bp->b_flags & B_VMIO) == 0) {
 1048: 			if (bp->b_bufsize)
 1049: 				allocbuf(bp, 0);
 1050: 			if (bp->b_vp)
 1051: 				brelvp(bp);
 1052: 		}
 1053: 	}
 1054: 
 1055: 	/*
 1056: 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release() 
 1057: 	 * is called with B_DELWRI set, the underlying pages may wind up
 1058: 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 1059: 	 * because pages associated with a B_DELWRI bp are marked clean.
 1060: 	 * 
 1061: 	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
 1062: 	 * if B_DELWRI is set.
 1063: 	 *
 1064: 	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
 1065: 	 * on pages to return pages to the VM page queues.
 1066: 	 */
 1067: 	if (bp->b_flags & B_DELWRI)
 1068: 		bp->b_flags &= ~B_RELBUF;
 1069: 	else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
 1070: 		bp->b_flags |= B_RELBUF;
 1071: 
 1072: 	/*
 1073: 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
  1074: 	 * constituted, not even NFS buffers now.  Two flags affect this.  If
 1075: 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 1076: 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 1077: 	 *
 1078: 	 * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
 1079: 	 * invalidated.  B_ERROR cannot be set for a failed write unless the
 1080: 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 1081: 	 *
 1082: 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 1083: 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 1084: 	 * the commit state and we cannot afford to lose the buffer. If the
 1085: 	 * buffer has a background write in progress, we need to keep it
 1086: 	 * around to prevent it from being reconstituted and starting a second
 1087: 	 * background write.
 1088: 	 */
 1089: 	if ((bp->b_flags & B_VMIO)
 1090: 	    && !(bp->b_vp->v_tag == VT_NFS &&
 1091: 		 !vn_isdisk(bp->b_vp, NULL) &&
 1092: 		 (bp->b_flags & B_DELWRI))
 1093: 	    ) {
 1094: 
 1095: 		int i, j, resid;
 1096: 		vm_page_t m;
 1097: 		off_t foff;
 1098: 		vm_pindex_t poff;
 1099: 		vm_object_t obj;
 1100: 		struct vnode *vp;
 1101: 
 1102: 		vp = bp->b_vp;
 1103: 
 1104: 		/*
 1105: 		 * Get the base offset and length of the buffer.  Note that 
 1106: 		 * in the VMIO case if the buffer block size is not
 1107: 		 * page-aligned then b_data pointer may not be page-aligned.
  1108: 		 * page-aligned then the b_data pointer may not be page-aligned.
 1109: 		 *
  1110: 		 * block sizes less than DEV_BSIZE (usually 512) are not
 1111: 		 * supported due to the page granularity bits (m->valid,
 1112: 		 * m->dirty, etc...). 
 1113: 		 *
 1114: 		 * See man buf(9) for more information
 1115: 		 */
 1116: 
 1117: 		resid = bp->b_bufsize;
 1118: 		foff = bp->b_offset;
 1119: 
 1120: 		for (i = 0; i < bp->b_npages; i++) {
 1121: 			m = bp->b_pages[i];
 1122: 			vm_page_flag_clear(m, PG_ZERO);
 1123: 			/*
 1124: 			 * If we hit a bogus page, fixup *all* of them
 1125: 			 * now.  Note that we left these pages wired
 1126: 			 * when we removed them so they had better exist,
 1127: 			 * and they cannot be ripped out from under us so
 1128: 			 * no splvm() protection is necessary.
 1129: 			 */
 1130: 			if (m == bogus_page) {
 1131: 				VOP_GETVOBJECT(vp, &obj);
 1132: 				poff = OFF_TO_IDX(bp->b_offset);
 1133: 
 1134: 				for (j = i; j < bp->b_npages; j++) {
 1135: 					vm_page_t mtmp;
 1136: 
 1137: 					mtmp = bp->b_pages[j];
 1138: 					if (mtmp == bogus_page) {
 1139: 						mtmp = vm_page_lookup(obj, poff + j);
 1140: 						if (!mtmp) {
 1141: 							panic("brelse: page missing");
 1142: 						}
 1143: 						bp->b_pages[j] = mtmp;
 1144: 					}
 1145: 				}
 1146: 
 1147: 				if ((bp->b_flags & B_INVAL) == 0) {
 1148: 					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 1149: 				}
 1150: 				m = bp->b_pages[i];
 1151: 			}
 1152: 
 1153: 			/*
 1154: 			 * Invalidate the backing store if B_NOCACHE is set
 1155: 			 * (e.g. used with vinvalbuf()).  If this is NFS
 1156: 			 * we impose a requirement that the block size be
 1157: 			 * a multiple of PAGE_SIZE and create a temporary
 1158: 			 * hack to basically invalidate the whole page.  The
 1159: 			 * problem is that NFS uses really odd buffer sizes
 1160: 			 * especially when tracking piecemeal writes and
 1161: 			 * it also vinvalbuf()'s a lot, which would result
 1162: 			 * in only partial page validation and invalidation
 1163: 			 * here.  If the file page is mmap()'d, however,
 1164: 			 * all the valid bits get set so after we invalidate
 1165: 			 * here we would end up with weird m->valid values
 1166: 			 * like 0xfc.  nfs_getpages() can't handle this so
 1167: 			 * we clear all the valid bits for the NFS case
 1168: 			 * instead of just some of them.
 1169: 			 *
 1170: 			 * The real bug is the VM system having to set m->valid
 1171: 			 * to VM_PAGE_BITS_ALL for faulted-in pages, which
 1172: 			 * itself is an artifact of the whole 512-byte
 1173: 			 * granular mess that exists to support odd block 
 1174: 			 * sizes and UFS meta-data block sizes (e.g. 6144).
 1175: 			 * A complete rewrite is required.
 1176: 			 */
 1177: 			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
 1178: 				int poffset = foff & PAGE_MASK;
 1179: 				int presid;
 1180: 
 1181: 				presid = PAGE_SIZE - poffset;
 1182: 				if (bp->b_vp->v_tag == VT_NFS &&
 1183: 				    bp->b_vp->v_type == VREG) {
 1184: 					; /* entire page */
 1185: 				} else if (presid > resid) {
 1186: 					presid = resid;
 1187: 				}
 1188: 				KASSERT(presid >= 0, ("brelse: extra page"));
 1189: 				vm_page_set_invalid(m, poffset, presid);
 1190: 			}
 1191: 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
 1192: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 1193: 		}
 1194: 
 1195: 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 1196: 			vfs_vmio_release(bp);
 1197: 
 1198: 	} else if (bp->b_flags & B_VMIO) {
 1199: 
 1200: 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 1201: 			vfs_vmio_release(bp);
 1202: 
 1203: 	}
 1204: 			
 1205: 	if (bp->b_qindex != QUEUE_NONE)
 1206: 		panic("brelse: free buffer onto another queue???");
 1207: 	if (BUF_REFCNT(bp) > 1) {
 1208: 		/* Temporary panic to verify exclusive locking */
 1209: 		/* This panic goes away when we allow shared refs */
 1210: 		panic("brelse: multiple refs");
 1211: 		/* do not release to free list */
 1212: 		BUF_UNLOCK(bp);
 1213: 		splx(s);
 1214: 		return;
 1215: 	}
 1216: 
 1217: 	/* enqueue */
 1218: 
 1219: 	/* buffers with no memory */
 1220: 	if (bp->b_bufsize == 0) {
 1221: 		bp->b_flags |= B_INVAL;
 1222: 		bp->b_xflags &= ~BX_BKGRDWRITE;
 1223: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1224: 			panic("losing buffer 1");
 1225: 		if (bp->b_kvasize) {
 1226: 			bp->b_qindex = QUEUE_EMPTYKVA;
 1227: 		} else {
 1228: 			bp->b_qindex = QUEUE_EMPTY;
 1229: 		}
 1230: 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 1231: 		LIST_REMOVE(bp, b_hash);
 1232: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1233: 		bp->b_dev = NODEV;
 1234: 	/* buffers with junk contents */
 1235: 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
 1236: 		bp->b_flags |= B_INVAL;
 1237: 		bp->b_xflags &= ~BX_BKGRDWRITE;
 1238: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1239: 			panic("losing buffer 2");
 1240: 		bp->b_qindex = QUEUE_CLEAN;
 1241: 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1242: 		LIST_REMOVE(bp, b_hash);
 1243: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1244: 		bp->b_dev = NODEV;
 1245: 
 1246: 	/* buffers that are locked */
 1247: 	} else if (bp->b_flags & B_LOCKED) {
 1248: 		bp->b_qindex = QUEUE_LOCKED;
 1249: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 1250: 
 1251: 	/* remaining buffers */
 1252: 	} else {
 1253: 		switch(bp->b_flags & (B_DELWRI|B_AGE)) {
 1254: 		case B_DELWRI | B_AGE:
 1255: 		    bp->b_qindex = QUEUE_DIRTY;
 1256: 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1257: 		    break;
 1258: 		case B_DELWRI:
 1259: 		    bp->b_qindex = QUEUE_DIRTY;
 1260: 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1261: 		    break;
 1262: 		case B_AGE:
 1263: 		    bp->b_qindex = QUEUE_CLEAN;
 1264: 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1265: 		    break;
 1266: 		default:
 1267: 		    bp->b_qindex = QUEUE_CLEAN;
 1268: 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1269: 		    break;
 1270: 		}
 1271: 	}
 1272: 
 1273: 	/*
 1274: 	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
 1275: 	 * on the correct queue.
 1276: 	 */
 1277: 	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
 1278: 		bundirty(bp);
 1279: 
 1280: 	/*
 1281: 	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
 1282: 	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
 1283: 	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
 1284: 	 * if B_INVAL is set ).
 1285: 	 */
 1286: 
 1287: 	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
 1288: 		bufcountwakeup();
 1289: 
 1290: 	/*
 1291: 	 * Something we can maybe free or reuse
 1292: 	 */
 1293: 	if (bp->b_bufsize || bp->b_kvasize)
 1294: 		bufspacewakeup();
 1295: 
 1296: 	/* unlock */
 1297: 	BUF_UNLOCK(bp);
 1298: 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
 1299: 			B_DIRECT | B_NOWDRAIN);
 1300: 	splx(s);
 1301: }
 1302: 
 1303: /*
 1304:  * Release a buffer back to the appropriate queue but do not try to free
 1305:  * it.  The buffer is expected to be used again soon.
 1306:  *
 1307:  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 1308:  * biodone() to requeue an async I/O on completion.  It is also used when
 1309:  * known good buffers need to be requeued but we think we may need the data
 1310:  * again soon.
 1311:  *
 1312:  * XXX we should be able to leave the B_RELBUF hint set on completion.
 1313:  */
 1314: void
 1315: bqrelse(struct buf * bp)
 1316: {
 1317: 	int s;
 1318: 
 1319: 	s = splbio();
 1320: 
 1321: 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1322: 
 1323: 	if (bp->b_qindex != QUEUE_NONE)
 1324: 		panic("bqrelse: free buffer onto another queue???");
 1325: 	if (BUF_REFCNT(bp) > 1) {
 1326: 		/* do not release to free list */
 1327: 		panic("bqrelse: multiple refs");
 1328: 		BUF_UNLOCK(bp);
 1329: 		splx(s);
 1330: 		return;
 1331: 	}
 1332: 	if (bp->b_flags & B_LOCKED) {
 1333: 		bp->b_flags &= ~B_ERROR;
 1334: 		bp->b_qindex = QUEUE_LOCKED;
 1335: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 1336: 		/* buffers with stale but valid contents */
 1337: 	} else if (bp->b_flags & B_DELWRI) {
 1338: 		bp->b_qindex = QUEUE_DIRTY;
 1339: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1340: 	} else if (vm_page_count_severe()) {
 1341: 		/*
 1342: 		 * We are too low on memory, we have to try to free the
 1343: 		 * buffer (most importantly: the wired pages making up its
 1344: 		 * backing store) *now*.
 1345: 		 */
 1346: 		splx(s);
 1347: 		brelse(bp);
 1348: 		return;
 1349: 	} else {
 1350: 		bp->b_qindex = QUEUE_CLEAN;
 1351: 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1352: 	}
 1353: 
 1354: 	if ((bp->b_flags & B_LOCKED) == 0 &&
 1355: 	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
 1356: 		bufcountwakeup();
 1357: 	}
 1358: 
 1359: 	/*
 1360: 	 * Something we can maybe free or reuse.
 1361: 	 */
 1362: 	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
 1363: 		bufspacewakeup();
 1364: 
 1365: 	/* unlock */
 1366: 	BUF_UNLOCK(bp);
 1367: 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 1368: 	splx(s);
 1369: }
 1370: 
 1371: static void
 1372: vfs_vmio_release(struct buf *bp)
 1373: {
 1374: 	int i, s;
 1375: 	vm_page_t m;
 1376: 
 1377: 	s = splvm();
 1378: 	for (i = 0; i < bp->b_npages; i++) {
 1379: 		m = bp->b_pages[i];
 1380: 		bp->b_pages[i] = NULL;
 1381: 		/*
 1382: 		 * In order to keep page LRU ordering consistent, put
 1383: 		 * everything on the inactive queue.
 1384: 		 */
 1385: 		vm_page_unwire(m, 0);
 1386: 		/*
 1387: 		 * We don't mess with busy pages, it is
 1388: 		 * the responsibility of the process that
 1389: 		 * busied the pages to deal with them.
 1390: 		 */
 1391: 		if ((m->flags & PG_BUSY) || (m->busy != 0))
 1392: 			continue;
 1393: 			
 1394: 		if (m->wire_count == 0) {
 1395: 			vm_page_flag_clear(m, PG_ZERO);
 1396: 			/*
 1397: 			 * Might as well free the page if we can and it has
 1398: 			 * no valid data.  We also free the page if the
 1399: 			 * buffer was used for direct I/O.
 1400: 			 */
 1401: 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
 1402: 				vm_page_busy(m);
 1403: 				vm_page_protect(m, VM_PROT_NONE);
 1404: 				vm_page_free(m);
 1405: 			} else if (bp->b_flags & B_DIRECT) {
 1406: 				vm_page_try_to_free(m);
 1407: 			} else if (vm_page_count_severe()) {
 1408: 				vm_page_try_to_cache(m);
 1409: 			}
 1410: 		}
 1411: 	}
 1412: 	splx(s);
 1413: 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
 1414: 	if (bp->b_bufsize) {
 1415: 		bufspacewakeup();
 1416: 		bp->b_bufsize = 0;
 1417: 	}
 1418: 	bp->b_npages = 0;
 1419: 	bp->b_flags &= ~B_VMIO;
 1420: 	if (bp->b_vp)
 1421: 		brelvp(bp);
 1422: }
 1423: 
 1424: /*
 1425:  * Check to see if a block is currently memory resident.
 1426:  */
 1427: struct buf *
 1428: gbincore(struct vnode * vp, daddr_t blkno)
 1429: {
 1430: 	struct buf *bp;
 1431: 	struct bufhashhdr *bh;
 1432: 
 1433: 	bh = bufhash(vp, blkno);
 1434: 
 1435: 	/* Search hash chain */
 1436: 	LIST_FOREACH(bp, bh, b_hash) {
 1437: 		/* hit */
 1438: 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
 1439: 		    (bp->b_flags & B_INVAL) == 0) {
 1440: 			break;
 1441: 		}
 1442: 	}
 1443: 	return (bp);
 1444: }
 1445: 
 1446: /*
 1447:  *	vfs_bio_awrite:
 1448:  *
 1449:  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  1450:  *	This is much better than the old way of writing only one buffer at
 1451:  *	a time.  Note that we may not be presented with the buffers in the 
 1452:  *	correct order, so we search for the cluster in both directions.
 1453:  */
 1454: int
 1455: vfs_bio_awrite(struct buf * bp)
 1456: {
 1457: 	int i;
 1458: 	int j;
 1459: 	daddr_t lblkno = bp->b_lblkno;
 1460: 	struct vnode *vp = bp->b_vp;
 1461: 	int s;
 1462: 	int ncl;
 1463: 	struct buf *bpa;
 1464: 	int nwritten;
 1465: 	int size;
 1466: 	int maxcl;
 1467: 
 1468: 	s = splbio();
 1469: 	/*
 1470: 	 * right now we support clustered writing only to regular files.  If
 1471: 	 * we find a clusterable block we could be in the middle of a cluster
  1472: 	 * rather than at the beginning.
 1473: 	 */
 1474: 	if ((vp->v_type == VREG) && 
 1475: 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 1476: 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 1477: 
 1478: 		size = vp->v_mount->mnt_stat.f_iosize;
 1479: 		maxcl = MAXPHYS / size;
 1480: 
 1481: 		for (i = 1; i < maxcl; i++) {
 1482: 			if ((bpa = gbincore(vp, lblkno + i)) &&
 1483: 			    BUF_REFCNT(bpa) == 0 &&
 1484: 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 1485: 			    (B_DELWRI | B_CLUSTEROK)) &&
 1486: 			    (bpa->b_bufsize == size)) {
 1487: 				if ((bpa->b_blkno == bpa->b_lblkno) ||
 1488: 				    (bpa->b_blkno !=
 1489: 				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
 1490: 					break;
 1491: 			} else {
 1492: 				break;
 1493: 			}
 1494: 		}
 1495: 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
 1496: 			if ((bpa = gbincore(vp, lblkno - j)) &&
 1497: 			    BUF_REFCNT(bpa) == 0 &&
 1498: 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 1499: 			    (B_DELWRI | B_CLUSTEROK)) &&
 1500: 			    (bpa->b_bufsize == size)) {
 1501: 				if ((bpa->b_blkno == bpa->b_lblkno) ||
 1502: 				    (bpa->b_blkno !=
 1503: 				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
 1504: 					break;
 1505: 			} else {
 1506: 				break;
 1507: 			}
 1508: 		}
 1509: 		--j;
 1510: 		ncl = i + j;
 1511: 		/*
 1512: 		 * this is a possible cluster write
 1513: 		 */
 1514: 		if (ncl != 1) {
 1515: 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
 1516: 			splx(s);
 1517: 			return nwritten;
 1518: 		}
 1519: 	}
 1520: 
 1521: 	BUF_LOCK(bp, LK_EXCLUSIVE);
 1522: 	bremfree(bp);
 1523: 	bp->b_flags |= B_ASYNC;
 1524: 
 1525: 	splx(s);
 1526: 	/*
 1527: 	 * default (old) behavior, writing out only one block
 1528: 	 *
 1529: 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 1530: 	 */
 1531: 	nwritten = bp->b_bufsize;
 1532: 	(void) VOP_BWRITE(bp->b_vp, bp);
 1533: 
 1534: 	return nwritten;
 1535: }
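
/*
 * Worked example with hypothetical values f_iosize = 16K and MAXPHYS =
 * 128K: maxcl = 8, so up to eight contiguous, clusterable B_DELWRI
 * buffers around lblkno can be gathered into a single cluster_wbuild()
 * call instead of eight separate writes.
 */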
 1536: 
 1537: /*
 1538:  *	getnewbuf:
 1539:  *
 1540:  *	Find and initialize a new buffer header, freeing up existing buffers 
 1541:  *	in the bufqueues as necessary.  The new buffer is returned locked.
 1542:  *
 1543:  *	Important:  B_INVAL is not set.  If the caller wishes to throw the
 1544:  *	buffer away, the caller must set B_INVAL prior to calling brelse().
 1545:  *
 1546:  *	We block if:
 1547:  *		We have insufficient buffer headers
 1548:  *		We have insufficient buffer space
 1549:  *		buffer_map is too fragmented ( space reservation fails )
 1550:  *		If we have to flush dirty buffers ( but we try to avoid this )
 1551:  *
 1552:  *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 1553:  *	Instead we ask the buf daemon to do it for us.  We attempt to
 1554:  *	avoid piecemeal wakeups of the pageout daemon.
 1555:  */
 1556: 
 1557: static struct buf *
 1558: getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
 1559: {
 1560: 	struct buf *bp;
 1561: 	struct buf *nbp;
 1562: 	int defrag = 0;
 1563: 	int nqindex;
 1564: 	static int flushingbufs;
 1565: 
 1566: 	/*
 1567: 	 * We can't afford to block since we might be holding a vnode lock,
 1568: 	 * which may prevent system daemons from running.  We deal with
 1569: 	 * low-memory situations by proactively returning memory and running
  1570: 	 * async I/O rather than sync I/O.
 1571: 	 */
 1572: 	
 1573: 	++getnewbufcalls;
 1574: 	--getnewbufrestarts;
 1575: restart:
 1576: 	++getnewbufrestarts;
 1577: 
 1578: 	/*
 1579: 	 * Setup for scan.  If we do not have enough free buffers,
 1580: 	 * we setup a degenerate case that immediately fails.  Note
  1581: 	 * that if we are a specially marked process, we are allowed to
 1582: 	 * dip into our reserves.
 1583: 	 *
 1584: 	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
 1585: 	 *
  1586: 	 * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
  1587: 	 * However, there are a number of cases (defragging, reusing, ...)
  1588: 	 * where we cannot back up.
 1589: 	 */
 1590: 	nqindex = QUEUE_EMPTYKVA;
 1591: 	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
 1592: 
 1593: 	if (nbp == NULL) {
 1594: 		/*
 1595: 		 * If no EMPTYKVA buffers and we are either
 1596: 		 * defragging or reusing, locate a CLEAN buffer
  1597: 		 * to free or reuse.  If bufspace usage is low
 1598: 		 * skip this step so we can allocate a new buffer.
 1599: 		 */
 1600: 		if (defrag || bufspace >= lobufspace) {
 1601: 			nqindex = QUEUE_CLEAN;
 1602: 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 1603: 		}
 1604: 
 1605: 		/*
 1606: 		 * If we could not find or were not allowed to reuse a
 1607: 		 * CLEAN buffer, check to see if it is ok to use an EMPTY
 1608: 		 * buffer.  We can only use an EMPTY buffer if allocating
 1609: 		 * its KVA would not otherwise run us out of buffer space.
 1610: 		 */
 1611: 		if (nbp == NULL && defrag == 0 &&
 1612: 		    bufspace + maxsize < hibufspace) {
 1613: 			nqindex = QUEUE_EMPTY;
 1614: 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
 1615: 		}
 1616: 	}
 1617: 
 1618: 	/*
 1619: 	 * Run scan, possibly freeing data and/or kva mappings on the fly,
 1620: 	 * depending on which queue we are scanning.
 1621: 	 */
 1622: 
 1623: 	while ((bp = nbp) != NULL) {
 1624: 		int qindex = nqindex;
 1625: 
 1626: 		/*
 1627: 		 * Calculate next bp ( we can only use it if we do not block
 1628: 		 * or do other fancy things ).
 1629: 		 */
 1630: 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
 1631: 			switch(qindex) {
 1632: 			case QUEUE_EMPTY:
 1633: 				nqindex = QUEUE_EMPTYKVA;
 1634: 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
 1635: 					break;
 1636: 				/* fall through */
 1637: 			case QUEUE_EMPTYKVA:
 1638: 				nqindex = QUEUE_CLEAN;
 1639: 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
 1640: 					break;
 1641: 				/* fall through */
 1642: 			case QUEUE_CLEAN:
 1643: 				/*
 1644: 				 * nbp is NULL. 
 1645: 				 */
 1646: 				break;
 1647: 			}
 1648: 		}
 1649: 
 1650: 		/*
 1651: 		 * Sanity Checks
 1652: 		 */
 1653: 		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 1654: 
 1655: 		/*
 1656: 		 * Note: we no longer distinguish between VMIO and non-VMIO
 1657: 		 * buffers.
 1658: 		 */
 1659: 
 1660: 		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
 1661: 
 1662: 		/*
 1663: 		 * If we are defragging then we need a buffer with 
 1664: 		 * b_kvasize != 0.  XXX this situation should no longer
 1665: 		 * occur, if defrag is non-zero the buffer's b_kvasize
 1666: 		 * should also be non-zero at this point.  XXX
 1667: 		 */
 1668: 		if (defrag && bp->b_kvasize == 0) {
 1669: 			printf("Warning: defrag empty buffer %p\n", bp);
 1670: 			continue;
 1671: 		}
 1672: 
 1673: 		/*
 1674: 		 * Start freeing the bp.  This is somewhat involved.  nbp
 1675: 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
 1676: 		 */
 1677: 
 1678: 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 1679: 			panic("getnewbuf: locked buf");
 1680: 		bremfree(bp);
 1681: 
 1682: 		if (qindex == QUEUE_CLEAN) {
 1683: 			if (bp->b_flags & B_VMIO) {
 1684: 				bp->b_flags &= ~B_ASYNC;
 1685: 				vfs_vmio_release(bp);
 1686: 			}
 1687: 			if (bp->b_vp)
 1688: 				brelvp(bp);
 1689: 		}
 1690: 
 1691: 		/*
 1692: 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 1693: 		 * the scan from this point on.
 1694: 		 *
 1695: 		 * Get the rest of the buffer freed up.  b_kva* is still
 1696: 		 * valid after this operation.
 1697: 		 */
 1698: 
 1699: 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 1700: 			(*bioops.io_deallocate)(bp);
 1701: 		if (bp->b_xflags & BX_BKGRDINPROG)
 1702: 			panic("losing buffer 3");
 1703: 		LIST_REMOVE(bp, b_hash);
 1704: 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1705: 
 1706: 		/*
 1707: 		 * spl protection not required when scrapping a buffer's
 1708: 		 * contents because it is already wired.
 1709: 		 */
 1710: 		if (bp->b_bufsize)
 1711: 			allocbuf(bp, 0);
 1712: 
 1713: 		bp->b_flags = 0;
 1714: 		bp->b_xflags = 0;
 1715: 		bp->b_dev = NODEV;
 1716: 		bp->b_vp = NULL;
 1717: 		bp->b_blkno = bp->b_lblkno = 0;
 1718: 		bp->b_offset = NOOFFSET;
 1719: 		bp->b_iodone = 0;
 1720: 		bp->b_error = 0;
 1721: 		bp->b_resid = 0;
 1722: 		bp->b_bcount = 0;
 1723: 		bp->b_npages = 0;
 1724: 		bp->b_dirtyoff = bp->b_dirtyend = 0;
 1725: 
 1726: 		LIST_INIT(&bp->b_dep);
 1727: 
 1728: 		/*
 1729: 		 * If we are defragging then free the buffer.
 1730: 		 */
 1731: 		if (defrag) {
 1732: 			bp->b_flags |= B_INVAL;
 1733: 			bfreekva(bp);
 1734: 			brelse(bp);
 1735: 			defrag = 0;
 1736: 			goto restart;
 1737: 		}
 1738: 
 1739: 		/*
 1740: 		 * If we are overcommitted then recover the buffer and its
 1741: 		 * KVM space.  This occurs in rare situations when multiple
 1742: 		 * processes are blocked in getnewbuf() or allocbuf().
 1743: 		 */
 1744: 		if (bufspace >= hibufspace)
 1745: 			flushingbufs = 1;
 1746: 		if (flushingbufs && bp->b_kvasize != 0) {
 1747: 			bp->b_flags |= B_INVAL;
 1748: 			bfreekva(bp);
 1749: 			brelse(bp);
 1750: 			goto restart;
 1751: 		}
 1752: 		if (bufspace < lobufspace)
 1753: 			flushingbufs = 0;
 1754: 		break;
 1755: 	}
 1756: 
 1757: 	/*
 1758: 	 * If we exhausted our list, sleep as appropriate.  We may have to
 1759: 	 * wakeup various daemons and write out some dirty buffers.
 1760: 	 *
 1761: 	 * Generally we are sleeping due to insufficient buffer space.
 1762: 	 */
 1763: 
 1764: 	if (bp == NULL) {
 1765: 		int flags;
 1766: 		char *waitmsg;
 1767: 
 1768: 		if (defrag) {
 1769: 			flags = VFS_BIO_NEED_BUFSPACE;
 1770: 			waitmsg = "nbufkv";
 1771: 		} else if (bufspace >= hibufspace) {
 1772: 			waitmsg = "nbufbs";
 1773: 			flags = VFS_BIO_NEED_BUFSPACE;
 1774: 		} else {
 1775: 			waitmsg = "newbuf";
 1776: 			flags = VFS_BIO_NEED_ANY;
 1777: 		}
 1778: 
 1779: 		bd_speedup();	/* heeeelp */
 1780: 
 1781: 		needsbuffer |= flags;
 1782: 		while (needsbuffer & flags) {
 1783: 			if (tsleep(&needsbuffer, slpflag, waitmsg, slptimeo))
 1784: 				return (NULL);
 1785: 		}
 1786: 	} else {
 1787: 		/*
 1788: 		 * We finally have a valid bp.  We aren't quite out of the
 1789: 		 * woods, we still have to reserve kva space.  In order
 1790: 		 * to keep fragmentation sane we only allocate kva in
 1791: 		 * BKVASIZE chunks.
 1792: 		 */
 1793: 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
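		/*
		 * Illustrative note (not in the original source): assuming
		 * the stock BKVASIZE of 16384 (BKVAMASK == 0x3fff), a
		 * requested maxsize of 20000 rounds up here to 32768, i.e.
		 * two whole BKVASIZE chunks of kva.
		 */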
 1794: 
 1795: 		if (maxsize != bp->b_kvasize) {
 1796: 			vm_offset_t addr = 0;
 1797: 			int count;
 1798: 
 1799: 			bfreekva(bp);
 1800: 
 1801: 			count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 1802: 			vm_map_lock(buffer_map);
 1803: 
 1804: 			if (vm_map_findspace(buffer_map,
 1805: 				    vm_map_min(buffer_map), maxsize,
 1806: 				    maxsize, &addr)) {
 1807: 				/*
 1808: 				 * Uh oh.  Buffer map is too fragmented.  We
 1809: 				 * must defragment the map.
 1810: 				 */
 1811: 				vm_map_unlock(buffer_map);
 1812: 				vm_map_entry_release(count);
 1813: 				++bufdefragcnt;
 1814: 				defrag = 1;
 1815: 				bp->b_flags |= B_INVAL;
 1816: 				brelse(bp);
 1817: 				goto restart;
 1818: 			}
 1819: 			if (addr) {
 1820: 				vm_map_insert(buffer_map, &count,
 1821: 					NULL, 0,
 1822: 					addr, addr + maxsize,
 1823: 					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 1824: 
 1825: 				bp->b_kvabase = (caddr_t) addr;
 1826: 				bp->b_kvasize = maxsize;
 1827: 				bufspace += bp->b_kvasize;
 1828: 				++bufreusecnt;
 1829: 			}
 1830: 			vm_map_unlock(buffer_map);
 1831: 			vm_map_entry_release(count);
 1832: 		}
 1833: 		bp->b_data = bp->b_kvabase;
 1834: 	}
 1835: 	return(bp);
 1836: }
 1837: 
 1838: /*
 1839:  *	buf_daemon:
 1840:  *
 1841:  *	buffer flushing daemon.  Buffers are normally flushed by the
 1842:  *	update daemon but if it cannot keep up this process starts to
 1843:  *	take the load in an attempt to prevent getnewbuf() from blocking.
 1844:  */
 1845: 
 1846: static struct thread *bufdaemonthread;
 1847: 
 1848: static struct kproc_desc buf_kp = {
 1849: 	"bufdaemon",
 1850: 	buf_daemon,
 1851: 	&bufdaemonthread
 1852: };
 1853: SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
 1854: 
 1855: static void
 1856: buf_daemon(void)
 1857: {
 1858: 	int s;
 1859: 
 1860: 	/*
 1861: 	 * This process needs to be suspended prior to shutdown sync.
 1862: 	 */
 1863: 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
 1864: 	    bufdaemonthread, SHUTDOWN_PRI_LAST);
 1865: 
 1866: 	/*
 1867: 	 * This process is allowed to take the buffer cache to the limit
 1868: 	 */
 1869: 	s = splbio();
 1870: 
 1871: 	for (;;) {
 1872: 		kproc_suspend_loop();
 1873: 
 1874: 		/*
 1875: 		 * Do the flush.  Limit the amount of in-transit I/O we
 1876: 		 * allow to build up, otherwise we would completely saturate
 1877: 		 * the I/O system.  Wakeup any waiting processes before we
 1878: 		 * normally would so they can run in parallel with our drain.
 1879: 		 */
 1880: 		while (numdirtybuffers > lodirtybuffers) {
 1881: 			if (flushbufqueues() == 0)
 1882: 				break;
 1883: 			waitrunningbufspace();
 1884: 			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 1885: 		}
 1886: 
 1887: 		/*
 1888: 		 * Only clear bd_request if we have reached our low water
 1889: 		 * mark.  The buf_daemon normally waits a second and
 1890: 		 * then incrementally flushes any dirty buffers that have
 1891: 		 * built up, within reason.
 1892: 		 *
 1893: 		 * If we were unable to hit our low water mark and couldn't
 1894: 		 * find any flushable buffers, we sleep half a second. 
 1895: 		 * Otherwise we loop immediately.
 1896: 		 */
 1897: 		if (numdirtybuffers <= lodirtybuffers) {
 1898: 			/*
 1899: 			 * We reached our low water mark, reset the
 1900: 			 * request and sleep until we are needed again.
 1901: 			 * The sleep is just so the suspend code works.
 1902: 			 */
 1903: 			bd_request = 0;
 1904: 			tsleep(&bd_request, 0, "psleep", hz);
 1905: 		} else {
 1906: 			/*
 1907: 			 * We couldn't find any flushable dirty buffers but
 1908: 			 * still have too many dirty buffers, we
 1909: 			 * have to sleep and try again.  (rare)
 1910: 			 */
 1911: 			tsleep(&bd_request, 0, "qsleep", hz / 2);
 1912: 		}
 1913: 	}
 1914: }
 1915: 
 1916: /*
 1917:  *	flushbufqueues:
 1918:  *
 1919:  *	Try to flush a buffer in the dirty queue.  We must be careful to
 1920:  *	free up B_INVAL buffers instead of writing them, which NFS is
 1921:  *	particularly sensitive to.
 1922:  */
 1923: 
 1924: static int
 1925: flushbufqueues(void)
 1926: {
 1927: 	struct buf *bp;
 1928: 	int r = 0;
 1929: 
 1930: 	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 1931: 
 1932: 	while (bp) {
 1933: 		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
 1934: 		if ((bp->b_flags & B_DELWRI) != 0 &&
 1935: 		    (bp->b_xflags & BX_BKGRDINPROG) == 0) {
 1936: 			if (bp->b_flags & B_INVAL) {
 1937: 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 1938: 					panic("flushbufqueues: locked buf");
 1939: 				bremfree(bp);
 1940: 				brelse(bp);
 1941: 				++r;
 1942: 				break;
 1943: 			}
 1944: 			if (LIST_FIRST(&bp->b_dep) != NULL &&
 1945: 			    bioops.io_countdeps &&
 1946: 			    (bp->b_flags & B_DEFERRED) == 0 &&
 1947: 			    (*bioops.io_countdeps)(bp, 0)) {
 1948: 				TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
 1949: 				    bp, b_freelist);
 1950: 				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
 1951: 				    bp, b_freelist);
 1952: 				bp->b_flags |= B_DEFERRED;
 1953: 				bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 1954: 				continue;
 1955: 			}
 1956: 			vfs_bio_awrite(bp);
 1957: 			++r;
 1958: 			break;
 1959: 		}
 1960: 		bp = TAILQ_NEXT(bp, b_freelist);
 1961: 	}
 1962: 	return (r);
 1963: }
 1964: 
 1965: /*
 1966:  * Check to see if a block is currently memory resident.
 1967:  */
 1968: struct buf *
 1969: incore(struct vnode * vp, daddr_t blkno)
 1970: {
 1971: 	struct buf *bp;
 1972: 
 1973: 	int s = splbio();
 1974: 	bp = gbincore(vp, blkno);
 1975: 	splx(s);
 1976: 	return (bp);
 1977: }
 1978: 
 1979: /*
 1980:  * Returns true if no I/O is needed to access the associated VM object.
 1981:  * This is like incore except it also hunts around in the VM system for
 1982:  * the data.
 1983:  *
 1984:  * Note that we ignore vm_page_free() races from interrupts against our
 1985:  * lookup, since if the caller is not protected our return value will not
 1986:  * be any more valid than otherwise once we splx().
 1987:  */
 1988: int
 1989: inmem(struct vnode * vp, daddr_t blkno)
 1990: {
 1991: 	vm_object_t obj;
 1992: 	vm_offset_t toff, tinc, size;
 1993: 	vm_page_t m;
 1994: 	vm_ooffset_t off;
 1995: 
 1996: 	if (incore(vp, blkno))
 1997: 		return 1;
 1998: 	if (vp->v_mount == NULL)
 1999: 		return 0;
 2000: 	if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
 2001:  		return 0;
 2002: 
 2003: 	size = PAGE_SIZE;
 2004: 	if (size > vp->v_mount->mnt_stat.f_iosize)
 2005: 		size = vp->v_mount->mnt_stat.f_iosize;
 2006: 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 2007: 
 2008: 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 2009: 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 2010: 		if (!m)
 2011: 			return 0;
 2012: 		tinc = size;
 2013: 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
 2014: 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 2015: 		if (vm_page_is_valid(m,
 2016: 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
 2017: 			return 0;
 2018: 	}
 2019: 	return 1;
 2020: }
 2021: 
 2022: /*
 2023:  *	vfs_setdirty:
 2024:  *
 2025:  *	Sets the dirty range for a buffer based on the status of the dirty
 2026:  *	bits in the pages comprising the buffer.
 2027:  *
 2028:  *	The range is limited to the size of the buffer.
 2029:  *
 2030:  *	This routine is primarily used by NFS, but is generalized for the
 2031:  *	B_VMIO case.
 2032:  */
 2033: static void
 2034: vfs_setdirty(struct buf *bp) 
 2035: {
 2036: 	int i;
 2037: 	vm_object_t object;
 2038: 
 2039: 	/*
 2040: 	 * Degenerate case - empty buffer
 2041: 	 */
 2042: 
 2043: 	if (bp->b_bufsize == 0)
 2044: 		return;
 2045: 
 2046: 	/*
 2047: 	 * We qualify the scan for modified pages on whether the
 2048: 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
 2049: 	 * is not cleared simply by protecting pages off.
 2050: 	 */
 2051: 
 2052: 	if ((bp->b_flags & B_VMIO) == 0)
 2053: 		return;
 2054: 
 2055: 	object = bp->b_pages[0]->object;
 2056: 
 2057: 	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
 2058: 		printf("Warning: object %p writeable but not mightbedirty\n", object);
 2059: 	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
 2060: 		printf("Warning: object %p mightbedirty but not writeable\n", object);
 2061: 
 2062: 	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
 2063: 		vm_offset_t boffset;
 2064: 		vm_offset_t eoffset;
 2065: 
 2066: 		/*
 2067: 		 * test the pages to see if they have been modified directly
 2068: 		 * by users through the VM system.
 2069: 		 */
 2070: 		for (i = 0; i < bp->b_npages; i++) {
 2071: 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
 2072: 			vm_page_test_dirty(bp->b_pages[i]);
 2073: 		}
 2074: 
 2075: 		/*
 2076: 		 * Calculate the encompassing dirty range, boffset and eoffset,
 2077: 		 * (eoffset - boffset) bytes.
 2078: 		 */
 2079: 
 2080: 		for (i = 0; i < bp->b_npages; i++) {
 2081: 			if (bp->b_pages[i]->dirty)
 2082: 				break;
 2083: 		}
 2084: 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 2085: 
 2086: 		for (i = bp->b_npages - 1; i >= 0; --i) {
 2087: 			if (bp->b_pages[i]->dirty) {
 2088: 				break;
 2089: 			}
 2090: 		}
 2091: 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 2092: 
 2093: 		/*
 2094: 		 * Fit it to the buffer.
 2095: 		 */
 2096: 
 2097: 		if (eoffset > bp->b_bcount)
 2098: 			eoffset = bp->b_bcount;
 2099: 
 2100: 		/*
 2101: 		 * If we have a good dirty range, merge with the existing
 2102: 		 * dirty range.
 2103: 		 */
 2104: 
 2105: 		if (boffset < eoffset) {
 2106: 			if (bp->b_dirtyoff > boffset)
 2107: 				bp->b_dirtyoff = boffset;
 2108: 			if (bp->b_dirtyend < eoffset)
 2109: 				bp->b_dirtyend = eoffset;
 2110: 		}
 2111: 	}
 2112: }
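
/*
 * Illustrative example of the range computation above (not part of the
 * original source; assumes PAGE_SIZE = 4096): if b_offset is 0x200 bytes
 * into its first page, b_npages is 4, and only page index 2 tests dirty,
 * the two scans yield boffset = (2 << PAGE_SHIFT) - 0x200 = 7680 and
 * eoffset = (3 << PAGE_SHIFT) - 0x200 = 11776, i.e. exactly the portion of
 * the buffer overlapping the dirty page, clamped to b_bcount.
 */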
 2113: 
 2114: /*
 2115:  *	getblk:
 2116:  *
 2117:  *	Get a block given a specified block and offset into a file/device.
 2118:  *	The buffers B_DONE bit will be cleared on return, making it almost
 2119:  *	The buffer's B_DONE bit will be cleared on return, making it almost
 2120:  *	return.  The caller should clear B_INVAL prior to initiating a
 2121:  *	READ.
 2122:  *
 2123:  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 2124:  *	an existing buffer.
 2125:  *
 2126:  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
 2127:  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 2128:  *	and then cleared based on the backing VM.  If the previous buffer is
 2129:  *	non-0-sized but invalid, B_CACHE will be cleared.
 2130:  *
 2131:  *	If getblk() must create a new buffer, the new buffer is returned with
 2132:  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 2133:  *	case it is returned with B_INVAL clear and B_CACHE set based on the
 2134:  *	backing VM.
 2135:  *
 2136:  *	getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
 2137:  *	B_CACHE bit is clear.
 2138:  *	
 2139:  *	What this means, basically, is that the caller should use B_CACHE to
 2140:  *	determine whether the buffer is fully valid or not and should clear
 2141:  *	B_INVAL prior to issuing a read.  If the caller intends to validate
 2142:  *	the buffer by loading its data area with something, the caller needs
 2143:  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
 2144:  *	the caller should set B_CACHE ( as an optimization ), else the caller
 2145:  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
 2146:  *	a write attempt or if it was a successful read.  If the caller
 2147:  *	intends to issue a READ, the caller must clear B_INVAL and B_ERROR
 2148:  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 2149:  */
 2150: struct buf *
 2151: getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
 2152: {
 2153: 	struct buf *bp;
 2154: 	int s;
 2155: 	struct bufhashhdr *bh;
 2156: 
 2157: 	if (size > MAXBSIZE)
 2158: 		panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
 2159: 
 2160: 	s = splbio();
 2161: loop:
 2162: 	/*
 2163: 	 * Block if we are low on buffers.   Certain processes are allowed
 2164: 	 * to completely exhaust the buffer cache.
 2165: 	 *
 2166: 	 * If this check ever becomes a bottleneck it may be better to
 2167: 	 * move it into the else, when gbincore() fails.  At the moment
 2168: 	 * it isn't a problem.
 2169: 	 *
 2170: 	 * XXX remove, we cannot afford to block anywhere if holding a vnode
 2171: 	 * lock in a low-memory situation, so take it to the max.
 2172: 	 */
 2173: 	if (numfreebuffers == 0) {
 2174: 		if (!curproc)
 2175: 			return NULL;
 2176: 		needsbuffer |= VFS_BIO_NEED_ANY;
 2177: 		tsleep(&needsbuffer, slpflag, "newbuf", slptimeo);
 2178: 	}
 2179: 
 2180: 	if ((bp = gbincore(vp, blkno))) {
 2181: 		/*
 2182: 		 * Buffer is in-core.  If the buffer is not busy, it must
 2183: 		 * be on a queue.
 2184: 		 */
 2185: 
 2186: 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 2187: 			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
 2188: 			    "getblk", slpflag, slptimeo) == ENOLCK)
 2189: 				goto loop;
 2190: 			splx(s);
 2191: 			return (struct buf *) NULL;
 2192: 		}
 2193: 
 2194: 		/*
 2195: 		 * The buffer is locked.  B_CACHE is cleared if the buffer is 
 2196: 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 2197: 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 2198: 		 * backing VM cache.
 2199: 		 */
 2200: 		if (bp->b_flags & B_INVAL)
 2201: 			bp->b_flags &= ~B_CACHE;
 2202: 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 2203: 			bp->b_flags |= B_CACHE;
 2204: 		bremfree(bp);
 2205: 
 2206: 		/*
 2207: 		 * check for size inconsistencies for the non-VMIO case.
 2208: 		 */
 2209: 
 2210: 		if (bp->b_bcount != size) {
 2211: 			if ((bp->b_flags & B_VMIO) == 0 ||
 2212: 			    (size > bp->b_kvasize)) {
 2213: 				if (bp->b_flags & B_DELWRI) {
 2214: 					bp->b_flags |= B_NOCACHE;
 2215: 					VOP_BWRITE(bp->b_vp, bp);
 2216: 				} else {
 2217: 					if ((bp->b_flags & B_VMIO) &&
 2218: 					   (LIST_FIRST(&bp->b_dep) == NULL)) {
 2219: 						bp->b_flags |= B_RELBUF;
 2220: 						brelse(bp);
 2221: 					} else {
 2222: 						bp->b_flags |= B_NOCACHE;
 2223: 						VOP_BWRITE(bp->b_vp, bp);
 2224: 					}
 2225: 				}
 2226: 				goto loop;
 2227: 			}
 2228: 		}
 2229: 
 2230: 		/*
 2231: 		 * If the size is inconsistent in the VMIO case, we can resize
 2232: 		 * the buffer.  This might lead to B_CACHE getting set or
 2233: 		 * cleared.  If the size has not changed, B_CACHE remains
 2234: 		 * unchanged from its previous state.
 2235: 		 */
 2236: 
 2237: 		if (bp->b_bcount != size)
 2238: 			allocbuf(bp, size);
 2239: 
 2240: 		KASSERT(bp->b_offset != NOOFFSET, 
 2241: 		    ("getblk: no buffer offset"));
 2242: 
 2243: 		/*
 2244: 		 * A buffer with B_DELWRI set and B_CACHE clear must
 2245: 		 * be committed before we can return the buffer in
 2246: 		 * order to prevent the caller from issuing a read
 2247: 		 * ( due to B_CACHE not being set ) and overwriting
 2248: 		 * it.
 2249: 		 *
 2250: 		 * Most callers, including NFS and FFS, need this to
 2251: 		 * operate properly either because they assume they
 2252: 		 * can issue a read if B_CACHE is not set, or because
 2253: 		 * ( for example ) an uncached B_DELWRI might loop due 
 2254: 		 * to softupdates re-dirtying the buffer.  In the latter
 2255: 		 * case, B_CACHE is set after the first write completes,
 2256: 		 * preventing further loops.
 2257: 		 *
 2258: 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 2259: 		 * above while extending the buffer, we cannot allow the
 2260: 		 * buffer to remain with B_CACHE set after the write
 2261: 		 * completes or it will represent a corrupt state.  To
 2262: 		 * deal with this we set B_NOCACHE to scrap the buffer
 2263: 		 * after the write.
 2264: 		 *
 2265: 		 * We might be able to do something fancy, like setting
 2266: 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 2267: 		 * so the below call doesn't set B_CACHE, but that gets real
 2268: 		 * confusing.  This is much easier.
 2269: 		 */
 2270: 
 2271: 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 2272: 			bp->b_flags |= B_NOCACHE;
 2273: 			VOP_BWRITE(bp->b_vp, bp);
 2274: 			goto loop;
 2275: 		}
 2276: 
 2277: 		splx(s);
 2278: 		bp->b_flags &= ~B_DONE;
 2279: 	} else {
 2280: 		/*
 2281: 		 * Buffer is not in-core, create new buffer.  The buffer
 2282: 		 * returned by getnewbuf() is locked.  Note that the returned
 2283: 		 * buffer is also considered valid (not marked B_INVAL).
 2284: 		 */
 2285: 		int bsize, maxsize, vmio;
 2286: 		off_t offset;
 2287: 
 2288: 		if (vn_isdisk(vp, NULL))
 2289: 			bsize = DEV_BSIZE;
 2290: 		else if (vp->v_mountedhere)
 2291: 			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
 2292: 		else if (vp->v_mount)
 2293: 			bsize = vp->v_mount->mnt_stat.f_iosize;
 2294: 		else
 2295: 			bsize = size;
 2296: 
 2297: 		offset = (off_t)blkno * bsize;
 2298: 		vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
 2299: 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
 2300: 		maxsize = imax(maxsize, bsize);
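		/*
		 * Illustrative example (not in the original source; assumes
		 * PAGE_SIZE = 4096): on a filesystem with a 1024 byte
		 * f_iosize, blkno 3 gives offset 3072, so a VMIO request for
		 * size 1024 needs maxsize = 1024 + (3072 & PAGE_MASK) = 4096
		 * of kva to cover the page-offset lead-in of the mapping.
		 */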
 2301: 
 2302: 		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
 2303: 			if (slpflag || slptimeo) {
 2304: 				splx(s);
 2305: 				return NULL;
 2306: 			}
 2307: 			goto loop;
 2308: 		}
 2309: 
 2310: 		/*
 2311: 		 * This code is used to make sure that a buffer is not
 2312: 		 * created while the getnewbuf routine is blocked.
 2313: 		 * This can be a problem whether the vnode is locked or not.
 2314: 		 * If the buffer is created out from under us, we have to
 2315: 		 * throw away the one we just created.  There is no window
 2316: 		 * race because we are safely running at splbio() from the
 2317: 		 * point of the duplicate buffer creation through to here,
 2318: 		 * and we've locked the buffer.
 2319: 		 */
 2320: 		if (gbincore(vp, blkno)) {
 2321: 			bp->b_flags |= B_INVAL;
 2322: 			brelse(bp);
 2323: 			goto loop;
 2324: 		}
 2325: 
 2326: 		/*
 2327: 		 * Insert the buffer into the hash, so that it can
 2328: 		 * be found by incore.
 2329: 		 */
 2330: 		bp->b_blkno = bp->b_lblkno = blkno;
 2331: 		bp->b_offset = offset;
 2332: 
 2333: 		bgetvp(vp, bp);
 2334: 		LIST_REMOVE(bp, b_hash);
 2335: 		bh = bufhash(vp, blkno);
 2336: 		LIST_INSERT_HEAD(bh, bp, b_hash);
 2337: 
 2338: 		/*
 2339: 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 2340: 		 * buffer size starts out as 0, B_CACHE will be set by
 2341: 		 * allocbuf() for the VMIO case prior to it testing the
 2342: 		 * backing store for validity.
 2343: 		 */
 2344: 
 2345: 		if (vmio) {
 2346: 			bp->b_flags |= B_VMIO;
 2347: #if defined(VFS_BIO_DEBUG)
 2348: 			if (vn_canvmio(vp) != TRUE)
 2349: 				printf("getblk: vmioing file type %d???\n", vp->v_type);
 2350: #endif
 2351: 		} else {
 2352: 			bp->b_flags &= ~B_VMIO;
 2353: 		}
 2354: 
 2355: 		allocbuf(bp, size);
 2356: 
 2357: 		splx(s);
 2358: 		bp->b_flags &= ~B_DONE;
 2359: 	}
 2360: 	return (bp);
 2361: }
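
/*
 * Illustrative sketch only (not part of the original file and guarded out
 * of the build): a minimal read path following the getblk() caller contract
 * described in the comment above getblk(), similar in spirit to the
 * bread()/breadn() path.  The name example_read_block() is a hypothetical
 * helper for illustration; a caller would still brelse()/bqrelse() the
 * returned buffer when done with it.
 */
#if 0
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;
	int error = 0;

	bp = getblk(vp, blkno, size, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		/*
		 * Data not fully valid: clear B_INVAL/B_ERROR, mark the
		 * buffer as a read, and issue the I/O.  biodone() sets
		 * B_CACHE on a successful read.
		 */
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_INVAL | B_ERROR);
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		error = biowait(bp);
	}
	*bpp = bp;
	return (error);
}
#endif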
 2362: 
 2363: /*
 2364:  * Get an empty, disassociated buffer of given size.  The buffer is initially
 2365:  * set to B_INVAL.
 2366:  *
 2367:  * spl protection is not required for the allocbuf() call because races are
 2368:  * impossible here.
 2369:  */
 2370: struct buf *
 2371: geteblk(int size)
 2372: {
 2373: 	struct buf *bp;
 2374: 	int s;
 2375: 	int maxsize;
 2376: 
 2377: 	maxsize = (size + BKVAMASK) & ~BKVAMASK;
 2378: 
 2379: 	s = splbio();
 2380: 	while ((bp = getnewbuf(0, 0, size, maxsize)) == NULL)
		;
 2381: 	splx(s);
 2382: 	allocbuf(bp, size);
 2383: 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 2384: 	return (bp);
 2385: }
 2386: 
 2387: 
 2388: /*
 2389:  * This code constitutes the buffer memory from either anonymous system
 2390:  * memory (in the case of non-VMIO operations) or from an associated
 2391:  * VM object (in the case of VMIO operations).  This code is able to
 2392:  * resize a buffer up or down.
 2393:  *
 2394:  * Note that this code is tricky, and has many complications to resolve
 2395:  * deadlock or inconsistent data situations.  Tread lightly!!!
 2396:  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
 2397:  * the caller.  Calling this code willy nilly can result in the loss of data.
 2398:  *
 2399:  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 2400:  * B_CACHE for the non-VMIO case.
 2401:  *
 2402:  * This routine does not need to be called at splbio() but you must own the
 2403:  * buffer.
 2404:  */
 2405: int
 2406: allocbuf(struct buf *bp, int size)
 2407: {
 2408: 	int newbsize, mbsize;
 2409: 	int i;
 2410: 
 2411: 	if (BUF_REFCNT(bp) == 0)
 2412: 		panic("allocbuf: buffer not busy");
 2413: 
 2414: 	if (bp->b_kvasize < size)
 2415: 		panic("allocbuf: buffer too small");
 2416: 
 2417: 	if ((bp->b_flags & B_VMIO) == 0) {
 2418: 		caddr_t origbuf;
 2419: 		int origbufsize;
 2420: 		/*
 2421: 		 * Just get anonymous memory from the kernel.  Don't
 2422: 		 * mess with B_CACHE.
 2423: 		 */
 2424: 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 2425: #if !defined(NO_B_MALLOC)
 2426: 		if (bp->b_flags & B_MALLOC)
 2427: 			newbsize = mbsize;
 2428: 		else
 2429: #endif
 2430: 			newbsize = round_page(size);
 2431: 
 2432: 		if (newbsize < bp->b_bufsize) {
 2433: #if !defined(NO_B_MALLOC)
 2434: 			/*
 2435: 			 * malloced buffers are not shrunk
 2436: 			 */
 2437: 			if (bp->b_flags & B_MALLOC) {
 2438: 				if (newbsize) {
 2439: 					bp->b_bcount = size;
 2440: 				} else {
 2441: 					free(bp->b_data, M_BIOBUF);
 2442: 					if (bp->b_bufsize) {
 2443: 						bufmallocspace -= bp->b_bufsize;
 2444: 						bufspacewakeup();
 2445: 						bp->b_bufsize = 0;
 2446: 					}
 2447: 					bp->b_data = bp->b_kvabase;
 2448: 					bp->b_bcount = 0;
 2449: 					bp->b_flags &= ~B_MALLOC;
 2450: 				}
 2451: 				return 1;
 2452: 			}		
 2453: #endif
 2454: 			vm_hold_free_pages(
 2455: 			    bp,
 2456: 			    (vm_offset_t) bp->b_data + newbsize,
 2457: 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
 2458: 		} else if (newbsize > bp->b_bufsize) {
 2459: #if !defined(NO_B_MALLOC)
 2460: 			/*
 2461: 			 * We only use malloced memory on the first allocation,
 2462: 			 * and revert to page-allocated memory when the buffer
 2463: 			 * grows.
 2464: 			 */
 2465: 			if ( (bufmallocspace < maxbufmallocspace) &&
 2466: 				(bp->b_bufsize == 0) &&
 2467: 				(mbsize <= PAGE_SIZE/2)) {
 2468: 
 2469: 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
 2470: 				bp->b_bufsize = mbsize;
 2471: 				bp->b_bcount = size;
 2472: 				bp->b_flags |= B_MALLOC;
 2473: 				bufmallocspace += mbsize;
 2474: 				return 1;
 2475: 			}
 2476: #endif
 2477: 			origbuf = NULL;
 2478: 			origbufsize = 0;
 2479: #if !defined(NO_B_MALLOC)
 2480: 			/*
 2481: 			 * If the buffer is growing on its other-than-first allocation,
 2482: 			 * then we revert to the page-allocation scheme.
 2483: 			 */
 2484: 			if (bp->b_flags & B_MALLOC) {
 2485: 				origbuf = bp->b_data;
 2486: 				origbufsize = bp->b_bufsize;
 2487: 				bp->b_data = bp->b_kvabase;
 2488: 				if (bp->b_bufsize) {
 2489: 					bufmallocspace -= bp->b_bufsize;
 2490: 					bufspacewakeup();
 2491: 					bp->b_bufsize = 0;
 2492: 				}
 2493: 				bp->b_flags &= ~B_MALLOC;
 2494: 				newbsize = round_page(newbsize);
 2495: 			}
 2496: #endif
 2497: 			vm_hold_load_pages(
 2498: 			    bp,
 2499: 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
 2500: 			    (vm_offset_t) bp->b_data + newbsize);
 2501: #if !defined(NO_B_MALLOC)
 2502: 			if (origbuf) {
 2503: 				bcopy(origbuf, bp->b_data, origbufsize);
 2504: 				free(origbuf, M_BIOBUF);
 2505: 			}
 2506: #endif
 2507: 		}
 2508: 	} else {
 2509: 		vm_page_t m;
 2510: 		int desiredpages;
 2511: 
 2512: 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 2513: 		desiredpages = (size == 0) ? 0 :
 2514: 			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
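		/*
		 * Illustrative example (not in the original source; assumes
		 * PAGE_SIZE = 4096, DEV_BSIZE = 512): a request for
		 * size = 3000 rounds newbsize up to 3072; with b_offset
		 * 0x200 bytes into its first page, desiredpages =
		 * num_pages(512 + 3072) = 1.
		 */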
 2515: 
 2516: #if !defined(NO_B_MALLOC)
 2517: 		if (bp->b_flags & B_MALLOC)
 2518: 			panic("allocbuf: VMIO buffer can't be malloced");
 2519: #endif
 2520: 		/*
 2521: 		 * Set B_CACHE initially if buffer is 0 length or will become
 2522: 		 * 0-length.
 2523: 		 */
 2524: 		if (size == 0 || bp->b_bufsize == 0)
 2525: 			bp->b_flags |= B_CACHE;
 2526: 
 2527: 		if (newbsize < bp->b_bufsize) {
 2528: 			/*
 2529: 			 * DEV_BSIZE aligned new buffer size is less than the
 2530: 			 * DEV_BSIZE aligned existing buffer size.  Figure out
 2531: 			 * if we have to remove any pages.
 2532: 			 */
 2533: 			if (desiredpages < bp->b_npages) {
 2534: 				for (i = desiredpages; i < bp->b_npages; i++) {
 2535: 					/*
 2536: 					 * the page is not freed here -- it
 2537: 					 * is the responsibility of 
 2538: 					 * vnode_pager_setsize
 2539: 					 */
 2540: 					m = bp->b_pages[i];
 2541: 					KASSERT(m != bogus_page,
 2542: 					    ("allocbuf: bogus page found"));
 2543: 					while (vm_page_sleep_busy(m, TRUE, "biodep"))
 2544: 						;
 2545: 
 2546: 					bp->b_pages[i] = NULL;
 2547: 					vm_page_unwire(m, 0);
 2548: 				}
 2549: 				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
 2550: 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
 2551: 				bp->b_npages = desiredpages;
 2552: 			}
 2553: 		} else if (size > bp->b_bcount) {
 2554: 			/*
 2555: 			 * We are growing the buffer, possibly in a 
 2556: 			 * byte-granular fashion.
 2557: 			 */
 2558: 			struct vnode *vp;
 2559: 			vm_object_t obj;
 2560: 			vm_offset_t toff;
 2561: 			vm_offset_t tinc;
 2562: 			int s;
 2563: 
 2564: 			/*
 2565: 			 * Step 1, bring in the VM pages from the object, 
 2566: 			 * allocating them if necessary.  We must clear
 2567: 			 * B_CACHE if these pages are not valid for the 
 2568: 			 * range covered by the buffer.
 2569: 			 *
 2570: 			 * spl protection is required to protect against
 2571: 			 * interrupts unbusying and freeing pages between
 2572: 			 * our vm_page_lookup() and our busycheck/wiring
 2573: 			 * call.
 2574: 			 */
 2575: 			vp = bp->b_vp;
 2576: 			VOP_GETVOBJECT(vp, &obj);
 2577: 
 2578: 			s = splbio();
 2579: 			while (bp->b_npages < desiredpages) {
 2580: 				vm_page_t m;
 2581: 				vm_pindex_t pi;
 2582: 
 2583: 				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
 2584: 				if ((m = vm_page_lookup(obj, pi)) == NULL) {
 2585: 					/*
 2586: 					 * note: must allocate system pages
 2587: 					 * since blocking here could interfere
 2588: 					 * with paging I/O, no matter which
 2589: 					 * process we are.
 2590: 					 */
 2591: 					m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
 2592: 					if (m == NULL) {
 2593: 						vm_wait();
 2594: 						vm_pageout_deficit += desiredpages - bp->b_npages;
 2595: 					} else {
 2596: 						vm_page_wire(m);
 2597: 						vm_page_wakeup(m);
 2598: 						bp->b_flags &= ~B_CACHE;
 2599: 						bp->b_pages[bp->b_npages] = m;
 2600: 						++bp->b_npages;
 2601: 					}
 2602: 					continue;
 2603: 				}
 2604: 
 2605: 				/*
 2606: 				 * We found a page.  If we have to sleep on it,
 2607: 				 * retry because it might have gotten freed out
 2608: 				 * from under us.
 2609: 				 *
 2610: 				 * We can only test PG_BUSY here.  Blocking on
 2611: 				 * m->busy might lead to a deadlock:
 2612: 				 *
 2613: 				 *  vm_fault->getpages->cluster_read->allocbuf
 2614: 				 *
 2615: 				 */
 2616: 
 2617: 				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
 2618: 					continue;
 2619: 
 2620: 				/*
 2621: 				 * We have a good page.  Should we wakeup the
 2622: 				 * page daemon?
 2623: 				 */
 2624: 				if ((curthread != pagethread) &&
 2625: 				    ((m->queue - m->pc) == PQ_CACHE) &&
 2626: 				    ((vmstats.v_free_count + vmstats.v_cache_count) <
 2627: 					(vmstats.v_free_min + vmstats.v_cache_min))) {
 2628: 					pagedaemon_wakeup();
 2629: 				}
 2630: 				vm_page_flag_clear(m, PG_ZERO);
 2631: 				vm_page_wire(m);
 2632: 				bp->b_pages[bp->b_npages] = m;
 2633: 				++bp->b_npages;
 2634: 			}
 2635: 			splx(s);
 2636: 
 2637: 			/*
 2638: 			 * Step 2.  We've loaded the pages into the buffer;
 2639: 			 * now we have to figure out if we can still have B_CACHE
 2640: 			 * set.  Note that B_CACHE is set according to the
 2641: 			 * byte-granular range ( bcount and size ), not the
 2642: 			 * aligned range ( newbsize ).
 2643: 			 *
 2644: 			 * The VM test is against m->valid, which is DEV_BSIZE
 2645: 			 * aligned.  Needless to say, the validity of the data
 2646: 			 * needs to also be DEV_BSIZE aligned.  Note that this
 2647: 			 * fails with NFS if the server or some other client
 2648: 			 * extends the file's EOF.  If our buffer is resized, 
 2649: 			 * B_CACHE may remain set! XXX
 2650: 			 */
 2651: 
 2652: 			toff = bp->b_bcount;
 2653: 			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 2654: 
 2655: 			while ((bp->b_flags & B_CACHE) && toff < size) {
 2656: 				vm_pindex_t pi;
 2657: 
 2658: 				if (tinc > (size - toff))
 2659: 					tinc = size - toff;
 2660: 
 2661: 				pi = ((bp->b_offset & PAGE_MASK) + toff) >> 
 2662: 				    PAGE_SHIFT;
 2663: 
 2664: 				vfs_buf_test_cache(
 2665: 				    bp, 
 2666: 				    bp->b_offset,
 2667: 				    toff, 
 2668: 				    tinc, 
 2669: 				    bp->b_pages[pi]
 2670: 				);
 2671: 				toff += tinc;
 2672: 				tinc = PAGE_SIZE;
 2673: 			}
 2674: 
 2675: 			/*
 2676: 			 * Step 3, fixup the KVM pmap.  Remember that
 2677: 			 * bp->b_data is relative to bp->b_offset, but 
 2678: 			 * bp->b_offset may be offset into the first page.
 2679: 			 */
 2680: 
 2681: 			bp->b_data = (caddr_t)
 2682: 			    trunc_page((vm_offset_t)bp->b_data);
 2683: 			pmap_qenter(
 2684: 			    (vm_offset_t)bp->b_data,
 2685: 			    bp->b_pages, 
 2686: 			    bp->b_npages
 2687: 			);
 2688: 			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
 2689: 			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 2690: 		}
 2691: 	}
 2692: 	if (newbsize < bp->b_bufsize)
 2693: 		bufspacewakeup();
 2694: 	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
 2695: 	bp->b_bcount = size;		/* requested buffer size	*/
 2696: 	return 1;
 2697: }
 2698: 
 2699: /*
 2700:  *	biowait:
 2701:  *
 2702:  *	Wait for buffer I/O completion, returning error status.  The buffer
 2703:  *	is left locked and B_DONE on return.  B_EINTR is converted into a EINTR
 2704:  *	error and cleared.
 2705:  */
 2706: int
 2707: biowait(struct buf * bp)
 2708: {
 2709: 	int s;
 2710: 
 2711: 	s = splbio();
 2712: 	while ((bp->b_flags & B_DONE) == 0) {
 2713: #if defined(NO_SCHEDULE_MODS)
 2714: 		tsleep(bp, 0, "biowait", 0);
 2715: #else
 2716: 		if (bp->b_flags & B_READ)
 2717: 			tsleep(bp, 0, "biord", 0);
 2718: 		else
 2719: 			tsleep(bp, 0, "biowr", 0);
 2720: #endif
 2721: 	}
 2722: 	splx(s);
 2723: 	if (bp->b_flags & B_EINTR) {
 2724: 		bp->b_flags &= ~B_EINTR;
 2725: 		return (EINTR);
 2726: 	}
 2727: 	if (bp->b_flags & B_ERROR) {
 2728: 		return (bp->b_error ? bp->b_error : EIO);
 2729: 	} else {
 2730: 		return (0);
 2731: 	}
 2732: }
 2733: 
 2734: /*
 2735:  *	biodone:
 2736:  *
 2737:  *	Finish I/O on a buffer, optionally calling a completion function.
 2738:  *	This is usually called from an interrupt so process blocking is
 2739:  *	not allowed.
 2740:  *
 2741:  *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
 2742:  *	In a non-VMIO bp, B_CACHE will be set on the next getblk() 
 2743:  *	assuming B_INVAL is clear.
 2744:  *
 2745:  *	For the VMIO case, we set B_CACHE if the op was a read and no
 2746:  *	read error occurred, or if the op was a write.  B_CACHE is never
 2747:  *	set if the buffer is invalid or otherwise uncacheable.
 2748:  *
 2749:  *	biodone does not mess with B_INVAL, allowing the I/O routine or the
 2750:  *	initiator to leave B_INVAL set to brelse the buffer out of existence
 2751:  *	in the biodone routine.
 2752:  *
 2753:  *	b_dev is required to be reinitialized prior to the top level strategy
 2754:  *	call in a device stack.  To avoid improper reuse, biodone() sets
 2755:  *	b_dev to NODEV.
 2756:  */
 2757: void
 2758: biodone(struct buf *bp)
 2759: {
 2760: 	int s, error;
 2761: 
 2762: 	s = splbio();
 2763: 
 2764: 	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
 2765: 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 2766: 
 2767: 	bp->b_flags |= B_DONE;
 2768: 	bp->b_dev = NODEV;
 2769: 	runningbufwakeup(bp);
 2770: 
 2771: 	if (bp->b_flags & B_FREEBUF) {
 2772: 		brelse(bp);
 2773: 		splx(s);
 2774: 		return;
 2775: 	}
 2776: 
 2777: 	if ((bp->b_flags & B_READ) == 0) {
 2778: 		vwakeup(bp);
 2779: 	}
 2780: 
 2781: 	/* call optional completion function if requested */
 2782: 	if (bp->b_flags & B_CALL) {
 2783: 		bp->b_flags &= ~B_CALL;
 2784: 		(*bp->b_iodone) (bp);
 2785: 		splx(s);
 2786: 		return;
 2787: 	}
 2788: 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
 2789: 		(*bioops.io_complete)(bp);
 2790: 
 2791: 	if (bp->b_flags & B_VMIO) {
 2792: 		int i;
 2793: 		vm_ooffset_t foff;
 2794: 		vm_page_t m;
 2795: 		vm_object_t obj;
 2796: 		int iosize;
 2797: 		struct vnode *vp = bp->b_vp;
 2798: 
 2799: 		error = VOP_GETVOBJECT(vp, &obj);
 2800: 
 2801: #if defined(VFS_BIO_DEBUG)
 2802: 		if (vp->v_holdcnt == 0) {
 2803: 			panic("biodone: zero vnode hold count");
 2804: 		}
 2805: 
 2806: 		if (error) {
 2807: 			panic("biodone: missing VM object");
 2808: 		}
 2809: 
 2810: 		if ((vp->v_flag & VOBJBUF) == 0) {
 2811: 			panic("biodone: vnode is not setup for merged cache");
 2812: 		}
 2813: #endif
 2814: 
 2815: 		foff = bp->b_offset;
 2816: 		KASSERT(bp->b_offset != NOOFFSET,
 2817: 		    ("biodone: no buffer offset"));
 2818: 
 2819: 		if (error) {
 2820: 			panic("biodone: no object");
 2821: 		}
 2822: #if defined(VFS_BIO_DEBUG)
 2823: 		if (obj->paging_in_progress < bp->b_npages) {
 2824: 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
 2825: 			    obj->paging_in_progress, bp->b_npages);
 2826: 		}
 2827: #endif
 2828: 
 2829: 		/*
 2830: 		 * Set B_CACHE if the op was a normal read and no error
 2831: 		 * occurred.  B_CACHE is set for writes in the b*write()
 2832: 		 * routines.
 2833: 		 */
 2834: 		iosize = bp->b_bcount - bp->b_resid;
 2835: 		if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
 2836: 			bp->b_flags |= B_CACHE;
 2837: 		}
 2838: 
 2839: 		for (i = 0; i < bp->b_npages; i++) {
 2840: 			int bogusflag = 0;
 2841: 			int resid;
 2842: 
 2843: 			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 2844: 			if (resid > iosize)
 2845: 				resid = iosize;
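			/*
			 * Illustrative example (not in the original source;
			 * assumes PAGE_SIZE = 4096): if foff currently sits
			 * 0x200 bytes into a page, resid covers the remaining
			 * 3584 bytes of that page, clamped above to the bytes
			 * actually transferred.
			 */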
 2846: 
 2847: 			/*
 2848: 			 * cleanup bogus pages, restoring the originals.  Since
 2849: 			 * the originals should still be wired, we don't have
 2850: 			 * to worry about interrupt/freeing races destroying
 2851: 			 * the VM object association.
 2852: 			 */
 2853: 			m = bp->b_pages[i];
 2854: 			if (m == bogus_page) {
 2855: 				bogusflag = 1;
 2856: 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 2857: 				if (m == NULL)
 2858: 					panic("biodone: page disappeared");
 2859: 				bp->b_pages[i] = m;
 2860: 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 2861: 			}
 2862: #if defined(VFS_BIO_DEBUG)
 2863: 			if (OFF_TO_IDX(foff) != m->pindex) {
 2864: 				printf(
 2865: "biodone: foff(%lu)/m->pindex(%d) mismatch\n",
 2866: 				    (unsigned long)foff, m->pindex);
 2867: 			}
 2868: #endif
 2869: 
 2870: 			/*
 2871: 			 * In the write case, the valid and clean bits are
 2872: 			 * already changed correctly ( see bdwrite() ), so we 
 2873: 			 * only need to do this here in the read case.
 2874: 			 */
 2875: 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
 2876: 				vfs_page_set_valid(bp, foff, i, m);
 2877: 			}
 2878: 			vm_page_flag_clear(m, PG_ZERO);
 2879: 
 2880: 			/*
 2881: 			 * when debugging new filesystems or buffer I/O methods, this
 2882: 			 * is the most common error that pops up.  if you see this, you
 2883: 			 * have not set the page busy flag correctly!!!
 2884: 			 */
 2885: 			if (m->busy == 0) {
 2886: 				printf("biodone: page busy < 0, "
 2887: 				    "pindex: %d, foff: 0x(%x,%x), "
 2888: 				    "resid: %d, index: %d\n",
 2889: 				    (int) m->pindex, (int)(foff >> 32),
 2890: 						(int) foff & 0xffffffff, resid, i);
 2891: 				if (!vn_isdisk(vp, NULL))
 2892: 					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
 2893: 					    bp->b_vp->v_mount->mnt_stat.f_iosize,
 2894: 					    (int) bp->b_lblkno,
 2895: 					    bp->b_flags, bp->b_npages);
 2896: 				else
 2897: 					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
 2898: 					    (int) bp->b_lblkno,
 2899: 					    bp->b_flags, bp->b_npages);
 2900: 				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
 2901: 				    m->valid, m->dirty, m->wire_count);
 2902: 				panic("biodone: page busy < 0");
 2903: 			}
 2904: 			vm_page_io_finish(m);
 2905: 			vm_object_pip_subtract(obj, 1);
 2906: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 2907: 			iosize -= resid;
 2908: 		}
 2909: 		if (obj)
 2910: 			vm_object_pip_wakeupn(obj, 0);
 2911: 	}
 2912: 
 2913: 	/*
 2914: 	 * For asynchronous completions, release the buffer now. The brelse
 2915: 	 * will do a wakeup there if necessary - so no need to do a wakeup
 2916: 	 * here in the async case. The sync case always needs to do a wakeup.
 2917: 	 */
 2918: 
 2919: 	if (bp->b_flags & B_ASYNC) {
 2920: 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
 2921: 			brelse(bp);
 2922: 		else
 2923: 			bqrelse(bp);
 2924: 	} else {
 2925: 		wakeup(bp);
 2926: 	}
 2927: 	splx(s);
 2928: }
 2929: 
 2930: /*
 2931:  * This routine is called in lieu of iodone in the case of
 2932:  * incomplete I/O.  This keeps the busy status for pages
 2933:  * consistent.
 2934:  */
 2935: void
 2936: vfs_unbusy_pages(struct buf *bp)
 2937: {
 2938: 	int i;
 2939: 
 2940: 	runningbufwakeup(bp);
 2941: 	if (bp->b_flags & B_VMIO) {
 2942: 		struct vnode *vp = bp->b_vp;
 2943: 		vm_object_t obj;
 2944: 
 2945: 		VOP_GETVOBJECT(vp, &obj);
 2946: 
 2947: 		for (i = 0; i < bp->b_npages; i++) {
 2948: 			vm_page_t m = bp->b_pages[i];
 2949: 
 2950: 			/*
 2951: 			 * When restoring a page replaced by bogus_page, the
 2952: 			 * original page should still be wired, so we are in
 2953: 			 * no danger of losing the object association and do
 2954: 			 * not particularly need spl protection.
 2955: 			 */
 2956: 			if (m == bogus_page) {
 2957: 				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 2958: 				if (!m) {
 2959: 					panic("vfs_unbusy_pages: page missing");
 2960: 				}
 2961: 				bp->b_pages[i] = m;
 2962: 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 2963: 			}
 2964: 			vm_object_pip_subtract(obj, 1);
 2965: 			vm_page_flag_clear(m, PG_ZERO);
 2966: 			vm_page_io_finish(m);
 2967: 		}
 2968: 		vm_object_pip_wakeupn(obj, 0);
 2969: 	}
 2970: }
 2971: 
 2972: /*
 2973:  * vfs_page_set_valid:
 2974:  *
 2975:  *	Set the valid bits in a page based on the supplied offset.   The
 2976:  *	range is restricted to the buffer's size.
 2977:  *
 2978:  *	This routine is typically called after a read completes.
 2979:  */
 2980: static void
 2981: vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
 2982: {
 2983: 	vm_ooffset_t soff, eoff;
 2984: 
 2985: 	/*
 2986: 	 * Start and end offsets in buffer.  eoff - soff may not cross a
 2987: 	 * page boundary or cross the end of the buffer.  The end of the
 2988: 	 * buffer, in this case, is our file EOF, not the allocation size
 2989: 	 * of the buffer.
 2990: 	 */
 2991: 	soff = off;
 2992: 	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 2993: 	if (eoff > bp->b_offset + bp->b_bcount)
 2994: 		eoff = bp->b_offset + bp->b_bcount;
 2995: 
 2996: 	/*
 2997: 	 * Set valid range.  This is typically the entire buffer and thus the
 2998: 	 * entire page.
 2999: 	 */
 3000: 	if (eoff > soff) {
 3001: 		vm_page_set_validclean(
 3002: 		    m,
 3003: 		   (vm_offset_t) (soff & PAGE_MASK),
 3004: 		   (vm_offset_t) (eoff - soff)
 3005: 		);
 3006: 	}
 3007: }
 3008: 
 3009: /*
 3010:  * This routine is called before a device strategy routine.
 3011:  * It is used to tell the VM system that paging I/O is in
 3012:  * progress, and treat the pages associated with the buffer
 3013:  * almost as being PG_BUSY.  Also the object paging_in_progress
 3014:  * flag is handled to make sure that the object doesn't become
 3015:  * inconsistent.
 3016:  *
 3017:  * Since I/O has not been initiated yet, certain buffer flags
 3018:  * such as B_ERROR or B_INVAL may be in an inconsistent state
 3019:  * and should be ignored.
 3020:  */
 3021: void
 3022: vfs_busy_pages(struct buf *bp, int clear_modify)
 3023: {
 3024: 	int i, bogus;
 3025: 
 3026: 	if (bp->b_flags & B_VMIO) {
 3027: 		struct vnode *vp = bp->b_vp;
 3028: 		vm_object_t obj;
 3029: 		vm_ooffset_t foff;
 3030: 
 3031: 		VOP_GETVOBJECT(vp, &obj);
 3032: 		foff = bp->b_offset;
 3033: 		KASSERT(bp->b_offset != NOOFFSET,
 3034: 		    ("vfs_busy_pages: no buffer offset"));
 3035: 		vfs_setdirty(bp);
 3036: 
 3037: retry:
 3038: 		for (i = 0; i < bp->b_npages; i++) {
 3039: 			vm_page_t m = bp->b_pages[i];
 3040: 			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
 3041: 				goto retry;
 3042: 		}
 3043: 
 3044: 		bogus = 0;
 3045: 		for (i = 0; i < bp->b_npages; i++) {
 3046: 			vm_page_t m = bp->b_pages[i];
 3047: 
 3048: 			vm_page_flag_clear(m, PG_ZERO);
 3049: 			if ((bp->b_flags & B_CLUSTER) == 0) {
 3050: 				vm_object_pip_add(obj, 1);
 3051: 				vm_page_io_start(m);
 3052: 			}
 3053: 
 3054: 			/*
 3055: 			 * When readying a buffer for a read ( i.e.
 3056: 			 * clear_modify == 0 ), it is important to do
 3057: 			 * bogus_page replacement for valid pages in 
 3058: 			 * partially instantiated buffers.  Partially 
 3059: 			 * instantiated buffers can, in turn, occur when
 3060: 			 * reconstituting a buffer from its VM backing store
 3061: 			 * base.  We only have to do this if B_CACHE is
 3062: 			 * clear ( which causes the I/O to occur in the
 3063: 			 * first place ).  The replacement prevents the read
 3064: 			 * I/O from overwriting potentially dirty VM-backed
 3065: 			 * pages.  XXX bogus page replacement is, uh, bogus.
 3066: 			 * It may not work properly with small-block devices.
 3067: 			 * We need to find a better way.
 3068: 			 */
 3069: 
 3070: 			vm_page_protect(m, VM_PROT_NONE);
 3071: 			if (clear_modify)
 3072: 				vfs_page_set_valid(bp, foff, i, m);
 3073: 			else if (m->valid == VM_PAGE_BITS_ALL &&
 3074: 				(bp->b_flags & B_CACHE) == 0) {
 3075: 				bp->b_pages[i] = bogus_page;
 3076: 				bogus++;
 3077: 			}
 3078: 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3079: 		}
 3080: 		if (bogus)
 3081: 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 3082: 	}
 3083: 
 3084: 	/*
 3085: 	 * This is the easiest place to put the process accounting for the I/O
 3086: 	 * for now.
 3087: 	 */
 3088: 	{
 3089: 		struct proc *p;
 3090: 
 3091: 		if ((p = curthread->td_proc) != NULL) {
 3092: 			if (bp->b_flags & B_READ)
 3093: 				p->p_stats->p_ru.ru_inblock++;
 3094: 			else
 3095: 				p->p_stats->p_ru.ru_oublock++;
 3096: 		}
 3097: 	}
 3098: }
 3099: 
 3100: /*
 3101:  * Tell the VM system that the pages associated with this buffer
 3102:  * are clean.  This is used for delayed writes where the data is
 3103:  * going to go to disk eventually without additional VM intervention.
 3104:  *
 3105:  * Note that while we only really need to clean through to b_bcount, we
 3106:  * just go ahead and clean through to b_bufsize.
 3107:  */
 3108: static void
 3109: vfs_clean_pages(struct buf *bp)
 3110: {
 3111: 	int i;
 3112: 
 3113: 	if (bp->b_flags & B_VMIO) {
 3114: 		vm_ooffset_t foff;
 3115: 
 3116: 		foff = bp->b_offset;
 3117: 		KASSERT(bp->b_offset != NOOFFSET,
 3118: 		    ("vfs_clean_pages: no buffer offset"));
 3119: 		for (i = 0; i < bp->b_npages; i++) {
 3120: 			vm_page_t m = bp->b_pages[i];
 3121: 			vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3122: 			vm_ooffset_t eoff = noff;
 3123: 
 3124: 			if (eoff > bp->b_offset + bp->b_bufsize)
 3125: 				eoff = bp->b_offset + bp->b_bufsize;
 3126: 			vfs_page_set_valid(bp, foff, i, m);
 3127: 			/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
 3128: 			foff = noff;
 3129: 		}
 3130: 	}
 3131: }
 3132: 
 3133: /*
 3134:  *	vfs_bio_set_validclean:
 3135:  *
 3136:  *	Set the range within the buffer to valid and clean.  The range is 
 3137:  *	relative to the beginning of the buffer, b_offset.  Note that b_offset
 3138:  *	itself may be offset from the beginning of the first page.
 3139:  */
 3140: 
 3141: void   
 3142: vfs_bio_set_validclean(struct buf *bp, int base, int size)
 3143: {
 3144: 	if (bp->b_flags & B_VMIO) {
 3145: 		int i;
 3146: 		int n;
 3147: 
 3148: 		/*
 3149: 		 * Fixup base to be relative to beginning of first page.
 3150: 		 * Set initial n to be the maximum number of bytes in the
 3151: 		 * first page that can be validated.
 3152: 		 */
 3153: 
 3154: 		base += (bp->b_offset & PAGE_MASK);
 3155: 		n = PAGE_SIZE - (base & PAGE_MASK);
 3156: 
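		/*
		 * Illustrative example (not in the original source; assumes
		 * PAGE_SIZE = 4096): with b_offset 0x200 bytes into its first
		 * page and base = 0x600, the adjusted base is 0x800, so the
		 * first page can validate at most n = 4096 - 0x800 = 2048
		 * bytes and the loop below starts at page index 0.
		 */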
 3157: 		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 3158: 			vm_page_t m = bp->b_pages[i];
 3159: 
 3160: 			if (n > size)
 3161: 				n = size;
 3162: 
 3163: 			vm_page_set_validclean(m, base & PAGE_MASK, n);
 3164: 			base += n;
 3165: 			size -= n;
 3166: 			n = PAGE_SIZE;
 3167: 		}
 3168: 	}
 3169: }
 3170: 
 3171: /*
 3172:  *	vfs_bio_clrbuf:
 3173:  *
 3174:  *	clear a buffer.  This routine essentially fakes an I/O, so we need
 3175:  *	to clear B_ERROR and B_INVAL.
 3176:  *
 3177:  *	Note that while we only theoretically need to clear through b_bcount,
 3178:  *	we go ahead and clear through b_bufsize.
 3179:  */
 3180: 
 3181: void
 3182: vfs_bio_clrbuf(struct buf *bp)
 3183: {
 3184: 	int i, mask = 0;
 3185: 	caddr_t sa, ea;
 3186: 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
 3187: 		bp->b_flags &= ~(B_INVAL|B_ERROR);
 3188: 		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 3189: 		    (bp->b_offset & PAGE_MASK) == 0) {
 3190: 			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
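			/*
			 * Illustrative example (not in the original source;
			 * assumes DEV_BSIZE = 512): a 2048 byte buffer gives
			 * mask = (1 << 4) - 1 = 0x0f, i.e. the first four
			 * DEV_BSIZE chunks of the page.
			 */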
 3191: 			if ((bp->b_pages[0]->valid & mask) == mask) {
 3192: 				bp->b_resid = 0;
 3193: 				return;
 3194: 			}
 3195: 			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
 3196: 			    ((bp->b_pages[0]->valid & mask) == 0)) {
 3197: 				bzero(bp->b_data, bp->b_bufsize);
 3198: 				bp->b_pages[0]->valid |= mask;
 3199: 				bp->b_resid = 0;
 3200: 				return;
 3201: 			}
 3202: 		}
 3203: 		ea = sa = bp->b_data;
 3204: 		for (i = 0; i < bp->b_npages; i++, sa = ea) {
 3205: 			int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
 3206: 			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
 3207: 			ea = (caddr_t)(vm_offset_t)ulmin(
 3208: 			    (u_long)(vm_offset_t)ea,
 3209: 			    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
 3210: 			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 3211: 			if ((bp->b_pages[i]->valid & mask) == mask)
 3212: 				continue;
 3213: 			if ((bp->b_pages[i]->valid & mask) == 0) {
 3214: 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
 3215: 					bzero(sa, ea - sa);
 3216: 				}
 3217: 			} else {
 3218: 				for (; sa < ea; sa += DEV_BSIZE, j++) {
 3219: 					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
 3220: 						(bp->b_pages[i]->valid & (1<<j)) == 0)
 3221: 						bzero(sa, DEV_BSIZE);
 3222: 				}
 3223: 			}
 3224: 			bp->b_pages[i]->valid |= mask;
 3225: 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
 3226: 		}
 3227: 		bp->b_resid = 0;
 3228: 	} else {
 3229: 		clrbuf(bp);
 3230: 	}
 3231: }
 3232: 
 3233: /*
 3234:  * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
 3235:  * a buffer's address space.  The pages are anonymous and are
 3236:  * not associated with a file object.
 3237:  */
 3238: void
 3239: vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 3240: {
 3241: 	vm_offset_t pg;
 3242: 	vm_page_t p;
 3243: 	int index;
 3244: 
 3245: 	to = round_page(to);
 3246: 	from = round_page(from);
 3247: 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 3248: 
 3249: 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 3250: 
 3251: tryagain:
 3252: 
 3253: 		/*
 3254: 		 * note: must allocate system pages since blocking here
 3255: 		 * could interfere with paging I/O, no matter which
 3256: 		 * process we are.
 3257: 		 */
 3258: 		p = vm_page_alloc(kernel_object,
 3259: 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
 3260: 			VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
 3261: 		if (!p) {
 3262: 			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
 3263: 			vm_wait();
 3264: 			goto tryagain;
 3265: 		}
 3266: 		vm_page_wire(p);
 3267: 		p->valid = VM_PAGE_BITS_ALL;
 3268: 		vm_page_flag_clear(p, PG_ZERO);
 3269: 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
 3270: 		bp->b_pages[index] = p;
 3271: 		vm_page_wakeup(p);
 3272: 	}
 3273: 	bp->b_npages = index;
 3274: }
 3275: 
 3276: void
 3277: vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 3278: {
 3279: 	vm_offset_t pg;
 3280: 	vm_page_t p;
 3281: 	int index, newnpages;
 3282: 
 3283: 	from = round_page(from);
 3284: 	to = round_page(to);
 3285: 	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 3286: 
 3287: 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 3288: 		p = bp->b_pages[index];
 3289: 		if (p && (index < bp->b_npages)) {
 3290: 			if (p->busy) {
 3291: 				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
 3292: 					bp->b_blkno, bp->b_lblkno);
 3293: 			}
 3294: 			bp->b_pages[index] = NULL;
 3295: 			pmap_kremove(pg);
 3296: 			vm_page_busy(p);
 3297: 			vm_page_unwire(p, 0);
 3298: 			vm_page_free(p);
 3299: 		}
 3300: 	}
 3301: 	bp->b_npages = newnpages;
 3302: }
 3303: 
 3304: /*
 3305:  * Map an IO request into kernel virtual address space.
 3306:  *
 3307:  * All requests are (re)mapped into kernel VA space.
 3308:  * Notice that we use b_bufsize for the size of the buffer
 3309:  * to be mapped.  b_bcount might be modified by the driver.
 3310:  */
 3311: int
 3312: vmapbuf(struct buf *bp)
 3313: {
 3314: 	caddr_t addr, v, kva;
 3315: 	vm_paddr_t pa;
 3316: 	int pidx;
 3317: 	int i;
 3318: 	struct vm_page *m;
 3319: 
 3320: 	if ((bp->b_flags & B_PHYS) == 0)
 3321: 		panic("vmapbuf");
 3322: 	if (bp->b_bufsize < 0)
 3323: 		return (-1);
 3324: 	for (v = bp->b_saveaddr,
 3325: 		     addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data),
 3326: 		     pidx = 0;
 3327: 	     addr < bp->b_data + bp->b_bufsize;
 3328: 	     addr += PAGE_SIZE, v += PAGE_SIZE, pidx++) {
 3329: 		/*
 3330: 		 * Do the vm_fault if needed; do the copy-on-write thing
 3331: 		 * when reading stuff off device into memory.
 3332: 		 */
 3333: retry:
 3334: 		i = vm_fault_quick((addr >= bp->b_data) ? addr : bp->b_data,
 3335: 			(bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ);
 3336: 		if (i < 0) {
 3337: 			for (i = 0; i < pidx; ++i) {
 3338: 			    vm_page_unhold(bp->b_pages[i]);
 3339: 			    bp->b_pages[i] = NULL;
 3340: 			}
 3341: 			return(-1);
 3342: 		}
 3343: 
 3344: 		/*
 3345: 		 * WARNING!  If sparc support is MFCd in the future this will
 3346: 		 * have to be changed from pmap_kextract() to pmap_extract()
 3347: 		 * ala -current.
 3348: 		 */
 3349: #ifdef __sparc64__
 3350: #error "If MFCing sparc support use pmap_extract"
 3351: #endif
 3352: 		pa = pmap_kextract((vm_offset_t)addr);
 3353: 		if (pa == 0) {
 3354: 			printf("vmapbuf: warning, race against user address during I/O");
 3355: 			goto retry;
 3356: 		}
 3357: 		m = PHYS_TO_VM_PAGE(pa);
 3358: 		vm_page_hold(m);
 3359: 		bp->b_pages[pidx] = m;
 3360: 	}
 3361: 	if (pidx > btoc(MAXPHYS))
 3362: 		panic("vmapbuf: mapped more than MAXPHYS");
 3363: 	pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
 3364: 	
 3365: 	kva = bp->b_saveaddr;
 3366: 	bp->b_npages = pidx;
 3367: 	bp->b_saveaddr = bp->b_data;
 3368: 	bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
 3369: 	return(0);
 3370: }
 3371: 
 3372: /*
 3373:  * Free the io map PTEs associated with this IO operation.
 3374:  * We also invalidate the TLB entries and restore the original b_data.
 3375:  */
 3376: void
 3377: vunmapbuf(struct buf *bp)
 3378: {
 3379: 	int pidx;
 3380: 	int npages;
 3381: 	vm_page_t *m;
 3382: 
 3383: 	if ((bp->b_flags & B_PHYS) == 0)
 3384: 		panic("vunmapbuf");
 3385: 
 3386: 	npages = bp->b_npages;
 3387: 	pmap_qremove(trunc_page((vm_offset_t)bp->b_data),
 3388: 		     npages);
 3389: 	m = bp->b_pages;
 3390: 	for (pidx = 0; pidx < npages; pidx++)
 3391: 		vm_page_unhold(*m++);
 3392: 
 3393: 	bp->b_data = bp->b_saveaddr;
 3394: }
 3395: 
 3396: #include "opt_ddb.h"
 3397: #ifdef DDB
 3398: #include <ddb/ddb.h>
 3399: 
 3400: DB_SHOW_COMMAND(buffer, db_show_buffer)
 3401: {
 3402: 	/* get args */
 3403: 	struct buf *bp = (struct buf *)addr;
 3404: 
 3405: 	if (!have_addr) {
 3406: 		db_printf("usage: show buffer <addr>\n");
 3407: 		return;
 3408: 	}
 3409: 
 3410: 	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
 3411: 	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
 3412: 		  "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
 3413: 		  "b_blkno = %d, b_pblkno = %d\n",
 3414: 		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 3415: 		  major(bp->b_dev), minor(bp->b_dev),
 3416: 		  bp->b_data, bp->b_blkno, bp->b_pblkno);
 3417: 	if (bp->b_npages) {
 3418: 		int i;
 3419: 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 3420: 		for (i = 0; i < bp->b_npages; i++) {
 3421: 			vm_page_t m;
 3422: 			m = bp->b_pages[i];
 3423: 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
 3424: 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
 3425: 			if ((i + 1) < bp->b_npages)
 3426: 				db_printf(",");
 3427: 		}
 3428: 		db_printf("\n");
 3429: 	}
 3430: }
 3431: #endif /* DDB */