File: [DragonFly] src/sys/vfs/ufs/ufs_readwrite.c
Revision 1.10
Fri Apr 23 06:23:46 2004 UTC by dillon
Branches: MAIN
CVS tags: HEAD
msync(..., MS_INVALIDATE) will incorrectly remove dirty pages without
synchronizing them to their backing store under certain circumstances,
and can also cause struct buf's to become inconsistent.  This can be
particularly gruesome when MS_INVALIDATE is used on a range of memory that
is mmap()'d to be read-only.

Fix MS_INVALIDATE's operation (1) by making UFS honor the invalidation
request when flushing to backing store to destroy the related struct buf
and (2) by never removing pages wired into the buffer cache and never
removing pages that are found to still be dirty.

Note that NFS was already coded to honor invalidation requests in
nfs_write().  Filesystems other than NFS and UFS do not currently support
buffer-invalidation-on-write, but all that now means is that the pages will
remain in the cache rather than being incorrectly removed and causing corruption.

Reported-by: Stephan Uphoff <ups@tree.com>, Julian Elischer <julian@elischer.org>
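For context, here is a minimal, hypothetical user-level sketch of the call
pattern the log message refers to: a file is mmap()'d read-only and the range
is then invalidated with msync(..., MS_INVALIDATE).  The file path, sizes and
error handling are illustrative only and are not part of the commit.

/*
 * Illustrative sketch (not part of this commit) of the problematic
 * call pattern described in the log message above.  The path below
 * is hypothetical.
 */
#include <sys/mman.h>
#include <sys/stat.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/tmp/example.dat";	/* hypothetical test file */
	struct stat st;
	void *base;
	size_t len;
	int fd;

	if ((fd = open(path, O_RDONLY)) < 0)
		err(1, "open");
	if (fstat(fd, &st) < 0)
		err(1, "fstat");
	len = (size_t)st.st_size;

	/* Read-only, file-backed shared mapping. */
	base = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		err(1, "mmap");

	/*
	 * Discard the cached pages for this mapping.  Before the fix,
	 * pages in this range that were still dirty (for example, from
	 * another process writing the file) could be removed without
	 * being written to backing store, and buffer cache state could
	 * be left inconsistent.  With the fix, pages that are dirty or
	 * wired into the buffer cache are no longer destroyed here.
	 */
	if (msync(base, len, MS_INVALIDATE) < 0)
		err(1, "msync");

	munmap(base, len);
	close(fd);
	return (0);
}

On the kernel side, the IO_INVAL handling visible in ffs_write() below
(setting B_NOCACHE when both IO_SYNC and IO_INVAL are requested) appears to be
where UFS now honors the invalidation request when flushing to backing store.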

    1: /*-
    2:  * Copyright (c) 1993
    3:  *	The Regents of the University of California.  All rights reserved.
    4:  *
    5:  * Redistribution and use in source and binary forms, with or without
    6:  * modification, are permitted provided that the following conditions
    7:  * are met:
    8:  * 1. Redistributions of source code must retain the above copyright
    9:  *    notice, this list of conditions and the following disclaimer.
   10:  * 2. Redistributions in binary form must reproduce the above copyright
   11:  *    notice, this list of conditions and the following disclaimer in the
   12:  *    documentation and/or other materials provided with the distribution.
   13:  * 3. All advertising materials mentioning features or use of this software
   14:  *    must display the following acknowledgement:
   15:  *	This product includes software developed by the University of
   16:  *	California, Berkeley and its contributors.
   17:  * 4. Neither the name of the University nor the names of its contributors
   18:  *    may be used to endorse or promote products derived from this software
   19:  *    without specific prior written permission.
   20:  *
   21:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   22:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   23:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   24:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   25:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   26:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   27:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   28:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   30:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   31:  * SUCH DAMAGE.
   32:  *
   33:  *	@(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
   34:  * $FreeBSD: src/sys/ufs/ufs/ufs_readwrite.c,v 1.65.2.14 2003/04/04 22:21:29 tegge Exp $
   35:  * $DragonFly: src/sys/vfs/ufs/ufs_readwrite.c,v 1.10 2004/04/23 06:23:46 dillon Exp $
   36:  */
   37: 
   38: #define	BLKSIZE(a, b, c)	blksize(a, b, c)
   39: #define	FS			struct fs
   40: #define	I_FS			i_fs
   41: 
   42: #include <vm/vm.h>
   43: #include <vm/vm_object.h>
   44: #include <vm/vm_pager.h>
   45: #include <vm/vm_map.h>
   46: #include <vm/vnode_pager.h>
   47: #include <sys/event.h>
   48: #include <sys/vmmeter.h>
   49: #include <vm/vm_page2.h>
   50: 
   51: #include "opt_directio.h"
   52: 
   53: #define VN_KNOTE(vp, b) \
   54: 	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
   55: 
   56: #ifdef DIRECTIO
   57: extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
   58: #endif
   59: 
   60: /*
   61:  * Vnode op for reading.
   62:  */
   63: /* ARGSUSED */
   64: int
   65: ffs_read(ap)
   66: 	struct vop_read_args /* {
   67: 		struct vnode *a_vp;
   68: 		struct uio *a_uio;
   69: 		int a_ioflag;
   70: 		struct ucred *a_cred;
   71: 	} */ *ap;
   72: {
   73: 	struct vnode *vp;
   74: 	struct inode *ip;
   75: 	struct uio *uio;
   76: 	FS *fs;
   77: 	struct buf *bp;
   78: 	ufs_daddr_t lbn, nextlbn;
   79: 	off_t bytesinfile;
   80: 	long size, xfersize, blkoffset;
   81: 	int error, orig_resid;
   82: 	u_short mode;
   83: 	int seqcount;
   84: 	int ioflag;
   85: 	vm_object_t object;
   86: 
   87: 	vp = ap->a_vp;
   88: 	seqcount = ap->a_ioflag >> 16;
   89: 	ip = VTOI(vp);
   90: 	mode = ip->i_mode;
   91: 	uio = ap->a_uio;
   92: 	ioflag = ap->a_ioflag;
   93: #ifdef DIRECTIO
   94: 	if ((ioflag & IO_DIRECT) != 0) {
   95: 		int workdone;
   96: 
   97: 		error = ffs_rawread(vp, uio, &workdone);
   98: 		if (error || workdone)
   99: 			return error;
  100: 	}
  101: #endif
  102: 
  103: #ifdef DIAGNOSTIC
  104: 	if (uio->uio_rw != UIO_READ)
  105: 		panic("ffs_read: mode");
  106: 
  107: 	if (vp->v_type == VLNK) {
  108: 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
  109: 			panic("ffs_read: short symlink");
  110: 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
  111: 		panic("ffs_read: type %d", vp->v_type);
  112: #endif
  113: 	fs = ip->I_FS;
  114: 	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
  115: 		return (EFBIG);
  116: 
  117: 	orig_resid = uio->uio_resid;
  118: 	if (orig_resid <= 0)
  119: 		return (0);
  120: 
  121: 	object = vp->v_object;
  122: 
  123: 	bytesinfile = ip->i_size - uio->uio_offset;
  124: 	if (bytesinfile <= 0) {
  125: 		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
  126: 			ip->i_flag |= IN_ACCESS;
  127: 		return 0;
  128: 	}
  129: 
  130: 	if (object)
  131: 		vm_object_reference(object);
  132: 
  133: #ifdef ENABLE_VFS_IOOPT
  134: 	/*
  135: 	 * If IO optimisation is turned on,
  136: 	 * and we are NOT a VM based IO request, 
  137: 	 * (i.e. not headed for the buffer cache)
  138: 	 * but there IS a vm object associated with it.
  139: 	 */
  140: 	if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
  141: 		int nread, toread;
  142: 
  143: 		toread = uio->uio_resid;
  144: 		if (toread > bytesinfile)
  145: 			toread = bytesinfile;
  146: 		if (toread >= PAGE_SIZE) {
  147: 			/*
  148: 			 * Then if it's at least a page in size, try 
  149: 			 * get the data from the object using vm tricks
  150: 			 */
  151: 			error = uioread(toread, uio, object, &nread);
  152: 			if ((uio->uio_resid == 0) || (error != 0)) {
  153: 				/*
  154: 				 * If we finished or there was an error
  155: 				 * then finish up (the reference previously
  156: 				 * obtained on object must be released).
  157: 				 */
  158: 				if ((error == 0 ||
  159: 				    uio->uio_resid != orig_resid) &&
  160: 				    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
  161: 					ip->i_flag |= IN_ACCESS;
  162: 
  163: 				if (object)
  164: 					vm_object_vndeallocate(object);
  165: 				return error;
  166: 			}
  167: 		}
  168: 	}
  169: #endif
  170: 
  171: 	/*
  172: 	 * Ok so we couldn't do it all in one vm trick...
  173: 	 * so cycle around trying smaller bites..
  174: 	 */
  175: 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
  176: 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
  177: 			break;
  178: #ifdef ENABLE_VFS_IOOPT
  179: 		if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
  180: 			/*
  181: 			 * Obviously we didn't finish above, but we
  182: 			 * didn't get an error either. Try the same trick again,
  183: 			 * but this time we are looping.
  184: 			 */
  185: 			int nread, toread;
  186: 			toread = uio->uio_resid;
  187: 			if (toread > bytesinfile)
  188: 				toread = bytesinfile;
  189: 
  190: 			/*
  191: 			 * Once again, if there isn't enough for a
  192: 			 * whole page, don't try optimising.
  193: 			 */
  194: 			if (toread >= PAGE_SIZE) {
  195: 				error = uioread(toread, uio, object, &nread);
  196: 				if ((uio->uio_resid == 0) || (error != 0)) {
  197: 					/*
  198: 					 * If we finished or there was an 
  199: 					 * error then finish up (the reference
  200: 					 * previously obtained on object must 
  201: 					 * be released).
  202: 					 */
  203: 					if ((error == 0 ||
  204: 					    uio->uio_resid != orig_resid) &&
  205: 					    (vp->v_mount->mnt_flag &
  206: 					    MNT_NOATIME) == 0)
  207: 						ip->i_flag |= IN_ACCESS;
  208: 					if (object)
  209: 						vm_object_vndeallocate(object);
  210: 					return error;
  211: 				}
  212: 				/*
  213: 				 * To get here we didn't finish or err.
  214: 				 * If we did get some data,
  215: 				 * loop to try another bite.
  216: 				 */
  217: 				if (nread > 0) {
  218: 					continue;
  219: 				}
  220: 			}
  221: 		}
  222: #endif
  223: 
  224: 		lbn = lblkno(fs, uio->uio_offset);
  225: 		nextlbn = lbn + 1;
  226: 
  227: 		/*
  228: 		 * size of buffer.  The buffer representing the
  229: 		 * end of the file is rounded up to the size of
  230: 		 * the block type ( fragment or full block, 
  231: 		 * depending ).
  232: 		 */
  233: 		size = BLKSIZE(fs, ip, lbn);
  234: 		blkoffset = blkoff(fs, uio->uio_offset);
  235: 		
  236: 		/*
  237: 		 * The amount we want to transfer in this iteration is
  238: 		 * one FS block less the amount of the data before
  239: 		 * our startpoint (duh!)
  240: 		 */
  241: 		xfersize = fs->fs_bsize - blkoffset;
  242: 
  243: 		/*
  244: 		 * But if we actually want less than the block,
  245: 		 * or the file doesn't have a whole block more of data,
  246: 		 * then use the lesser number.
  247: 		 */
  248: 		if (uio->uio_resid < xfersize)
  249: 			xfersize = uio->uio_resid;
  250: 		if (bytesinfile < xfersize)
  251: 			xfersize = bytesinfile;
  252: 
  253: 		if (lblktosize(fs, nextlbn) >= ip->i_size) {
  254: 			/*
  255: 			 * Don't do readahead if this is the end of the file.
  256: 			 */
  257: 			error = bread(vp, lbn, size, &bp);
  258: 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
  259: 			/* 
  260: 			 * Otherwise if we are allowed to cluster,
  261: 			 * grab as much as we can.
  262: 			 *
  263: 			 * XXX  This may not be a win if we are not
  264: 			 * doing sequential access.
  265: 			 */
  266: 			error = cluster_read(vp, ip->i_size, lbn,
  267: 				size, uio->uio_resid, seqcount, &bp);
  268: 		} else if (seqcount > 1) {
  269: 			/*
  270: 			 * If we are NOT allowed to cluster, then
  271: 			 * if we appear to be acting sequentially,
  272: 			 * fire off a request for a readahead
  273: 			 * as well as a read. Note that the 4th and 5th
  274: 			 * arguments point to arrays of the size specified in
  275: 			 * the 6th argument.
  276: 			 */
  277: 			int nextsize = BLKSIZE(fs, ip, nextlbn);
  278: 			error = breadn(vp, lbn,
  279: 			    size, &nextlbn, &nextsize, 1, &bp);
  280: 		} else {
  281: 			/*
  282: 			 * Failing all of the above, just read what the 
  283: 			 * user asked for. Interestingly, the same as
  284: 			 * the first option above.
  285: 			 */
  286: 			error = bread(vp, lbn, size, &bp);
  287: 		}
  288: 		if (error) {
  289: 			brelse(bp);
  290: 			bp = NULL;
  291: 			break;
  292: 		}
  293: 
  294: 		/*
  295: 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
  296: 		 * will cause us to attempt to release the buffer later on
  297: 		 * and will cause the buffer cache to attempt to free the
  298: 		 * underlying pages.
  299: 		 */
  300: 		if (ioflag & IO_DIRECT)
  301: 			bp->b_flags |= B_DIRECT;
  302: 
  303: 		/*
  304: 		 * We should only get non-zero b_resid when an I/O error
  305: 		 * has occurred, which should cause us to break above.
  306: 		 * However, if the short read did not cause an error,
  307: 		 * then we want to ensure that we do not uiomove bad
  308: 		 * or uninitialized data.
  309: 		 *
  310: 		 * XXX b_resid is only valid when an actual I/O has occurred
  311: 		 * and may be incorrect if the buffer is B_CACHE or if the
  312: 		 * last op on the buffer was a failed write.  This KASSERT
  313: 		 * is a precursor to removing it from the UFS code.
  314: 		 */
  315: 		KASSERT(bp->b_resid == 0, ("bp->b_resid != 0"));
  316: 		size -= bp->b_resid;
  317: 		if (size < xfersize) {
  318: 			if (size == 0)
  319: 				break;
  320: 			xfersize = size;
  321: 		}
  322: 
  323: #ifdef ENABLE_VFS_IOOPT
  324: 		if (vfs_ioopt && object &&
  325: 		    (bp->b_flags & B_VMIO) &&
  326: 		    ((blkoffset & PAGE_MASK) == 0) &&
  327: 		    ((xfersize & PAGE_MASK) == 0)) {
  328: 			/*
  329: 		 * If VFS IO optimisation is turned on,
  330: 		 * and it's an exact page multiple,
  331: 		 * and a normal VM based op,
  332: 		 * then use uiomoveco().
  333: 			 */
  334: 			error =
  335: 				uiomoveco((char *)bp->b_data + blkoffset,
  336: 					(int)xfersize, uio, object);
  337: 		} else 
  338: #endif
  339: 		{
  340: 			/*
  341: 			 * otherwise use the general form
  342: 			 */
  343: 			error =
  344: 				uiomove((char *)bp->b_data + blkoffset,
  345: 					(int)xfersize, uio);
  346: 		}
  347: 
  348: 		if (error)
  349: 			break;
  350: 
  351: 		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
  352: 		    (LIST_FIRST(&bp->b_dep) == NULL)) {
  353: 			/*
  354: 			 * If there are no dependencies, and it's VMIO,
  355: 			 * then we don't need the buf, mark it available
  356: 			 * for freeing. The VM has the data.
  357: 			 */
  358: 			bp->b_flags |= B_RELBUF;
  359: 			brelse(bp);
  360: 		} else {
  361: 			/*
  362: 			 * Otherwise let whoever
  363: 			 * made the request take care of
  364: 			 * freeing it. We just queue
  365: 			 * it onto another list.
  366: 			 */
  367: 			bqrelse(bp);
  368: 		}
  369: 	}
  370: 
  371: 	/* 
  372: 	 * This can only happen in the case of an error
  373: 	 * because the loop above resets bp to NULL on each iteration
  374: 	 * and on normal completion has not set a new value into it,
  375: 	 * so it must have come from a 'break' statement.
  376: 	 */
  377: 	if (bp != NULL) {
  378: 		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
  379: 		    (LIST_FIRST(&bp->b_dep) == NULL)) {
  380: 			bp->b_flags |= B_RELBUF;
  381: 			brelse(bp);
  382: 		} else {
  383: 			bqrelse(bp);
  384: 		}
  385: 	}
  386: 
  387: 	if (object)
  388: 		vm_object_vndeallocate(object);
  389: 	if ((error == 0 || uio->uio_resid != orig_resid) &&
  390: 	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
  391: 		ip->i_flag |= IN_ACCESS;
  392: 	return (error);
  393: }
  394: 
  395: /*
  396:  * Vnode op for writing.
  397:  */
  398: int
  399: ffs_write(ap)
  400: 	struct vop_write_args /* {
  401: 		struct vnode *a_vp;
  402: 		struct uio *a_uio;
  403: 		int a_ioflag;
  404: 		struct ucred *a_cred;
  405: 	} */ *ap;
  406: {
  407: 	struct vnode *vp;
  408: 	struct uio *uio;
  409: 	struct inode *ip;
  410: 	FS *fs;
  411: 	struct buf *bp;
  412: 	ufs_daddr_t lbn;
  413: 	off_t osize;
  414: 	int seqcount;
  415: 	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
  416: 	vm_object_t object;
  417: 	struct thread *td;
  418: 
  419: 	extended = 0;
  420: 	seqcount = ap->a_ioflag >> 16;
  421: 	ioflag = ap->a_ioflag;
  422: 	uio = ap->a_uio;
  423: 	vp = ap->a_vp;
  424: 	ip = VTOI(vp);
  425: 
  426: 	object = vp->v_object;
  427: 	if (object)
  428: 		vm_object_reference(object);
  429: 
  430: #ifdef DIAGNOSTIC
  431: 	if (uio->uio_rw != UIO_WRITE)
  432: 		panic("ffs_write: mode");
  433: #endif
  434: 
  435: 	switch (vp->v_type) {
  436: 	case VREG:
  437: 		if (ioflag & IO_APPEND)
  438: 			uio->uio_offset = ip->i_size;
  439: 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
  440: 			if (object)
  441: 				vm_object_vndeallocate(object);
  442: 			return (EPERM);
  443: 		}
  444: 		/* FALLTHROUGH */
  445: 	case VLNK:
  446: 		break;
  447: 	case VDIR:
  448: 		panic("ffs_write: dir write");
  449: 		break;
  450: 	default:
  451: 		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
  452: 			(int)uio->uio_offset,
  453: 			(int)uio->uio_resid
  454: 		);
  455: 	}
  456: 
  457: 	fs = ip->I_FS;
  458: 	if (uio->uio_offset < 0 ||
  459: 	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
  460: 		if (object)
  461: 			vm_object_vndeallocate(object);
  462: 		return (EFBIG);
  463: 	}
  464: 	/*
  465: 	 * Maybe this should be above the vnode op call, but so long as
  466: 	 * file servers have no limits, I don't think it matters.
  467: 	 */
  468: 	td = uio->uio_td;
  469: 	if (vp->v_type == VREG && td && td->td_proc &&
  470: 	    uio->uio_offset + uio->uio_resid >
  471: 	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
  472: 		psignal(td->td_proc, SIGXFSZ);
  473: 		if (object)
  474: 			vm_object_vndeallocate(object);
  475: 		return (EFBIG);
  476: 	}
  477: 
  478: 	resid = uio->uio_resid;
  479: 	osize = ip->i_size;
  480: 
  481: 	/*
  482: 	 * NOTE! These B_ flags are actually balloc-only flags, not buffer
  483: 	 * flags.  They are similar to the BA_ flags in -current.
  484: 	 */
  485: 	if (seqcount > B_SEQMAX)
  486: 		flags = B_SEQMAX << B_SEQSHIFT;
  487: 	else
  488: 		flags = seqcount << B_SEQSHIFT;
  489: 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
  490: 		flags |= B_SYNC;
  491: 
  492: 	if (object && (object->flags & OBJ_OPT)) {
  493: 		vm_freeze_copyopts(object,
  494: 			OFF_TO_IDX(uio->uio_offset),
  495: 			OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
  496: 	}
  497: 
  498: 	for (error = 0; uio->uio_resid > 0;) {
  499: 		lbn = lblkno(fs, uio->uio_offset);
  500: 		blkoffset = blkoff(fs, uio->uio_offset);
  501: 		xfersize = fs->fs_bsize - blkoffset;
  502: 		if (uio->uio_resid < xfersize)
  503: 			xfersize = uio->uio_resid;
  504: 
  505: 		if (uio->uio_offset + xfersize > ip->i_size)
  506: 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
  507: 
  508: 		/*      
  509: 		 * We must perform a read-before-write if the transfer
  510: 		 * size does not cover the entire buffer.
  511: 		 */
  512: 		if (fs->fs_bsize > xfersize)
  513: 			flags |= B_CLRBUF;
  514: 		else
  515: 			flags &= ~B_CLRBUF;
  516: /* XXX is uio->uio_offset the right thing here? */
  517: 		error = VOP_BALLOC(vp, uio->uio_offset, xfersize,
  518: 		    ap->a_cred, flags, &bp);
  519: 		if (error != 0)
  520: 			break;
  521: 		/*
  522: 		 * If the buffer is not valid and we did not clear garbage
  523: 		 * out above, we have to do so here even though the write
  524: 		 * covers the entire buffer in order to avoid a mmap()/write
  525: 		 * race where another process may see the garbage prior to
  526: 		 * the uiomove() for a write replacing it.
  527: 		 */
  528: 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
  529: 			vfs_bio_clrbuf(bp);
  530: 		if (ioflag & IO_DIRECT)
  531: 			bp->b_flags |= B_DIRECT;
  532: 		if (ioflag & IO_NOWDRAIN)
  533: 			bp->b_flags |= B_NOWDRAIN;
  534: 		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
  535: 			bp->b_flags |= B_NOCACHE;
  536: 
  537: 		if (uio->uio_offset + xfersize > ip->i_size) {
  538: 			ip->i_size = uio->uio_offset + xfersize;
  539: 			extended = 1;
  540: 		}
  541: 
  542: 		size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
  543: 		if (size < xfersize)
  544: 			xfersize = size;
  545: 
  546: 		error =
  547: 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
  548: 		if ((ioflag & (IO_VMIO|IO_DIRECT)) && 
  549: 		    (LIST_FIRST(&bp->b_dep) == NULL)) {
  550: 			bp->b_flags |= B_RELBUF;
  551: 		}
  552: 
  553: 		/*
  554: 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
  555: 		 * if we have a severe page deficiency write the buffer 
  556: 		 * asynchronously.  Otherwise try to cluster, and if that
  557: 		 * doesn't do it then either do an async write (if O_DIRECT),
  558: 		 * or a delayed write (if not).
  559: 		 */
  560: 
  561: 		if (ioflag & IO_SYNC) {
  562: 			(void)bwrite(bp);
  563: 		} else if (vm_page_count_severe() || 
  564: 			    buf_dirty_count_severe() ||
  565: 			    (ioflag & IO_ASYNC)) {
  566: 			bp->b_flags |= B_CLUSTEROK;
  567: 			bawrite(bp);
  568: 		} else if (xfersize + blkoffset == fs->fs_bsize) {
  569: 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
  570: 				bp->b_flags |= B_CLUSTEROK;
  571: 				cluster_write(bp, ip->i_size, seqcount);
  572: 			} else {
  573: 				bawrite(bp);
  574: 			}
  575: 		} else if (ioflag & IO_DIRECT) {
  576: 			bp->b_flags |= B_CLUSTEROK;
  577: 			bawrite(bp);
  578: 		} else {
  579: 			bp->b_flags |= B_CLUSTEROK;
  580: 			bdwrite(bp);
  581: 		}
  582: 		if (error || xfersize == 0)
  583: 			break;
  584: 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
  585: 	}
  586: 	/*
  587: 	 * If we successfully wrote any data, and we are not the superuser
  588: 	 * we clear the setuid and setgid bits as a precaution against
  589: 	 * tampering.
  590: 	 */
  591: 	if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
  592: 		ip->i_mode &= ~(ISUID | ISGID);
  593: 	if (resid > uio->uio_resid)
  594: 		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
  595: 	if (error) {
  596: 		if (ioflag & IO_UNIT) {
  597: 			(void)UFS_TRUNCATE(vp, osize,
  598: 			    ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
  599: 			uio->uio_offset -= resid - uio->uio_resid;
  600: 			uio->uio_resid = resid;
  601: 		}
  602: 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
  603: 		error = UFS_UPDATE(vp, 1);
  604: 
  605: 	if (object)
  606: 		vm_object_vndeallocate(object);
  607: 
  608: 	return (error);
  609: }
  610: 
  611: 
  612: /*
  613:  * get page routine
  614:  */
  615: int
  616: ffs_getpages(ap)
  617: 	struct vop_getpages_args *ap;
  618: {
  619: 	off_t foff, physoffset;
  620: 	int i, size, bsize;
  621: 	struct vnode *dp, *vp;
  622: 	vm_object_t obj;
  623: 	vm_pindex_t pindex, firstindex;
  624: 	vm_page_t mreq;
  625: 	int bbackwards, bforwards;
  626: 	int pbackwards, pforwards;
  627: 	int firstpage;
  628: 	int reqlblkno;
  629: 	daddr_t reqblkno;
  630: 	int poff;
  631: 	int pcount;
  632: 	int rtval;
  633: 	int pagesperblock;
  634: 
  635: 
  636: 	pcount = round_page(ap->a_count) / PAGE_SIZE;
  637: 	mreq = ap->a_m[ap->a_reqpage];
  638: 	firstindex = ap->a_m[0]->pindex;
  639: 
  640: 	/*
  641: 	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
  642: 	 * then the entire page is valid.  Since the page may be mapped,
  643: 	 * user programs might reference data beyond the actual end of file
  644: 	 * occurring within the page.  We have to zero that data.
  645: 	 */
  646: 	if (mreq->valid) {
  647: 		if (mreq->valid != VM_PAGE_BITS_ALL)
  648: 			vm_page_zero_invalid(mreq, TRUE);
  649: 		for (i = 0; i < pcount; i++) {
  650: 			if (i != ap->a_reqpage) {
  651: 				vm_page_free(ap->a_m[i]);
  652: 			}
  653: 		}
  654: 		return VM_PAGER_OK;
  655: 	}
  656: 
  657: 	vp = ap->a_vp;
  658: 	obj = vp->v_object;
  659: 	bsize = vp->v_mount->mnt_stat.f_iosize;
  660: 	pindex = mreq->pindex;
  661: 	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
  662: 
  663: 	if (bsize < PAGE_SIZE)
  664: 		return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
  665: 						    ap->a_count,
  666: 						    ap->a_reqpage);
  667: 
  668: 	/*
  669: 	 * foff is the file offset of the required page
  670: 	 * reqlblkno is the logical block that contains the page
  671: 	 * poff is the index of the page into the logical block
  672: 	 */
  673: 	reqlblkno = foff / bsize;
  674: 	poff = (foff % bsize) / PAGE_SIZE;
  675: 
  676: 	if ( VOP_BMAP( vp, reqlblkno, &dp, &reqblkno,
  677: 		&bforwards, &bbackwards) || (reqblkno == -1)) {
  678: 		for(i = 0; i < pcount; i++) {
  679: 			if (i != ap->a_reqpage)
  680: 				vm_page_free(ap->a_m[i]);
  681: 		}
  682: 		if (reqblkno == -1) {
  683: 			if ((mreq->flags & PG_ZERO) == 0)
  684: 				vm_page_zero_fill(mreq);
  685: 			vm_page_undirty(mreq);
  686: 			mreq->valid = VM_PAGE_BITS_ALL;
  687: 			return VM_PAGER_OK;
  688: 		} else {
  689: 			return VM_PAGER_ERROR;
  690: 		}
  691: 	}
  692: 
  693: 	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
  694: 	pagesperblock = bsize / PAGE_SIZE;
  695: 	/*
  696: 	 * find the first page that is contiguous...
  697: 	 * note that pbackwards is the number of pages that are contiguous
  698: 	 * backwards.
  699: 	 */
  700: 	firstpage = 0;
  701: 	if (ap->a_count) {
  702: 		pbackwards = poff + bbackwards * pagesperblock;
  703: 		if (ap->a_reqpage > pbackwards) {
  704: 			firstpage = ap->a_reqpage - pbackwards;
  705: 			for(i=0;i<firstpage;i++)
  706: 				vm_page_free(ap->a_m[i]);
  707: 		}
  708: 
  709: 	/*
  710: 	 * pforwards is the number of pages that are contiguous
  711: 	 * after the current page.
  712: 	 */
  713: 		pforwards = (pagesperblock - (poff + 1)) +
  714: 			bforwards * pagesperblock;
  715: 		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
  716: 			for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
  717: 				vm_page_free(ap->a_m[i]);
  718: 			pcount = ap->a_reqpage + pforwards + 1;
  719: 		}
  720: 
  721: 	/*
  722: 	 * number of pages for I/O corrected for the non-contig pages at
  723: 	 * the beginning of the array.
  724: 	 */
  725: 		pcount -= firstpage;
  726: 	}
  727: 
  728: 	/*
  729: 	 * calculate the size of the transfer
  730: 	 */
  731: 
  732: 	size = pcount * PAGE_SIZE;
  733: 
  734: 	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
  735: 		obj->un_pager.vnp.vnp_size)
  736: 		size = obj->un_pager.vnp.vnp_size -
  737: 			IDX_TO_OFF(ap->a_m[firstpage]->pindex);
  738: 
  739: 	physoffset -= foff;
  740: 	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
  741: 		(ap->a_reqpage - firstpage), physoffset);
  742: 
  743: 	return (rtval);
  744: }
  745: 
  746: /*
  747:  * put page routine
  748:  *
  749:  * XXX By default, wimp out... note that a_offset is ignored (and always
  750:  * XXX has been).
  751:  */
  752: int
  753: ffs_putpages(ap)
  754: 	struct vop_putpages_args *ap;
  755: {
  756: 	return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
  757: 		ap->a_sync, ap->a_rtvals);
  758: }