File:  [DragonFly] / src / sys / kern / vfs_subr.c
Revision 1.30
Wed May 19 22:52:58 2004 UTC by dillon
Branches: MAIN
CVS tags: HEAD
Device layer rollup commit.

* cdevsw_add() is now required.  cdevsw_add() and cdevsw_remove() may specify
  a mask/match pair indicating the range of minor numbers they cover.  Multiple
  cdevsw_add() calls may be issued for the same major number as long as they
  cover distinct ranges.  All drivers that previously failed to call
  cdevsw_add() now do so.
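
  As a rough illustration only (the exact cdevsw_add()/cdevsw_remove()
  prototypes and the (minor & mask) == match semantics are assumed here, and
  mydev_cdevsw/unit are hypothetical names), a driver claiming one unit's
  worth of minor numbers under a shared major might do:

      static struct cdevsw mydev_cdevsw;      /* hypothetical driver cdevsw */

      /* attach: claim only the minors belonging to this unit */
      cdevsw_add(&mydev_cdevsw, 0xff, unit);

      /* detach: drops the registration and destroys the matching devices */
      cdevsw_remove(&mydev_cdevsw, 0xff, unit);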

* cdevsw_remove() now automatically marks all devices within its supported
  range as being destroyed.

* vnode->v_rdev is no longer resolved when the vnode is created.  Instead,
  only v_udev (a newly added field) is resolved.  v_rdev is resolved when
  the vnode is opened and cleared on the last close.
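
  Sketched out (this is not the actual spec_open()/spec_close() code, and the
  udev2dev() argument details are assumed), the open path now goes from the
  persistent v_udev to a live device using v_associate_rdev() and
  v_release_rdev() from this file:

      dev_t dev;

      if (vp->v_rdev == NULL) {
              dev = udev2dev(vp->v_udev, 0);  /* fails if device was destroyed */
              if (dev == NODEV)
                      return (ENXIO);
              v_associate_rdev(vp, dev);      /* sets v_rdev, enters spechash */
      }
      /* ... and on last close: */
      v_release_rdev(vp);                     /* clears v_rdev, drops the ref */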

* A great deal of code was making rather dubious assumptions with regard
  to the validity of devices associated with vnodes, primarily because a
  device structure used to persist forever, being indexed by (major, minor)
  instead of by (cdevsw, major, minor).  In particular, if you run a program
  which connects to a USB device and then you pull the USB device and plug
  it back in, the vnode subsystem will continue to believe that the device
  is open when, in fact, it isn't (because it was destroyed and recreated).

  In particular, note that all the VFS mount procedures now check devices
  via v_udev instead of v_rdev prior to calling VOP_OPEN(), since v_rdev
  is NULL prior to the first open.
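
  A hypothetical before/after from such a mount procedure (illustrative only;
  devvp and mounted_dev are made-up names, si_udev is the udev field of the
  dev_t):

      /* old style -- now meaningless, v_rdev is NULL before the first open */
      if (devvp->v_rdev == mounted_dev)
              return (EBUSY);

      /* new style -- compare the persistent udev instead */
      if (devvp->v_udev == mounted_dev->si_udev)
              return (EBUSY);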

* The disk layer's device interaction has been rewritten.  The disk layer
  (i.e. the slice and disklabel management layer) no longer overloads
  its data onto the device structure representing the underlying physical
  disk.  Instead, the disk layer uses the new cdevsw_add() functionality
  to register its own cdevsw using the underlying device's major number,
  and simply does NOT register the underlying device's cdevsw.  No
  confusion is created because the device hash is now based on
  (cdevsw, major, minor) rather than (major, minor).

  NOTE: This also means that underlying raw disk devices may use the entire
  device minor number instead of having to reserve the bits used by the disk
  layer, and also means that we can (theoretically) stack a fully
  disklabel-supported 'disk' on top of any block device.

* The new reference counting scheme prevents such stale-device problems by
  associating a device with a cdevsw and disconnecting the device from its
  cdevsw when the cdevsw is removed.  Additionally, all udev2dev() lookups
  run through the cdevsw mask/match and only successfully find devices that
  are still associated with an active cdevsw.
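
  The net effect, sketched with the hypothetical names used above (saved_udev
  is simply a udev_t remembered from before the removal):

      cdevsw_remove(&mydev_cdevsw, 0xff, unit);
      /* ... later ... */
      dev = udev2dev(saved_udev, 0);  /* minors in the removed range no longer
                                       * resolve to a stale device */
      if (dev == NODEV)
              return (ENXIO);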

* Major work on MFS:  MFS no longer shortcuts vnode and device creation.  It
  now creates a real vnode and a real device and implements real open and
  close VOPs.  Additionally, due to the disk layer changes, MFS is no longer
  limited to 255 mounts.  The new limit is 16 million.  Since MFS creates a
  real device node, mount_mfs will now create a real /dev/mfs<PID> device
  that can be read from userland (e.g. so you can dump an MFS filesystem).

* BUF AND DEVICE STRATEGY changes.  The struct buf contains a b_dev field.
  In order to properly handle stacked devices we now require that the b_dev
  field be initialized before the device strategy routine is called.  This
  required some additional work in various VFS implementations.  To enforce
  this requirement, biodone() now sets b_dev to NODEV.  The new disk layer
  will adjust b_dev before forwarding a request to the actual physical
  device.
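
  A minimal sketch of the contract as seen from a VFS write path (devvp/bp are
  generic placeholders; vn_todev(), VOP_STRATEGY(), biowait() and the
  biodone() behavior described above are the only interfaces relied upon):

      bp->b_dev = vn_todev(devvp);    /* must be valid before strategy */
      VOP_STRATEGY(devvp, bp);
      biowait(bp);
      /* biodone() has reset bp->b_dev to NODEV; a stacked disk layer
       * re-points b_dev at the underlying physical device before
       * forwarding the request downward. */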

* A bug in the ISO CD boot sequence which resulted in a panic has been fixed.

Testing by: lots of people, but David Rhodus found the most egregious bugs.

    1: /*
    2:  * Copyright (c) 1989, 1993
    3:  *	The Regents of the University of California.  All rights reserved.
    4:  * (c) UNIX System Laboratories, Inc.
    5:  * All or some portions of this file are derived from material licensed
    6:  * to the University of California by American Telephone and Telegraph
    7:  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8:  * the permission of UNIX System Laboratories, Inc.
    9:  *
   10:  * Redistribution and use in source and binary forms, with or without
   11:  * modification, are permitted provided that the following conditions
   12:  * are met:
   13:  * 1. Redistributions of source code must retain the above copyright
   14:  *    notice, this list of conditions and the following disclaimer.
   15:  * 2. Redistributions in binary form must reproduce the above copyright
   16:  *    notice, this list of conditions and the following disclaimer in the
   17:  *    documentation and/or other materials provided with the distribution.
   18:  * 3. All advertising materials mentioning features or use of this software
   19:  *    must display the following acknowledgement:
   20:  *	This product includes software developed by the University of
   21:  *	California, Berkeley and its contributors.
   22:  * 4. Neither the name of the University nor the names of its contributors
   23:  *    may be used to endorse or promote products derived from this software
   24:  *    without specific prior written permission.
   25:  *
   26:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36:  * SUCH DAMAGE.
   37:  *
   38:  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
   39:  * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
   40:  * $DragonFly: src/sys/kern/vfs_subr.c,v 1.30 2004/05/19 22:52:58 dillon Exp $
   41:  */
   42: 
   43: /*
   44:  * External virtual filesystem routines
   45:  */
   46: #include "opt_ddb.h"
   47: 
   48: #include <sys/param.h>
   49: #include <sys/systm.h>
   50: #include <sys/buf.h>
   51: #include <sys/conf.h>
   52: #include <sys/dirent.h>
   53: #include <sys/domain.h>
   54: #include <sys/eventhandler.h>
   55: #include <sys/fcntl.h>
   56: #include <sys/kernel.h>
   57: #include <sys/kthread.h>
   58: #include <sys/malloc.h>
   59: #include <sys/mbuf.h>
   60: #include <sys/mount.h>
   61: #include <sys/proc.h>
   62: #include <sys/namei.h>
   63: #include <sys/reboot.h>
   64: #include <sys/socket.h>
   65: #include <sys/stat.h>
   66: #include <sys/sysctl.h>
   67: #include <sys/syslog.h>
   68: #include <sys/vmmeter.h>
   69: #include <sys/vnode.h>
   70: 
   71: #include <machine/limits.h>
   72: 
   73: #include <vm/vm.h>
   74: #include <vm/vm_object.h>
   75: #include <vm/vm_extern.h>
   76: #include <vm/vm_kern.h>
   77: #include <vm/pmap.h>
   78: #include <vm/vm_map.h>
   79: #include <vm/vm_page.h>
   80: #include <vm/vm_pager.h>
   81: #include <vm/vnode_pager.h>
   82: #include <vm/vm_zone.h>
   83: 
   84: #include <sys/buf2.h>
   85: #include <sys/thread2.h>
   86: 
   87: static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
   88: 
   89: static void	insmntque (struct vnode *vp, struct mount *mp);
   90: static void	vclean (struct vnode *vp, lwkt_tokref_t vlock, int flags, struct thread *td);
   91: static unsigned long	numvnodes;
   92: SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
   93: 
   94: enum vtype iftovt_tab[16] = {
   95: 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
   96: 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
   97: };
   98: int vttoif_tab[9] = {
   99: 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
  100: 	S_IFSOCK, S_IFIFO, S_IFMT,
  101: };
  102: 
  103: static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
  104: 
  105: static u_long wantfreevnodes = 25;
  106: SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
  107: static u_long freevnodes = 0;
  108: SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
  109: 
  110: static int reassignbufcalls;
  111: SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
  112: static int reassignbufloops;
  113: SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
  114: static int reassignbufsortgood;
  115: SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
  116: static int reassignbufsortbad;
  117: SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
  118: static int reassignbufmethod = 1;
  119: SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
  120: 
  121: #ifdef ENABLE_VFS_IOOPT
  122: int vfs_ioopt = 0;
  123: SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
  124: #endif
  125: 
  126: struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
  127: struct lwkt_token mountlist_token;
  128: struct lwkt_token mntvnode_token;
  129: int	nfs_mount_type = -1;
  130: static struct lwkt_token mntid_token;
  131: static struct lwkt_token vnode_free_list_token;
  132: static struct lwkt_token spechash_token;
  133: struct nfs_public nfs_pub;	/* publicly exported FS */
  134: static vm_zone_t vnode_zone;
  135: 
  136: /*
  137:  * The workitem queue.
  138:  */
  139: #define SYNCER_MAXDELAY		32
  140: static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
  141: time_t syncdelay = 30;		/* max time to delay syncing data */
  142: SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW, &syncdelay, 0,
  143: 	"VFS data synchronization delay");
  144: time_t filedelay = 30;		/* time to delay syncing files */
  145: SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
  146: 	"File synchronization delay");
  147: time_t dirdelay = 29;		/* time to delay syncing directories */
  148: SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
  149: 	"Directory synchronization delay");
  150: time_t metadelay = 28;		/* time to delay syncing metadata */
  151: SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
  152: 	"VFS metadata synchronization delay");
  153: static int rushjob;			/* number of slots to run ASAP */
  154: static int stat_rush_requests;	/* number of times I/O speeded up */
  155: SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
  156: 
  157: static int syncer_delayno = 0;
  158: static long syncer_mask; 
  159: LIST_HEAD(synclist, vnode);
  160: static struct synclist *syncer_workitem_pending;
  161: 
  162: int desiredvnodes;
  163: SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, 
  164:     &desiredvnodes, 0, "Maximum number of vnodes");
  165: static int minvnodes;
  166: SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 
  167:     &minvnodes, 0, "Minimum number of vnodes");
  168: static int vnlru_nowhere = 0;
  169: SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
  170:     "Number of times the vnlru process ran without success");
  171: 
  172: static void	vfs_free_addrlist (struct netexport *nep);
  173: static int	vfs_free_netcred (struct radix_node *rn, void *w);
  174: static int	vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
  175: 				       struct export_args *argp);
  176: 
  177: #define VSHOULDFREE(vp) \
  178: 	(!((vp)->v_flag & (VFREE|VDOOMED)) && \
  179: 	 !(vp)->v_holdcnt && !(vp)->v_usecount && \
  180: 	 (!(vp)->v_object || \
  181: 	  !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count)))
  182:  
  183: #define VMIGHTFREE(vp) \
  184: 	(((vp)->v_flag & (VFREE|VDOOMED|VXLOCK)) == 0 &&   \
  185: 	 cache_leaf_test(vp) == 0 && (vp)->v_usecount == 0)
  186:  
  187: #define VSHOULDBUSY(vp) \
  188: 	(((vp)->v_flag & VFREE) && \
  189: 	 ((vp)->v_holdcnt || (vp)->v_usecount))
  190: 
  191: static void vbusy(struct vnode *vp);
  192: static void vfree(struct vnode *vp);
  193: static void vmaybefree(struct vnode *vp);
  194: 
  195: extern int dev_ref_debug;
  196: 
  197: /*
  198:  * NOTE: the vnode interlock must be held on call.
  199:  */
  200: static __inline void
  201: vmaybefree(struct vnode *vp)
  202: {
  203: 	if (VSHOULDFREE(vp))
  204: 		vfree(vp);
  205: }
  206:  
  207: /*
  208:  * Initialize the vnode management data structures.
  209:  */
  210: void
  211: vntblinit()
  212: {
  213: 
  214: 	/*
  215: 	 * Desired vnodes is a result of the physical page count
  216: 	 * and the size of kernel's heap.  It scales in proportion
  217: 	 * to the amount of available physical memory.  This can
  218: 	 * cause trouble on 64-bit and large memory platforms.
  219: 	 */
  220: 	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
  221: 	desiredvnodes =
  222: 		min(maxproc + vmstats.v_page_count /4,
  223: 		    2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
  224: 		    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
  225: 
  226: 	minvnodes = desiredvnodes / 4;
  227: 	lwkt_token_init(&mountlist_token);
  228: 	lwkt_token_init(&mntvnode_token);
  229: 	lwkt_token_init(&mntid_token);
  230: 	lwkt_token_init(&spechash_token);
  231: 	TAILQ_INIT(&vnode_free_list);
  232: 	lwkt_token_init(&vnode_free_list_token);
  233: 	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
  234: 	/*
  235: 	 * Initialize the filesystem syncer.
  236: 	 */     
  237: 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 
  238: 		&syncer_mask);
  239: 	syncer_maxdelay = syncer_mask + 1;
  240: }
  241: 
  242: /*
  243:  * Mark a mount point as busy. Used to synchronize access and to delay
  244:  * unmounting. Interlock is not released on failure.
  245:  */
  246: int
  247: vfs_busy(struct mount *mp, int flags, lwkt_tokref_t interlkp, struct thread *td)
  248: {
  249: 	int lkflags;
  250: 
  251: 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
  252: 		if (flags & LK_NOWAIT)
  253: 			return (ENOENT);
  254: 		mp->mnt_kern_flag |= MNTK_MWAIT;
  255: 		/*
  256: 		 * Since all busy locks are shared except the exclusive
  257: 		 * lock granted when unmounting, the only place that a
  258: 		 * wakeup needs to be done is at the release of the
  259: 		 * exclusive lock at the end of dounmount.
  260: 		 *
  261: 		 * note: interlkp is a serializer and thus can be safely
  262: 		 * held through any sleep
  263: 		 */
  264: 		tsleep((caddr_t)mp, 0, "vfs_busy", 0);
  265: 		return (ENOENT);
  266: 	}
  267: 	lkflags = LK_SHARED | LK_NOPAUSE;
  268: 	if (interlkp)
  269: 		lkflags |= LK_INTERLOCK;
  270: 	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
  271: 		panic("vfs_busy: unexpected lock failure");
  272: 	return (0);
  273: }
  274: 
  275: /*
  276:  * Free a busy filesystem.
  277:  */
  278: void
  279: vfs_unbusy(struct mount *mp, struct thread *td)
  280: {
  281: 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
  282: }
  283: 
  284: /*
  285:  * Lookup a filesystem type, and if found allocate and initialize
  286:  * a mount structure for it.
  287:  *
  288:  * Devname is usually updated by mount(8) after booting.
  289:  */
  290: int
  291: vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
  292: {
  293: 	struct thread *td = curthread;	/* XXX */
  294: 	struct vfsconf *vfsp;
  295: 	struct mount *mp;
  296: 
  297: 	if (fstypename == NULL)
  298: 		return (ENODEV);
  299: 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
  300: 		if (!strcmp(vfsp->vfc_name, fstypename))
  301: 			break;
  302: 	if (vfsp == NULL)
  303: 		return (ENODEV);
  304: 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
  305: 	bzero((char *)mp, (u_long)sizeof(struct mount));
  306: 	lockinit(&mp->mnt_lock, 0, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
  307: 	vfs_busy(mp, LK_NOWAIT, NULL, td);
  308: 	TAILQ_INIT(&mp->mnt_nvnodelist);
  309: 	TAILQ_INIT(&mp->mnt_reservedvnlist);
  310: 	mp->mnt_nvnodelistsize = 0;
  311: 	mp->mnt_vfc = vfsp;
  312: 	mp->mnt_op = vfsp->vfc_vfsops;
  313: 	mp->mnt_flag = MNT_RDONLY;
  314: 	mp->mnt_vnodecovered = NULLVP;
  315: 	vfsp->vfc_refcount++;
  316: 	mp->mnt_iosize_max = DFLTPHYS;
  317: 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
  318: 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
  319: 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
  320: 	mp->mnt_stat.f_mntonname[0] = '/';
  321: 	mp->mnt_stat.f_mntonname[1] = 0;
  322: 	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
  323: 	*mpp = mp;
  324: 	return (0);
  325: }
  326: 
  327: /*
  328:  * Find an appropriate filesystem to use for the root. If a filesystem
  329:  * has not been preselected, walk through the list of known filesystems
  330:  * trying those that have mountroot routines, and try them until one
  331:  * works or we have tried them all.
  332:  */
  333: #ifdef notdef	/* XXX JH */
  334: int
  335: lite2_vfs_mountroot()
  336: {
  337: 	struct vfsconf *vfsp;
  338: 	extern int (*lite2_mountroot) (void);
  339: 	int error;
  340: 
  341: 	if (lite2_mountroot != NULL)
  342: 		return ((*lite2_mountroot)());
  343: 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
  344: 		if (vfsp->vfc_mountroot == NULL)
  345: 			continue;
  346: 		if ((error = (*vfsp->vfc_mountroot)()) == 0)
  347: 			return (0);
  348: 		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
  349: 	}
  350: 	return (ENODEV);
  351: }
  352: #endif
  353: 
  354: /*
  355:  * Lookup a mount point by filesystem identifier.
  356:  */
  357: struct mount *
  358: vfs_getvfs(fsid)
  359: 	fsid_t *fsid;
  360: {
  361: 	struct mount *mp;
  362: 	lwkt_tokref ilock;
  363: 
  364: 	lwkt_gettoken(&ilock, &mountlist_token);
  365: 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  366: 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
  367: 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
  368: 			break;
  369: 	    }
  370: 	}
  371: 	lwkt_reltoken(&ilock);
  372: 	return (mp);
  373: }
  374: 
  375: /*
  376:  * Get a new unique fsid.  Try to make its val[0] unique, since this value
  377:  * will be used to create fake device numbers for stat().  Also try (but
   378:  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
  379:  * support 16-bit device numbers.  We end up with unique val[0]'s for the
  380:  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
  381:  *
  382:  * Keep in mind that several mounts may be running in parallel.  Starting
  383:  * the search one past where the previous search terminated is both a
  384:  * micro-optimization and a defense against returning the same fsid to
  385:  * different mounts.
  386:  */
  387: void
  388: vfs_getnewfsid(mp)
  389: 	struct mount *mp;
  390: {
  391: 	static u_int16_t mntid_base;
  392: 	lwkt_tokref ilock;
  393: 	fsid_t tfsid;
  394: 	int mtype;
  395: 
  396: 	lwkt_gettoken(&ilock, &mntid_token);
  397: 	mtype = mp->mnt_vfc->vfc_typenum;
  398: 	tfsid.val[1] = mtype;
  399: 	mtype = (mtype & 0xFF) << 24;
  400: 	for (;;) {
  401: 		tfsid.val[0] = makeudev(255,
  402: 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
  403: 		mntid_base++;
  404: 		if (vfs_getvfs(&tfsid) == NULL)
  405: 			break;
  406: 	}
  407: 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
  408: 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
  409: 	lwkt_reltoken(&ilock);
  410: }
  411: 
  412: /*
  413:  * Knob to control the precision of file timestamps:
  414:  *
  415:  *   0 = seconds only; nanoseconds zeroed.
  416:  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  417:  *   2 = seconds and nanoseconds, truncated to microseconds.
  418:  * >=3 = seconds and nanoseconds, maximum precision.
  419:  */
  420: enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
  421: 
  422: static int timestamp_precision = TSP_SEC;
  423: SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
  424:     &timestamp_precision, 0, "");
  425: 
  426: /*
  427:  * Get a current timestamp.
  428:  */
  429: void
  430: vfs_timestamp(tsp)
  431: 	struct timespec *tsp;
  432: {
  433: 	struct timeval tv;
  434: 
  435: 	switch (timestamp_precision) {
  436: 	case TSP_SEC:
  437: 		tsp->tv_sec = time_second;
  438: 		tsp->tv_nsec = 0;
  439: 		break;
  440: 	case TSP_HZ:
  441: 		getnanotime(tsp);
  442: 		break;
  443: 	case TSP_USEC:
  444: 		microtime(&tv);
  445: 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
  446: 		break;
  447: 	case TSP_NSEC:
  448: 	default:
  449: 		nanotime(tsp);
  450: 		break;
  451: 	}
  452: }
  453: 
  454: /*
  455:  * Set vnode attributes to VNOVAL
  456:  */
  457: void
  458: vattr_null(vap)
  459: 	struct vattr *vap;
  460: {
  461: 
  462: 	vap->va_type = VNON;
  463: 	vap->va_size = VNOVAL;
  464: 	vap->va_bytes = VNOVAL;
  465: 	vap->va_mode = VNOVAL;
  466: 	vap->va_nlink = VNOVAL;
  467: 	vap->va_uid = VNOVAL;
  468: 	vap->va_gid = VNOVAL;
  469: 	vap->va_fsid = VNOVAL;
  470: 	vap->va_fileid = VNOVAL;
  471: 	vap->va_blocksize = VNOVAL;
  472: 	vap->va_rdev = VNOVAL;
  473: 	vap->va_atime.tv_sec = VNOVAL;
  474: 	vap->va_atime.tv_nsec = VNOVAL;
  475: 	vap->va_mtime.tv_sec = VNOVAL;
  476: 	vap->va_mtime.tv_nsec = VNOVAL;
  477: 	vap->va_ctime.tv_sec = VNOVAL;
  478: 	vap->va_ctime.tv_nsec = VNOVAL;
  479: 	vap->va_flags = VNOVAL;
  480: 	vap->va_gen = VNOVAL;
  481: 	vap->va_vaflags = 0;
  482: }
  483: 
  484: /*
  485:  * This routine is called when we have too many vnodes.  It attempts
  486:  * to free <count> vnodes and will potentially free vnodes that still
  487:  * have VM backing store (VM backing store is typically the cause
  488:  * of a vnode blowout so we want to do this).  Therefore, this operation
  489:  * is not considered cheap.
  490:  *
  491:  * A number of conditions may prevent a vnode from being reclaimed.
   492:  * The buffer cache may have references on the vnode, a directory
  493:  * vnode may still have references due to the namei cache representing
  494:  * underlying files, or the vnode may be in active use.   It is not
   495:  * desirable to reuse such vnodes.  These conditions may cause the
  496:  * number of vnodes to reach some minimum value regardless of what
  497:  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  498:  */
  499: static int
  500: vlrureclaim(struct mount *mp)
  501: {
  502: 	struct vnode *vp;
  503: 	lwkt_tokref ilock;
  504: 	lwkt_tokref vlock;
  505: 	int done;
  506: 	int trigger;
  507: 	int usevnodes;
  508: 	int count;
  509: 
  510: 	/*
  511: 	 * Calculate the trigger point, don't allow user
  512: 	 * screwups to blow us up.   This prevents us from
  513: 	 * recycling vnodes with lots of resident pages.  We
  514: 	 * aren't trying to free memory, we are trying to
  515: 	 * free vnodes.
  516: 	 */
  517: 	usevnodes = desiredvnodes;
  518: 	if (usevnodes <= 0)
  519: 		usevnodes = 1;
  520: 	trigger = vmstats.v_page_count * 2 / usevnodes;
  521: 
  522: 	done = 0;
  523: 	lwkt_gettoken(&ilock, &mntvnode_token);
  524: 	count = mp->mnt_nvnodelistsize / 10 + 1;
  525: 	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
  526: 		/*
  527: 		 * __VNODESCAN__
  528: 		 *
  529: 		 * The VP will stick around while we hold mntvnode_token,
  530: 		 * at least until we block, so we can safely do an initial
  531: 		 * check.  But we have to check again after obtaining
  532: 		 * the vnode interlock.  vp->v_interlock points to stable
  533: 		 * storage so it's ok if the vp gets ripped out from
  534: 		 * under us while we are blocked.
  535: 		 */
  536: 		if (vp->v_type == VNON ||
  537: 		    vp->v_type == VBAD ||
  538: 		    !VMIGHTFREE(vp) ||		/* critical path opt */
  539: 		    (vp->v_object &&
  540: 		     vp->v_object->resident_page_count >= trigger)
  541: 		) {
  542: 			TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  543: 			TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist,vp, v_nmntvnodes);
  544: 			--count;
  545: 			continue;
  546: 		}
  547: 
  548: 		/*
  549: 		 * Get the interlock, delay moving the node to the tail so
  550: 		 * we don't race against new additions to the mountlist.
  551: 		 */
  552: 		lwkt_gettoken(&vlock, vp->v_interlock);
  553: 		if (TAILQ_FIRST(&mp->mnt_nvnodelist) != vp) {
  554: 			lwkt_reltoken(&vlock);
  555: 			continue;
  556: 		}
  557: 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  558: 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist,vp, v_nmntvnodes);
  559: 
  560: 		/*
  561: 		 * Must check again
  562: 		 */
  563: 		if (vp->v_type == VNON ||
  564: 		    vp->v_type == VBAD ||
  565: 		    !VMIGHTFREE(vp) ||		/* critical path opt */
  566: 		    (vp->v_object &&
  567: 		     vp->v_object->resident_page_count >= trigger)
  568: 		) {
  569: 			lwkt_reltoken(&vlock);
  570: 			--count;
  571: 			continue;
  572: 		}
  573: 		vgonel(vp, &vlock, curthread);
  574: 		++done;
  575: 		--count;
  576: 	}
  577: 	lwkt_reltoken(&ilock);
  578: 	return done;
  579: }
  580: 
  581: /*
  582:  * Attempt to recycle vnodes in a context that is always safe to block.
   583:  * Calling vlrureclaim() from the bowels of file system code has some
  584:  * interesting deadlock problems.
  585:  */
  586: static struct thread *vnlruthread;
  587: static int vnlruproc_sig;
  588: 
  589: static void 
  590: vnlru_proc(void)
  591: {
  592: 	struct mount *mp, *nmp;
  593: 	lwkt_tokref ilock;
  594: 	int s;
  595: 	int done;
  596: 	struct thread *td = curthread;
  597: 
  598: 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
  599: 	    SHUTDOWN_PRI_FIRST);   
  600: 
  601: 	s = splbio();
  602: 	for (;;) {
  603: 		kproc_suspend_loop();
  604: 		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
  605: 			vnlruproc_sig = 0;
  606: 			wakeup(&vnlruproc_sig);
  607: 			tsleep(td, 0, "vlruwt", hz);
  608: 			continue;
  609: 		}
  610: 		done = 0;
  611: 		lwkt_gettoken(&ilock, &mountlist_token);
  612: 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
  613: 			if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
  614: 				nmp = TAILQ_NEXT(mp, mnt_list);
  615: 				continue;
  616: 			}
  617: 			done += vlrureclaim(mp);
  618: 			lwkt_gettokref(&ilock);
  619: 			nmp = TAILQ_NEXT(mp, mnt_list);
  620: 			vfs_unbusy(mp, td);
  621: 		}
  622: 		lwkt_reltoken(&ilock);
  623: 		if (done == 0) {
  624: 			vnlru_nowhere++;
  625: 			tsleep(td, 0, "vlrup", hz * 3);
  626: 		}
  627: 	}
  628: 	splx(s);
  629: }
  630: 
  631: static struct kproc_desc vnlru_kp = {
  632: 	"vnlru",
  633: 	vnlru_proc,
  634: 	&vnlruthread
  635: };
  636: SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
  637: 
  638: /*
  639:  * Routines having to do with the management of the vnode table.
  640:  */
  641: extern vop_t **dead_vnodeop_p;
  642: 
  643: /*
  644:  * Return the next vnode from the free list.
  645:  */
  646: int
  647: getnewvnode(tag, mp, vops, vpp)
  648: 	enum vtagtype tag;
  649: 	struct mount *mp;
  650: 	vop_t **vops;
  651: 	struct vnode **vpp;
  652: {
  653: 	int s;
  654: 	struct thread *td = curthread;	/* XXX */
  655: 	struct vnode *vp = NULL;
  656: 	struct vnode *xvp;
  657: 	vm_object_t object;
  658: 	lwkt_tokref ilock;
  659: 	lwkt_tokref vlock;
  660: 
  661: 	s = splbio();
  662: 
  663: 	/*
  664: 	 * Try to reuse vnodes if we hit the max.  This situation only
  665: 	 * occurs in certain large-memory (2G+) situations.  We cannot
  666: 	 * attempt to directly reclaim vnodes due to nasty recursion
  667: 	 * problems.
  668: 	 */
  669: 	while (numvnodes - freevnodes > desiredvnodes) {
  670: 		if (vnlruproc_sig == 0) {
  671: 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
  672: 			wakeup(vnlruthread);
  673: 		}
  674: 		tsleep(&vnlruproc_sig, 0, "vlruwk", hz);
  675: 	}
  676: 
  677: 
  678: 	/*
  679: 	 * Attempt to reuse a vnode already on the free list, allocating
  680: 	 * a new vnode if we can't find one or if we have not reached a
  681: 	 * good minimum for good LRU performance.
  682: 	 */
  683: 	lwkt_gettoken(&ilock, &vnode_free_list_token);
  684: 	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
  685: 		int count;
  686: 
  687: 		for (count = 0; count < freevnodes; count++) {
  688: 			/*
  689: 			 * __VNODESCAN__
  690: 			 *
  691: 			 * Pull the next vnode off the free list and do some
  692: 			 * sanity checks.  Note that regardless of how we
  693: 			 * block, if freevnodes is non-zero there had better
  694: 			 * be something on the list.
  695: 			 */
  696: 			vp = TAILQ_FIRST(&vnode_free_list);
  697: 			if (vp == NULL)
  698: 				panic("getnewvnode: free vnode isn't");
  699: 
  700: 			/*
  701: 			 * Move the vnode to the end of the list so other
  702: 			 * processes do not double-block trying to recycle
  703: 			 * the same vnode (as an optimization), then get
  704: 			 * the interlock.
  705: 			 */
  706: 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
  707: 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
  708: 
  709: 			/*
  710: 			 * Skip vnodes that are in the process of being
  711: 			 * held or referenced.  Since the act of adding or
  712: 			 * removing a vnode on the freelist requires a token
  713: 			 * and may block, the ref count may be adjusted
  714: 			 * prior to its addition or removal.
  715: 			 */
  716: 			if (VSHOULDBUSY(vp)) {
  717: 				vp = NULL;
  718: 				continue;
  719: 			}
  720: 
  721: 
  722: 			/*
  723: 			 * Obtain the vnode interlock and check that the
  724: 			 * vnode is still on the free list.
  725: 			 *
  726: 			 * This normally devolves into a degenerate case so
  727: 			 * it is optimal.   Loop up if it isn't.  Note that
  728: 			 * the vnode could be in the middle of being moved
  729: 			 * off the free list (the VSHOULDBUSY() check) and
  730: 			 * must be skipped if so.
  731: 			 */
  732: 			lwkt_gettoken(&vlock, vp->v_interlock);
  733: 			TAILQ_FOREACH_REVERSE(xvp, &vnode_free_list, 
  734: 			    freelst, v_freelist) {
  735: 				if (vp == xvp)
  736: 					break;
  737: 			}
  738: 			if (vp != xvp || VSHOULDBUSY(vp)) {
  739: 				vp = NULL;
  740: 				continue;
  741: 			}
  742: 
  743: 			/*
  744: 			 * We now safely own the vnode.  If the vnode has
  745: 			 * an object do not recycle it if its VM object
  746: 			 * has resident pages or references.
  747: 			 */
  748: 			if ((VOP_GETVOBJECT(vp, &object) == 0 &&
  749: 			    (object->resident_page_count || object->ref_count))
  750: 			) {
  751: 				lwkt_reltoken(&vlock);
  752: 				vp = NULL;
  753: 				continue;
  754: 			}
  755: 
  756: 			/*
  757: 			 * We can almost reuse this vnode.  But we don't want
  758: 			 * to recycle it if the vnode has children in the
  759: 			 * namecache because that breaks the namecache's
  760: 			 * path element chain.  (YYY use nc_refs for the
  761: 			 * check?)
  762: 			 */
  763: 			KKASSERT(vp->v_flag & VFREE);
  764: 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
  765: 
  766: 			if (TAILQ_FIRST(&vp->v_namecache) == NULL ||
  767: 			    cache_leaf_test(vp) >= 0) {
  768: 				/* ok, we can reuse this vnode */
  769: 				break;
  770: 			}
  771: 			lwkt_reltoken(&vlock);
  772: 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
  773: 			vp = NULL;
  774: 		}
  775: 	}
  776: 
  777: 	/*
   778: 	 * If vp is non-NULL we hold its interlock.
  779: 	 */
  780: 	if (vp) {
  781: 		vp->v_flag |= VDOOMED;
  782: 		vp->v_flag &= ~VFREE;
  783: 		freevnodes--;
  784: 		lwkt_reltoken(&ilock);
  785: 		cache_purge(vp);	/* YYY may block */
  786: 		vp->v_lease = NULL;
  787: 		if (vp->v_type != VBAD) {
  788: 			vgonel(vp, &vlock, td);
  789: 		} else {
  790: 			lwkt_reltoken(&vlock);
  791: 		}
  792: 
  793: #ifdef INVARIANTS
  794: 		{
  795: 			int s;
  796: 
  797: 			if (vp->v_data)
  798: 				panic("cleaned vnode isn't");
  799: 			s = splbio();
  800: 			if (vp->v_numoutput)
  801: 				panic("Clean vnode has pending I/O's");
  802: 			splx(s);
  803: 		}
  804: #endif
  805: 		vp->v_flag = 0;
  806: 		vp->v_lastw = 0;
  807: 		vp->v_lasta = 0;
  808: 		vp->v_cstart = 0;
  809: 		vp->v_clen = 0;
  810: 		vp->v_socket = 0;
  811: 		vp->v_writecount = 0;	/* XXX */
  812: 	} else {
  813: 		lwkt_reltoken(&ilock);
  814: 		vp = zalloc(vnode_zone);
  815: 		bzero(vp, sizeof(*vp));
  816: 		vp->v_interlock = lwkt_token_pool_get(vp);
  817: 		lwkt_token_init(&vp->v_pollinfo.vpi_token);
  818: 		cache_purge(vp);
  819: 		TAILQ_INIT(&vp->v_namecache);
  820: 		numvnodes++;
  821: 	}
  822: 
  823: 	TAILQ_INIT(&vp->v_cleanblkhd);
  824: 	TAILQ_INIT(&vp->v_dirtyblkhd);
  825: 	vp->v_type = VNON;
  826: 	vp->v_tag = tag;
  827: 	vp->v_op = vops;
  828: 	insmntque(vp, mp);
  829: 	*vpp = vp;
  830: 	vp->v_usecount = 1;
  831: 	vp->v_data = 0;
  832: 	splx(s);
  833: 
  834: 	vfs_object_create(vp, td);
  835: 	return (0);
  836: }
  837: 
  838: /*
  839:  * Move a vnode from one mount queue to another.
  840:  */
  841: static void
  842: insmntque(vp, mp)
  843: 	struct vnode *vp;
  844: 	struct mount *mp;
  845: {
  846: 	lwkt_tokref ilock;
  847: 
  848: 	lwkt_gettoken(&ilock, &mntvnode_token);
  849: 	/*
  850: 	 * Delete from old mount point vnode list, if on one.
  851: 	 */
  852: 	if (vp->v_mount != NULL) {
  853: 		KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
  854: 			("bad mount point vnode list size"));
  855: 		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
  856: 		vp->v_mount->mnt_nvnodelistsize--;
  857: 	}
  858: 	/*
  859: 	 * Insert into list of vnodes for the new mount point, if available.
  860: 	 */
  861: 	if ((vp->v_mount = mp) == NULL) {
  862: 		lwkt_reltoken(&ilock);
  863: 		return;
  864: 	}
  865: 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  866: 	mp->mnt_nvnodelistsize++;
  867: 	lwkt_reltoken(&ilock);
  868: }
  869: 
  870: /*
  871:  * Update outstanding I/O count and do wakeup if requested.
  872:  */
  873: void
  874: vwakeup(bp)
  875: 	struct buf *bp;
  876: {
  877: 	struct vnode *vp;
  878: 
  879: 	bp->b_flags &= ~B_WRITEINPROG;
  880: 	if ((vp = bp->b_vp)) {
  881: 		vp->v_numoutput--;
  882: 		if (vp->v_numoutput < 0)
  883: 			panic("vwakeup: neg numoutput");
  884: 		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
  885: 			vp->v_flag &= ~VBWAIT;
  886: 			wakeup((caddr_t) &vp->v_numoutput);
  887: 		}
  888: 	}
  889: }
  890: 
  891: /*
  892:  * Flush out and invalidate all buffers associated with a vnode.
  893:  * Called with the underlying object locked.
  894:  */
  895: int
  896: vinvalbuf(struct vnode *vp, int flags, struct thread *td,
  897: 	int slpflag, int slptimeo)
  898: {
  899: 	struct buf *bp;
  900: 	struct buf *nbp, *blist;
  901: 	int s, error;
  902: 	vm_object_t object;
  903: 	lwkt_tokref vlock;
  904: 
  905: 	if (flags & V_SAVE) {
  906: 		s = splbio();
  907: 		while (vp->v_numoutput) {
  908: 			vp->v_flag |= VBWAIT;
  909: 			error = tsleep((caddr_t)&vp->v_numoutput,
  910: 			    slpflag, "vinvlbuf", slptimeo);
  911: 			if (error) {
  912: 				splx(s);
  913: 				return (error);
  914: 			}
  915: 		}
  916: 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
  917: 			splx(s);
  918: 			if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
  919: 				return (error);
  920: 			s = splbio();
  921: 			if (vp->v_numoutput > 0 ||
  922: 			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
  923: 				panic("vinvalbuf: dirty bufs");
  924: 		}
  925: 		splx(s);
  926:   	}
  927: 	s = splbio();
  928: 	for (;;) {
  929: 		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
  930: 		if (!blist)
  931: 			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
  932: 		if (!blist)
  933: 			break;
  934: 
  935: 		for (bp = blist; bp; bp = nbp) {
  936: 			nbp = TAILQ_NEXT(bp, b_vnbufs);
  937: 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
  938: 				error = BUF_TIMELOCK(bp,
  939: 				    LK_EXCLUSIVE | LK_SLEEPFAIL,
  940: 				    "vinvalbuf", slpflag, slptimeo);
  941: 				if (error == ENOLCK)
  942: 					break;
  943: 				splx(s);
  944: 				return (error);
  945: 			}
  946: 			/*
  947: 			 * XXX Since there are no node locks for NFS, I
  948: 			 * believe there is a slight chance that a delayed
  949: 			 * write will occur while sleeping just above, so
  950: 			 * check for it.  Note that vfs_bio_awrite expects
  951: 			 * buffers to reside on a queue, while VOP_BWRITE and
  952: 			 * brelse do not.
  953: 			 */
  954: 			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
  955: 				(flags & V_SAVE)) {
  956: 
  957: 				if (bp->b_vp == vp) {
  958: 					if (bp->b_flags & B_CLUSTEROK) {
  959: 						BUF_UNLOCK(bp);
  960: 						vfs_bio_awrite(bp);
  961: 					} else {
  962: 						bremfree(bp);
  963: 						bp->b_flags |= B_ASYNC;
  964: 						VOP_BWRITE(bp->b_vp, bp);
  965: 					}
  966: 				} else {
  967: 					bremfree(bp);
  968: 					(void) VOP_BWRITE(bp->b_vp, bp);
  969: 				}
  970: 				break;
  971: 			}
  972: 			bremfree(bp);
  973: 			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
  974: 			bp->b_flags &= ~B_ASYNC;
  975: 			brelse(bp);
  976: 		}
  977: 	}
  978: 
  979: 	/*
  980: 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
  981: 	 * have write I/O in-progress but if there is a VM object then the
  982: 	 * VM object can also have read-I/O in-progress.
  983: 	 */
  984: 	do {
  985: 		while (vp->v_numoutput > 0) {
  986: 			vp->v_flag |= VBWAIT;
  987: 			tsleep(&vp->v_numoutput, 0, "vnvlbv", 0);
  988: 		}
  989: 		if (VOP_GETVOBJECT(vp, &object) == 0) {
  990: 			while (object->paging_in_progress)
  991: 				vm_object_pip_sleep(object, "vnvlbx");
  992: 		}
  993: 	} while (vp->v_numoutput > 0);
  994: 
  995: 	splx(s);
  996: 
  997: 	/*
  998: 	 * Destroy the copy in the VM cache, too.
  999: 	 */
 1000: 	lwkt_gettoken(&vlock, vp->v_interlock);
 1001: 	if (VOP_GETVOBJECT(vp, &object) == 0) {
 1002: 		vm_object_page_remove(object, 0, 0,
 1003: 			(flags & V_SAVE) ? TRUE : FALSE);
 1004: 	}
 1005: 	lwkt_reltoken(&vlock);
 1006: 
 1007: 	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
 1008: 		panic("vinvalbuf: flush failed");
 1009: 	return (0);
 1010: }
 1011: 
 1012: /*
 1013:  * Truncate a file's buffer and pages to a specified length.  This
 1014:  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 1015:  * sync activity.
 1016:  */
 1017: int
 1018: vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize)
 1019: {
 1020: 	struct buf *bp;
 1021: 	struct buf *nbp;
 1022: 	int s, anyfreed;
 1023: 	int trunclbn;
 1024: 
 1025: 	/*
 1026: 	 * Round up to the *next* lbn.
 1027: 	 */
 1028: 	trunclbn = (length + blksize - 1) / blksize;
 1029: 
 1030: 	s = splbio();
 1031: restart:
 1032: 	anyfreed = 1;
 1033: 	for (;anyfreed;) {
 1034: 		anyfreed = 0;
 1035: 		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 1036: 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 1037: 			if (bp->b_lblkno >= trunclbn) {
 1038: 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 1039: 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 1040: 					goto restart;
 1041: 				} else {
 1042: 					bremfree(bp);
 1043: 					bp->b_flags |= (B_INVAL | B_RELBUF);
 1044: 					bp->b_flags &= ~B_ASYNC;
 1045: 					brelse(bp);
 1046: 					anyfreed = 1;
 1047: 				}
 1048: 				if (nbp &&
 1049: 				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 1050: 				    (nbp->b_vp != vp) ||
 1051: 				    (nbp->b_flags & B_DELWRI))) {
 1052: 					goto restart;
 1053: 				}
 1054: 			}
 1055: 		}
 1056: 
 1057: 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 1058: 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 1059: 			if (bp->b_lblkno >= trunclbn) {
 1060: 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 1061: 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 1062: 					goto restart;
 1063: 				} else {
 1064: 					bremfree(bp);
 1065: 					bp->b_flags |= (B_INVAL | B_RELBUF);
 1066: 					bp->b_flags &= ~B_ASYNC;
 1067: 					brelse(bp);
 1068: 					anyfreed = 1;
 1069: 				}
 1070: 				if (nbp &&
 1071: 				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 1072: 				    (nbp->b_vp != vp) ||
 1073: 				    (nbp->b_flags & B_DELWRI) == 0)) {
 1074: 					goto restart;
 1075: 				}
 1076: 			}
 1077: 		}
 1078: 	}
 1079: 
 1080: 	if (length > 0) {
 1081: restartsync:
 1082: 		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 1083: 			nbp = TAILQ_NEXT(bp, b_vnbufs);
 1084: 			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
 1085: 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 1086: 					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 1087: 					goto restart;
 1088: 				} else {
 1089: 					bremfree(bp);
 1090: 					if (bp->b_vp == vp) {
 1091: 						bp->b_flags |= B_ASYNC;
 1092: 					} else {
 1093: 						bp->b_flags &= ~B_ASYNC;
 1094: 					}
 1095: 					VOP_BWRITE(bp->b_vp, bp);
 1096: 				}
 1097: 				goto restartsync;
 1098: 			}
 1099: 
 1100: 		}
 1101: 	}
 1102: 
 1103: 	while (vp->v_numoutput > 0) {
 1104: 		vp->v_flag |= VBWAIT;
 1105: 		tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
 1106: 	}
 1107: 
 1108: 	splx(s);
 1109: 
 1110: 	vnode_pager_setsize(vp, length);
 1111: 
 1112: 	return (0);
 1113: }
 1114: 
 1115: /*
 1116:  * Associate a buffer with a vnode.
 1117:  */
 1118: void
 1119: bgetvp(vp, bp)
 1120: 	struct vnode *vp;
 1121: 	struct buf *bp;
 1122: {
 1123: 	int s;
 1124: 
 1125: 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
 1126: 
 1127: 	vhold(vp);
 1128: 	bp->b_vp = vp;
 1129: 	bp->b_dev = vn_todev(vp);
 1130: 	/*
 1131: 	 * Insert onto list for new vnode.
 1132: 	 */
 1133: 	s = splbio();
 1134: 	bp->b_xflags |= BX_VNCLEAN;
 1135: 	bp->b_xflags &= ~BX_VNDIRTY;
 1136: 	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
 1137: 	splx(s);
 1138: }
 1139: 
 1140: /*
 1141:  * Disassociate a buffer from a vnode.
 1142:  */
 1143: void
 1144: brelvp(bp)
 1145: 	struct buf *bp;
 1146: {
 1147: 	struct vnode *vp;
 1148: 	struct buflists *listheadp;
 1149: 	int s;
 1150: 
 1151: 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 1152: 
 1153: 	/*
 1154: 	 * Delete from old vnode list, if on one.
 1155: 	 */
 1156: 	vp = bp->b_vp;
 1157: 	s = splbio();
 1158: 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
 1159: 		if (bp->b_xflags & BX_VNDIRTY)
 1160: 			listheadp = &vp->v_dirtyblkhd;
 1161: 		else 
 1162: 			listheadp = &vp->v_cleanblkhd;
 1163: 		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
 1164: 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 1165: 	}
 1166: 	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 1167: 		vp->v_flag &= ~VONWORKLST;
 1168: 		LIST_REMOVE(vp, v_synclist);
 1169: 	}
 1170: 	splx(s);
 1171: 	bp->b_vp = (struct vnode *) 0;
 1172: 	vdrop(vp);
 1173: }
 1174: 
 1175: /*
 1176:  * The workitem queue.
 1177:  * 
 1178:  * It is useful to delay writes of file data and filesystem metadata
 1179:  * for tens of seconds so that quickly created and deleted files need
 1180:  * not waste disk bandwidth being created and removed. To realize this,
 1181:  * we append vnodes to a "workitem" queue. When running with a soft
 1182:  * updates implementation, most pending metadata dependencies should
 1183:  * not wait for more than a few seconds. Thus, mounted on block devices
 1184:  * are delayed only about a half the time that file data is delayed.
 1185:  * Similarly, directory updates are more critical, so are only delayed
 1186:  * about a third the time that file data is delayed. Thus, there are
 1187:  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 1188:  * one each second (driven off the filesystem syncer process). The
 1189:  * syncer_delayno variable indicates the next queue that is to be processed.
 1190:  * Items that need to be processed soon are placed in this queue:
 1191:  *
 1192:  *	syncer_workitem_pending[syncer_delayno]
 1193:  *
 1194:  * A delay of fifteen seconds is done by placing the request fifteen
 1195:  * entries later in the queue:
 1196:  *
 1197:  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 1198:  *
 1199:  */
 1200: 
 1201: /*
 1202:  * Add an item to the syncer work queue.
 1203:  */
 1204: static void
 1205: vn_syncer_add_to_worklist(struct vnode *vp, int delay)
 1206: {
 1207: 	int s, slot;
 1208: 
 1209: 	s = splbio();
 1210: 
 1211: 	if (vp->v_flag & VONWORKLST) {
 1212: 		LIST_REMOVE(vp, v_synclist);
 1213: 	}
 1214: 
 1215: 	if (delay > syncer_maxdelay - 2)
 1216: 		delay = syncer_maxdelay - 2;
 1217: 	slot = (syncer_delayno + delay) & syncer_mask;
 1218: 
 1219: 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
 1220: 	vp->v_flag |= VONWORKLST;
 1221: 	splx(s);
 1222: }
 1223: 
 1224: struct  thread *updatethread;
 1225: static void sched_sync (void);
 1226: static struct kproc_desc up_kp = {
 1227: 	"syncer",
 1228: 	sched_sync,
 1229: 	&updatethread
 1230: };
 1231: SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 1232: 
 1233: /*
 1234:  * System filesystem synchronizer daemon.
 1235:  */
 1236: void 
 1237: sched_sync(void)
 1238: {
 1239: 	struct synclist *slp;
 1240: 	struct vnode *vp;
 1241: 	long starttime;
 1242: 	int s;
 1243: 	struct thread *td = curthread;
 1244: 
 1245: 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
 1246: 	    SHUTDOWN_PRI_LAST);   
 1247: 
 1248: 	for (;;) {
 1249: 		kproc_suspend_loop();
 1250: 
 1251: 		starttime = time_second;
 1252: 
 1253: 		/*
 1254: 		 * Push files whose dirty time has expired.  Be careful
 1255: 		 * of interrupt race on slp queue.
 1256: 		 */
 1257: 		s = splbio();
 1258: 		slp = &syncer_workitem_pending[syncer_delayno];
 1259: 		syncer_delayno += 1;
 1260: 		if (syncer_delayno == syncer_maxdelay)
 1261: 			syncer_delayno = 0;
 1262: 		splx(s);
 1263: 
 1264: 		while ((vp = LIST_FIRST(slp)) != NULL) {
 1265: 			if (VOP_ISLOCKED(vp, NULL) == 0) {
 1266: 				vn_lock(vp, NULL, LK_EXCLUSIVE | LK_RETRY, td);
 1267: 				(void) VOP_FSYNC(vp, MNT_LAZY, td);
 1268: 				VOP_UNLOCK(vp, NULL, 0, td);
 1269: 			}
 1270: 			s = splbio();
 1271: 			if (LIST_FIRST(slp) == vp) {
 1272: 				/*
 1273: 				 * Note: v_tag VT_VFS vps can remain on the
 1274: 				 * worklist too with no dirty blocks, but 
 1275: 				 * since sync_fsync() moves it to a different 
 1276: 				 * slot we are safe.
 1277: 				 */
 1278: 				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
 1279: 				    !vn_isdisk(vp, NULL))
 1280: 					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
 1281: 				/*
 1282: 				 * Put us back on the worklist.  The worklist
 1283: 				 * routine will remove us from our current
 1284: 				 * position and then add us back in at a later
 1285: 				 * position.
 1286: 				 */
 1287: 				vn_syncer_add_to_worklist(vp, syncdelay);
 1288: 			}
 1289: 			splx(s);
 1290: 		}
 1291: 
 1292: 		/*
 1293: 		 * Do soft update processing.
 1294: 		 */
 1295: 		if (bioops.io_sync)
 1296: 			(*bioops.io_sync)(NULL);
 1297: 
 1298: 		/*
 1299: 		 * The variable rushjob allows the kernel to speed up the
 1300: 		 * processing of the filesystem syncer process. A rushjob
 1301: 		 * value of N tells the filesystem syncer to process the next
 1302: 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 1303: 		 * is used by the soft update code to speed up the filesystem
 1304: 		 * syncer process when the incore state is getting so far
 1305: 		 * ahead of the disk that the kernel memory pool is being
 1306: 		 * threatened with exhaustion.
 1307: 		 */
 1308: 		if (rushjob > 0) {
 1309: 			rushjob -= 1;
 1310: 			continue;
 1311: 		}
 1312: 		/*
 1313: 		 * If it has taken us less than a second to process the
 1314: 		 * current work, then wait. Otherwise start right over
 1315: 		 * again. We can still lose time if any single round
 1316: 		 * takes more than two seconds, but it does not really
 1317: 		 * matter as we are just trying to generally pace the
 1318: 		 * filesystem activity.
 1319: 		 */
 1320: 		if (time_second == starttime)
 1321: 			tsleep(&lbolt, 0, "syncer", 0);
 1322: 	}
 1323: }
 1324: 
 1325: /*
 1326:  * Request the syncer daemon to speed up its work.
 1327:  * We never push it to speed up more than half of its
 1328:  * normal turn time, otherwise it could take over the cpu.
 1329:  *
 1330:  * YYY wchan field protected by the BGL.
 1331:  */
 1332: int
 1333: speedup_syncer()
 1334: {
 1335: 	crit_enter();
 1336: 	if (updatethread->td_wchan == &lbolt) { /* YYY */
 1337: 		unsleep(updatethread);
 1338: 		lwkt_schedule(updatethread);
 1339: 	}
 1340: 	crit_exit();
 1341: 	if (rushjob < syncdelay / 2) {
 1342: 		rushjob += 1;
 1343: 		stat_rush_requests += 1;
 1344: 		return (1);
 1345: 	}
 1346: 	return(0);
 1347: }
 1348: 
 1349: /*
 1350:  * Associate a p-buffer with a vnode.
 1351:  *
 1352:  * Also sets B_PAGING flag to indicate that vnode is not fully associated
 1353:  * with the buffer.  i.e. the bp has not been linked into the vnode or
 1354:  * ref-counted.
 1355:  */
 1356: void
 1357: pbgetvp(vp, bp)
 1358: 	struct vnode *vp;
 1359: 	struct buf *bp;
 1360: {
 1361: 
 1362: 	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
 1363: 
 1364: 	bp->b_vp = vp;
 1365: 	bp->b_flags |= B_PAGING;
 1366: 	bp->b_dev = vn_todev(vp);
 1367: }
 1368: 
 1369: /*
 1370:  * Disassociate a p-buffer from a vnode.
 1371:  */
 1372: void
 1373: pbrelvp(bp)
 1374: 	struct buf *bp;
 1375: {
 1376: 
 1377: 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
 1378: 
 1379: 	/* XXX REMOVE ME */
 1380: 	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
 1381: 		panic(
 1382: 		    "relpbuf(): b_vp was probably reassignbuf()d %p %x", 
 1383: 		    bp,
 1384: 		    (int)bp->b_flags
 1385: 		);
 1386: 	}
 1387: 	bp->b_vp = (struct vnode *) 0;
 1388: 	bp->b_flags &= ~B_PAGING;
 1389: }
 1390: 
 1391: void
 1392: pbreassignbuf(bp, newvp)
 1393: 	struct buf *bp;
 1394: 	struct vnode *newvp;
 1395: {
 1396: 	if ((bp->b_flags & B_PAGING) == 0) {
 1397: 		panic(
 1398: 		    "pbreassignbuf() on non phys bp %p", 
 1399: 		    bp
 1400: 		);
 1401: 	}
 1402: 	bp->b_vp = newvp;
 1403: }
 1404: 
 1405: /*
 1406:  * Reassign a buffer from one vnode to another.
 1407:  * Used to assign file specific control information
 1408:  * (indirect blocks) to the vnode to which they belong.
 1409:  */
 1410: void
 1411: reassignbuf(bp, newvp)
 1412: 	struct buf *bp;
 1413: 	struct vnode *newvp;
 1414: {
 1415: 	struct buflists *listheadp;
 1416: 	int delay;
 1417: 	int s;
 1418: 
 1419: 	if (newvp == NULL) {
 1420: 		printf("reassignbuf: NULL");
 1421: 		return;
 1422: 	}
 1423: 	++reassignbufcalls;
 1424: 
 1425: 	/*
 1426: 	 * B_PAGING flagged buffers cannot be reassigned because their vp
 1427: 	 * is not fully linked in.
 1428: 	 */
 1429: 	if (bp->b_flags & B_PAGING)
 1430: 		panic("cannot reassign paging buffer");
 1431: 
 1432: 	s = splbio();
 1433: 	/*
 1434: 	 * Delete from old vnode list, if on one.
 1435: 	 */
 1436: 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
 1437: 		if (bp->b_xflags & BX_VNDIRTY)
 1438: 			listheadp = &bp->b_vp->v_dirtyblkhd;
 1439: 		else 
 1440: 			listheadp = &bp->b_vp->v_cleanblkhd;
 1441: 		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
 1442: 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 1443: 		if (bp->b_vp != newvp) {
 1444: 			vdrop(bp->b_vp);
 1445: 			bp->b_vp = NULL;	/* for clarification */
 1446: 		}
 1447: 	}
 1448: 	/*
 1449: 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 1450: 	 * of clean buffers.
 1451: 	 */
 1452: 	if (bp->b_flags & B_DELWRI) {
 1453: 		struct buf *tbp;
 1454: 
 1455: 		listheadp = &newvp->v_dirtyblkhd;
 1456: 		if ((newvp->v_flag & VONWORKLST) == 0) {
 1457: 			switch (newvp->v_type) {
 1458: 			case VDIR:
 1459: 				delay = dirdelay;
 1460: 				break;
 1461: 			case VCHR:
 1462: 			case VBLK:
 1463: 				if (newvp->v_rdev && 
 1464: 				    newvp->v_rdev->si_mountpoint != NULL) {
 1465: 					delay = metadelay;
 1466: 					break;
 1467: 				}
 1468: 				/* fall through */
 1469: 			default:
 1470: 				delay = filedelay;
 1471: 			}
 1472: 			vn_syncer_add_to_worklist(newvp, delay);
 1473: 		}
 1474: 		bp->b_xflags |= BX_VNDIRTY;
 1475: 		tbp = TAILQ_FIRST(listheadp);
 1476: 		if (tbp == NULL ||
 1477: 		    bp->b_lblkno == 0 ||
 1478: 		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
 1479: 		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
 1480: 			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
 1481: 			++reassignbufsortgood;
 1482: 		} else if (bp->b_lblkno < 0) {
 1483: 			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
 1484: 			++reassignbufsortgood;
 1485: 		} else if (reassignbufmethod == 1) {
 1486: 			/*
 1487: 			 * New sorting algorithm, only handle sequential case,
 1488: 			 * otherwise append to end (but before metadata)
 1489: 			 */
 1490: 			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
 1491: 			    (tbp->b_xflags & BX_VNDIRTY)) {
 1492: 				/*
 1493: 				 * Found the best place to insert the buffer
 1494: 				 */
 1495: 				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
 1496: 				++reassignbufsortgood;
 1497: 			} else {
 1498: 				/*
 1499: 				 * Missed, append to end, but before meta-data.
 1500: 				 * We know that the head buffer in the list is
 1501: 				 * not meta-data due to prior conditionals.
 1502: 				 *
 1503: 				 * Indirect effects:  NFS second stage write
 1504: 				 * tends to wind up here, giving maximum 
 1505: 				 * distance between the unstable write and the
 1506: 				 * commit rpc.
 1507: 				 */
 1508: 				tbp = TAILQ_LAST(listheadp, buflists);
 1509: 				while (tbp && tbp->b_lblkno < 0)
 1510: 					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
 1511: 				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
 1512: 				++reassignbufsortbad;
 1513: 			}
 1514: 		} else {
 1515: 			/*
 1516: 			 * Old sorting algorithm, scan queue and insert
 1517: 			 */
 1518: 			struct buf *ttbp;
 1519: 			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
 1520: 			    (ttbp->b_lblkno < bp->b_lblkno)) {
 1521: 				++reassignbufloops;
 1522: 				tbp = ttbp;
 1523: 			}
 1524: 			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
 1525: 		}
 1526: 	} else {
 1527: 		bp->b_xflags |= BX_VNCLEAN;
 1528: 		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
 1529: 		if ((newvp->v_flag & VONWORKLST) &&
 1530: 		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
 1531: 			newvp->v_flag &= ~VONWORKLST;
 1532: 			LIST_REMOVE(newvp, v_synclist);
 1533: 		}
 1534: 	}
 1535: 	if (bp->b_vp != newvp) {
 1536: 		bp->b_vp = newvp;
 1537: 		vhold(bp->b_vp);
 1538: 	}
 1539: 	splx(s);
 1540: }
 1541: 
 1542: /*
 1543:  * Create a vnode for a block device.
 1544:  * Used for mounting the root file system.
 1545:  */
 1546: int
 1547: bdevvp(dev_t dev, struct vnode **vpp)
 1548: {
 1549: 	struct vnode *vp;
 1550: 	struct vnode *nvp;
 1551: 	int error;
 1552: 
 1553: 	if (dev == NODEV) {
 1554: 		*vpp = NULLVP;
 1555: 		return (ENXIO);
 1556: 	}
 1557: 	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
 1558: 	if (error) {
 1559: 		*vpp = NULLVP;
 1560: 		return (error);
 1561: 	}
 1562: 	vp = nvp;
 1563: 	vp->v_type = VCHR;
 1564: 	vp->v_udev = dev->si_udev;
 1565: 	*vpp = vp;
 1566: 	return (0);
 1567: }
 1568: 
 1569: int
 1570: v_associate_rdev(struct vnode *vp, dev_t dev)
 1571: {
 1572: 	lwkt_tokref ilock;
 1573: 
 1574: 	if (dev == NULL || dev == NODEV)
 1575: 		return(ENXIO);
 1576: 	if (dev_is_good(dev) == 0)
 1577: 		return(ENXIO);
 1578: 	KKASSERT(vp->v_rdev == NULL);
 1579: 	if (dev_ref_debug)
 1580: 		printf("Z1");
 1581: 	vp->v_rdev = reference_dev(dev);
 1582: 	lwkt_gettoken(&ilock, &spechash_token);
 1583: 	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext);
 1584: 	lwkt_reltoken(&ilock);
 1585: 	return(0);
 1586: }
 1587: 
 1588: void
 1589: v_release_rdev(struct vnode *vp)
 1590: {
 1591: 	lwkt_tokref ilock;
 1592: 	dev_t dev;
 1593: 
 1594: 	if ((dev = vp->v_rdev) != NULL) {
 1595: 		lwkt_gettoken(&ilock, &spechash_token);
 1596: 		SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext);
 1597: 		if (dev_ref_debug)
 1598: 			printf("Y2");
 1599: 		vp->v_rdev = NULL;
 1600: 		release_dev(dev);
 1601: 		lwkt_reltoken(&ilock);
 1602: 	}
 1603: }
 1604: 
 1605: /*
 1606:  * Add a vnode to the alias list hung off the dev_t.  We only associate
 1607:  * the device number with the vnode.  The actual device is not associated
 1608:  * until the vnode is opened (usually in spec_open()), and will be 
 1609:  * disassociated on last close.
 1610:  */
 1611: void
 1612: addaliasu(struct vnode *nvp, udev_t nvp_udev)
 1613: {
 1614: 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
 1615: 		panic("addaliasu on non-special vnode");
 1616: 	nvp->v_udev = nvp_udev;
 1617: }
 1618: 
 1619: /*
 1620:  * Grab a particular vnode from the free list, increment its
 1621:  * reference count and lock it. The vnode lock bit is set if the
 1622:  * vnode is being eliminated in vgone. The process is awakened
 1623:  * when the transition is completed, and an error returned to
 1624:  * indicate that the vnode is no longer usable (possibly having
 1625:  * been changed to a new file system type).
 1626:  *
 1627:  * This code is very sensitive.  We are depending on the vnode interlock
 1628:  * to be maintained through to the vn_lock() call, which means that we
 1629:  * cannot block which means that we cannot call vbusy() until after vn_lock().
 1630:  * If the interlock is not maintained, the VXLOCK check will not properly
 1631:  * interlock against a vclean()'s LK_DRAIN operation on the lock.
 1632:  */
 1633: int
 1634: vget(struct vnode *vp, lwkt_tokref_t vlock, int flags, thread_t td)
 1635: {
 1636: 	int error;
 1637: 	lwkt_tokref vvlock;
 1638: 
 1639: 	/*
 1640: 	 * We need the interlock to safely modify the v_ fields.  ZZZ it is
 1641: 	 * only legal to pass (1) the vnode's own interlock, or (2) NULL
 1642: 	 * without LK_INTERLOCK if the vnode is *ALREADY* referenced or
 1643: 	 * held.
 1644: 	 */
 1645: 	if ((flags & LK_INTERLOCK) == 0) {
 1646: 		lwkt_gettoken(&vvlock, vp->v_interlock);
 1647: 		vlock = &vvlock;
 1648: 	}
 1649: 
 1650: 	/*
 1651: 	 * If the vnode is in the process of being cleaned out for
 1652: 	 * another use, we wait for the cleaning to finish and then
 1653: 	 * return failure. Cleaning is determined by checking that
 1654: 	 * the VXLOCK flag is set.  It is possible for the vnode to be
 1655: 	 * self-referenced during the cleaning operation.
 1656: 	 */
 1657: 	if (vp->v_flag & VXLOCK) {
 1658: 		if (vp->v_vxthread == curthread) {
 1659: #if 0
 1660: 			/* this can now occur in normal operation */
 1661: 			log(LOG_INFO, "VXLOCK interlock avoided\n");
 1662: #endif
 1663: 		} else {
 1664: 			vp->v_flag |= VXWANT;
 1665: 			lwkt_reltoken(vlock);
 1666: 			tsleep((caddr_t)vp, 0, "vget", 0);
 1667: 			return (ENOENT);
 1668: 		}
 1669: 	}
 1670: 
 1671: 	/*
 1672: 	 * Bump v_usecount to prevent the vnode from being recycled.  The
 1673: 	 * usecount needs to be bumped before we successfully get our lock.
 1674: 	 */
 1675: 	vp->v_usecount++;
 1676: 	if (flags & LK_TYPE_MASK) {
 1677: 		if ((error = vn_lock(vp, vlock, flags | LK_INTERLOCK, td)) != 0) {
 1678: 			/*
 1679: 			 * must expand vrele here because we do not want
 1680: 			 * to call VOP_INACTIVE if the reference count
 1681: 			 * drops back to zero since it was never really
 1682: 			 * active. We must remove it from the free list
 1683: 			 * before sleeping so that multiple processes do
 1684: 			 * not try to recycle it.
 1685: 			 */
 1686: 			lwkt_gettokref(vlock);
 1687: 			vp->v_usecount--;
 1688: 			vmaybefree(vp);
 1689: 			lwkt_reltoken(vlock);
 1690: 		}
 1691: 		return (error);
 1692: 	}
 1693: 	if (VSHOULDBUSY(vp))
 1694: 		vbusy(vp);	/* interlock must be held on call */
 1695: 	lwkt_reltoken(vlock);
 1696: 	return (0);
 1697: }
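
/*
 * Illustrative sketch (not compiled): the usual calling pattern is to
 * vget() with a lock type, operate on the vnode, then vput() to drop
 * the lock and the reference together.  Per the interlock rules above,
 * passing a NULL token without LK_INTERLOCK assumes 'vp' is already
 * referenced or held.
 */
#if 0
	if (vget(vp, NULL, LK_EXCLUSIVE, curthread) == 0) {
		/* ... vp is referenced and exclusively locked ... */
		vput(vp);
	}
#endif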
 1698: 
 1699: void
 1700: vref(struct vnode *vp)
 1701: {
 1702: 	crit_enter();	/* YYY use crit section for moment / BGL protected */
 1703: 	vp->v_usecount++;
 1704: 	crit_exit();
 1705: }
 1706: 
 1707: /*
 1708:  * Vnode put/release.
 1709:  * If count drops to zero, call inactive routine and return to freelist.
 1710:  */
 1711: void
 1712: vrele(struct vnode *vp)
 1713: {
 1714: 	struct thread *td = curthread;	/* XXX */
 1715: 	lwkt_tokref vlock;
 1716: 
 1717: 	KASSERT(vp != NULL && vp->v_usecount >= 0,
 1718: 	    ("vrele: null vp or negative v_usecount"));
 1719: 
 1720: 	lwkt_gettoken(&vlock, vp->v_interlock);
 1721: 
 1722: 	if (vp->v_usecount > 1) {
 1723: 		vp->v_usecount--;
 1724: 		lwkt_reltoken(&vlock);
 1725: 		return;
 1726: 	}
 1727: 
 1728: 	if (vp->v_usecount == 1) {
 1729: 		vp->v_usecount--;
 1730: 		/*
 1731: 		 * We must call VOP_INACTIVE with the node locked and the
 1732: 		 * usecount 0.  If we are doing a vput, the node is already
 1733: 		 * locked, but, in the case of vrele, we must explicitly lock
 1734: 		 * the vnode before calling VOP_INACTIVE.
 1735: 		 */
 1736: 
 1737: 		if (vn_lock(vp, NULL, LK_EXCLUSIVE, td) == 0)
 1738: 			VOP_INACTIVE(vp, td);
 1739: 		vmaybefree(vp);
 1740: 		lwkt_reltoken(&vlock);
 1741: 	} else {
 1742: #ifdef DIAGNOSTIC
 1743: 		vprint("vrele: negative ref count", vp);
 1744: #endif
 1745: 		lwkt_reltoken(&vlock);
 1746: 		panic("vrele: negative ref cnt");
 1747: 	}
 1748: }
 1749: 
 1750: void
 1751: vput(struct vnode *vp)
 1752: {
 1753: 	struct thread *td = curthread;	/* XXX */
 1754: 	lwkt_tokref vlock;
 1755: 
 1756: 	KASSERT(vp != NULL, ("vput: null vp"));
 1757: 
 1758: 	lwkt_gettoken(&vlock, vp->v_interlock);
 1759: 
 1760: 	if (vp->v_usecount > 1) {
 1761: 		vp->v_usecount--;
 1762: 		VOP_UNLOCK(vp, &vlock, LK_INTERLOCK, td);
 1763: 		return;
 1764: 	}
 1765: 
 1766: 	if (vp->v_usecount == 1) {
 1767: 		vp->v_usecount--;
 1768: 		/*
 1769: 		 * We must call VOP_INACTIVE with the node locked.
 1770: 		 * If we are doing a vput, the node is already locked,
 1771: 		 * so we just need to release the vnode mutex.
 1772: 		 */
 1773: 		VOP_INACTIVE(vp, td);
 1774: 		vmaybefree(vp);
 1775: 		lwkt_reltoken(&vlock);
 1776: 	} else {
 1777: #ifdef DIAGNOSTIC
 1778: 		vprint("vput: negative ref count", vp);
 1779: #endif
 1780: 		lwkt_reltoken(&vlock);
 1781: 		panic("vput: negative ref cnt");
 1782: 	}
 1783: }
 1784: 
 1785: /*
 1786:  * Somebody doesn't want the vnode recycled. ZZZ vnode interlock should
 1787:  * be held but isn't.
 1788:  */
 1789: void
 1790: vhold(vp)
 1791: 	struct vnode *vp;
 1792: {
 1793: 	int s;
 1794: 
 1795:   	s = splbio();
 1796: 	vp->v_holdcnt++;
 1797: 	if (VSHOULDBUSY(vp))
 1798: 		vbusy(vp);	/* interlock must be held on call */
 1799: 	splx(s);
 1800: }
 1801: 
 1802: /*
 1803:  * One less who cares about this vnode.
 1804:  */
 1805: void
 1806: vdrop(vp)
 1807: 	struct vnode *vp;
 1808: {
 1809: 	lwkt_tokref vlock;
 1810: 
 1811: 	lwkt_gettoken(&vlock, vp->v_interlock);
 1812: 	if (vp->v_holdcnt <= 0)
 1813: 		panic("vdrop: holdcnt");
 1814: 	vp->v_holdcnt--;
 1815: 	vmaybefree(vp);
 1816: 	lwkt_reltoken(&vlock);
 1817: }
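
/*
 * Illustrative sketch (not compiled): vhold()/vdrop() bracket a region
 * in which a vnode must not be recycled even though no use reference
 * is taken, e.g. while a buffer still points at it (as in the buffer
 * reassignment code above).
 */
#if 0
	vhold(vp);
	/* ... vp cannot be freed while the hold count is non-zero ... */
	vdrop(vp);
#endif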
 1818: 
 1819: int
 1820: vmntvnodescan(
 1821:     struct mount *mp, 
 1822:     int (*fastfunc)(struct mount *mp, struct vnode *vp, void *data),
 1823:     int (*slowfunc)(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data),
 1824:     void *data
 1825: ) {
 1826: 	lwkt_tokref ilock;
 1827: 	lwkt_tokref vlock;
 1828: 	struct vnode *pvp;
 1829: 	struct vnode *vp;
 1830: 	int r = 0;
 1831: 
 1832: 	/*
 1833: 	 * Scan the vnodes on the mount's vnode list using a placemarker.
 1834: 	 */
 1835: 	pvp = zalloc(vnode_zone);
 1836: 	pvp->v_flag |= VPLACEMARKER;
 1837: 
 1838: 	lwkt_gettoken(&ilock, &mntvnode_token);
 1839: 	TAILQ_INSERT_HEAD(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
 1840: 
 1841: 	while ((vp = TAILQ_NEXT(pvp, v_nmntvnodes)) != NULL) {
 1842: 		/*
 1843: 		 * Move the placemarker and skip other placemarkers we
 1844: 		 * encounter.  Nothing can get in our way, so the
 1845: 		 * mount point on the vp must be valid.
 1846: 		 */
 1847: 		TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
 1848: 		TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, pvp, v_nmntvnodes);
 1849: 		if (vp->v_flag & VPLACEMARKER)
 1850: 			continue;
 1851: 		KKASSERT(vp->v_mount == mp);
 1852: 
 1853: 		/*
 1854: 		 * Quick test
 1855: 		 */
 1856: 		if (fastfunc) {
 1857: 			if ((r = fastfunc(mp, vp, data)) < 0)
 1858: 				continue;
 1859: 			if (r)
 1860: 				break;
 1861: 		}
 1862: 
 1863: 		/*
 1864: 		 * Get the vnode's interlock and make sure it is still on the
 1865: 		 * mount list.  Skip it if it has moved (we may encounter it
 1866: 		 * later).  Then do the with-interlock test.  The callback
 1867: 		 * is responsible for releasing the vnode interlock.
 1868: 		 *
 1869: 		 * The interlock is type-stable.
 1870: 		 */
 1871: 		if (slowfunc) {
 1872: 			lwkt_gettoken(&vlock, vp->v_interlock);
 1873: 			if (vp != TAILQ_PREV(pvp, vnodelst, v_nmntvnodes)) {
 1874: 				printf("vmntvnodescan (debug info only): f=%p vp=%p vnode ripped out from under us\n", slowfunc, vp);
 1875: 				lwkt_reltoken(&vlock);
 1876: 				continue;
 1877: 			}
 1878: 			if ((r = slowfunc(mp, vp, &vlock, data)) != 0) {
 1879: 				KKASSERT(lwkt_havetokref(&vlock) == 0);
 1880: 				break;
 1881: 			}
 1882: 			KKASSERT(lwkt_havetokref(&vlock) == 0);
 1883: 		}
 1884: 	}
 1885: 	TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
 1886: 	zfree(vnode_zone, pvp);
 1887: 	lwkt_reltoken(&ilock);
 1888: 	return(r);
 1889: }
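
/*
 * Illustrative sketch (not compiled): a vmntvnodescan() consumer
 * supplies an unlocked pre-filter and an interlocked worker (see
 * vflush() and vfs_msync() below for real callers).  A negative return
 * from the pre-filter skips the vnode, a non-zero positive return from
 * either callback aborts the scan, and the worker must release the
 * vnode interlock before returning.  The names below are hypothetical.
 */
#if 0
static int
example_scan_fast(struct mount *mp, struct vnode *vp, void *data)
{
	return ((vp->v_flag & VOBJDIRTY) ? 0 : -1);	/* -1 == skip */
}

static int
example_scan_slow(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock,
		  void *data)
{
	/* ... interlocked work on vp ... */
	lwkt_reltoken(vlock);	/* the callback must release the interlock */
	return (0);
}

/* usage: vmntvnodescan(mp, example_scan_fast, example_scan_slow, NULL); */
#endif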
 1890: 
 1891: /*
 1892:  * Remove any vnodes in the vnode table belonging to mount point mp.
 1893:  *
 1894:  * If FORCECLOSE is not specified, there should not be any active ones,
 1895:  * return error if any are found (nb: this is a user error, not a
 1896:  * system error). If FORCECLOSE is specified, detach any active vnodes
 1897:  * that are found.
 1898:  *
 1899:  * If WRITECLOSE is set, only flush out regular file vnodes open for
 1900:  * writing.
 1901:  *
 1902:  * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 1903:  *
 1904:  * `rootrefs' specifies the base reference count for the root vnode
 1905:  * of this filesystem. The root vnode is considered busy if its
 1906:  * v_usecount exceeds this value. On a successful return, vflush()
 1907:  * will call vrele() on the root vnode exactly rootrefs times.
 1908:  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 1909:  * be zero.
 1910:  */
 1911: #ifdef DIAGNOSTIC
 1912: static int busyprt = 0;		/* print out busy vnodes */
 1913: SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 1914: #endif
 1915: 
 1916: static int vflush_scan(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data);
 1917: 
 1918: struct vflush_info {
 1919: 	int flags;
 1920: 	int busy;
 1921: 	thread_t td;
 1922: };
 1923: 
 1924: int
 1925: vflush(mp, rootrefs, flags)
 1926: 	struct mount *mp;
 1927: 	int rootrefs;
 1928: 	int flags;
 1929: {
 1930: 	struct thread *td = curthread;	/* XXX */
 1931: 	struct vnode *rootvp = NULL;
 1932: 	int error;
 1933: 	lwkt_tokref vlock;
 1934: 	struct vflush_info vflush_info;
 1935: 
 1936: 	if (rootrefs > 0) {
 1937: 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 1938: 		    ("vflush: bad args"));
 1939: 		/*
 1940: 		 * Get the filesystem root vnode. We can vput() it
 1941: 		 * immediately, since with rootrefs > 0, it won't go away.
 1942: 		 */
 1943: 		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
 1944: 			return (error);
 1945: 		vput(rootvp);
 1946: 	}
 1947: 
 1948: 	vflush_info.busy = 0;
 1949: 	vflush_info.flags = flags;
 1950: 	vflush_info.td = td;
 1951: 	vmntvnodescan(mp, NULL, vflush_scan, &vflush_info);
 1952: 
 1953: 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 1954: 		/*
 1955: 		 * If just the root vnode is busy, and if its refcount
 1956: 		 * is equal to `rootrefs', then go ahead and kill it.
 1957: 		 */
 1958: 		lwkt_gettoken(&vlock, rootvp->v_interlock);
 1959: 		KASSERT(vflush_info.busy > 0, ("vflush: not busy"));
 1960: 		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
 1961: 		if (vflush_info.busy == 1 && rootvp->v_usecount == rootrefs) {
 1962: 			vgonel(rootvp, &vlock, td);
 1963: 			vflush_info.busy = 0;
 1964: 		} else {
 1965: 			lwkt_reltoken(&vlock);
 1966: 		}
 1967: 	}
 1968: 	if (vflush_info.busy)
 1969: 		return (EBUSY);
 1970: 	for (; rootrefs > 0; rootrefs--)
 1971: 		vrele(rootvp);
 1972: 	return (0);
 1973: }
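
/*
 * Illustrative sketch (not compiled): a filesystem's unmount routine
 * would normally call vflush() along these lines, forcing the flush
 * only for forced unmounts.  The 'mntflags' name is an assumption.
 */
#if 0
	int flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	/* rootrefs == 0: no extra references are held on the root vnode */
	if ((error = vflush(mp, 0, flags)) != 0)
		return (error);
#endif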
 1974: 
 1975: /*
 1976:  * The scan callback is made with an interlocked vnode.
 1977:  */
 1978: static int
 1979: vflush_scan(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data)
 1980: {
 1981: 	struct vflush_info *info = data;
 1982: 	struct vattr vattr;
 1983: 
 1984: 	/*
 1985: 	 * Skip over vnodes marked VSYSTEM.
 1986: 	 */
 1987: 	if ((info->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
 1988: 		lwkt_reltoken(vlock);
 1989: 		return(0);
 1990: 	}
 1991: 
 1992: 	/*
 1993: 	 * If WRITECLOSE is set, flush out unlinked but still open
 1994: 	 * files (even if open only for reading) and regular file
 1995: 	 * vnodes open for writing. 
 1996: 	 */
 1997: 	if ((info->flags & WRITECLOSE) &&
 1998: 	    (vp->v_type == VNON ||
 1999: 	    (VOP_GETATTR(vp, &vattr, info->td) == 0 &&
 2000: 	    vattr.va_nlink > 0)) &&
 2001: 	    (vp->v_writecount == 0 || vp->v_type != VREG)) {
 2002: 		lwkt_reltoken(vlock);
 2003: 		return(0);
 2004: 	}
 2005: 
 2006: 	/*
 2007: 	 * With v_usecount == 0, all we need to do is clear out the
 2008: 	 * vnode data structures and we are done.
 2009: 	 */
 2010: 	if (vp->v_usecount == 0) {
 2011: 		vgonel(vp, vlock, info->td);
 2012: 		return(0);
 2013: 	}
 2014: 
 2015: 	/*
 2016: 	 * If FORCECLOSE is set, forcibly close the vnode. For block
 2017: 	 * or character devices, revert to an anonymous device. For
 2018: 	 * all other files, just kill them.
 2019: 	 */
 2020: 	if (info->flags & FORCECLOSE) {
 2021: 		if (vp->v_type != VBLK && vp->v_type != VCHR) {
 2022: 			vgonel(vp, vlock, info->td);
 2023: 		} else {
 2024: 			vclean(vp, vlock, 0, info->td);
 2025: 			vp->v_op = spec_vnodeop_p;
 2026: 			insmntque(vp, (struct mount *) 0);
 2027: 		}
 2028: 		return(0);
 2029: 	}
 2030: #ifdef DIAGNOSTIC
 2031: 	if (busyprt)
 2032: 		vprint("vflush: busy vnode", vp);
 2033: #endif
 2034: 	lwkt_reltoken(vlock);
 2035: 	++info->busy;
 2036: 	return(0);
 2037: }
 2038: 
 2039: /*
 2040:  * Disassociate the underlying file system from a vnode.
 2041:  */
 2042: static void
 2043: vclean(struct vnode *vp, lwkt_tokref_t vlock, int flags, struct thread *td)
 2044: {
 2045: 	int active;
 2046: 
 2047: 	/*
 2048: 	 * Check to see if the vnode is in use. If so we have to reference it
 2049: 	 * before we clean it out so that its count cannot fall to zero and
 2050: 	 * generate a race against ourselves to recycle it.
 2051: 	 */
 2052: 	if ((active = vp->v_usecount))
 2053: 		vp->v_usecount++;
 2054: 
 2055: 	/*
 2056: 	 * Prevent the vnode from being recycled or brought into use while we
 2057: 	 * clean it out.
 2058: 	 */
 2059: 	if (vp->v_flag & VXLOCK)
 2060: 		panic("vclean: deadlock");
 2061: 	vp->v_flag |= VXLOCK;
 2062: 	vp->v_vxthread = curthread;
 2063: 
 2064: 	/*
 2065: 	 * Even if the count is zero, the VOP_INACTIVE routine may still
 2066: 	 * have the object locked while it cleans it out. The VOP_LOCK
 2067: 	 * ensures that the VOP_INACTIVE routine is done with its work.
 2068: 	 * For active vnodes, it ensures that no other activity can
 2069: 	 * occur while the underlying object is being cleaned out.
 2070: 	 *
 2071: 	 * NOTE: we continue to hold the vnode interlock through to the
 2072: 	 * end of vclean().
 2073: 	 */
 2074: 	VOP_LOCK(vp, NULL, LK_DRAIN, td);
 2075: 
 2076: 	/*
 2077: 	 * Clean out any buffers associated with the vnode.
 2078: 	 */
 2079: 	vinvalbuf(vp, V_SAVE, td, 0, 0);
 2080: 	VOP_DESTROYVOBJECT(vp);
 2081: 
 2082: 	/*
 2083: 	 * If purging an active vnode, it must be closed and
 2084: 	 * deactivated before being reclaimed. Note that the
 2085: 	 * VOP_INACTIVE will unlock the vnode.
 2086: 	 */
 2087: 	if (active) {
 2088: 		if (flags & DOCLOSE)
 2089: 			VOP_CLOSE(vp, FNONBLOCK, td);
 2090: 		VOP_INACTIVE(vp, td);
 2091: 	} else {
 2092: 		/*
 2093: 		 * Any other processes trying to obtain this lock must first
 2094: 		 * wait for VXLOCK to clear, then call the new lock operation.
 2095: 		 */
 2096: 		VOP_UNLOCK(vp, NULL, 0, td);
 2097: 	}
 2098: 	/*
 2099: 	 * Reclaim the vnode.
 2100: 	 */
 2101: 	if (VOP_RECLAIM(vp, td))
 2102: 		panic("vclean: cannot reclaim");
 2103: 
 2104: 	if (active) {
 2105: 		/*
 2106: 		 * Inline copy of vrele() since VOP_INACTIVE
 2107: 		 * has already been called.
 2108: 		 */
 2109: 		if (--vp->v_usecount <= 0) {
 2110: #ifdef DIAGNOSTIC
 2111: 			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
 2112: 				vprint("vclean: bad ref count", vp);
 2113: 				panic("vclean: ref cnt");
 2114: 			}
 2115: #endif
 2116: 			vfree(vp);
 2117: 		}
 2118: 	}
 2119: 
 2120: 	cache_purge(vp);
 2121: 	vp->v_vnlock = NULL;
 2122: 	vmaybefree(vp);
 2123: 	
 2124: 	/*
 2125: 	 * Done with purge, notify sleepers of the grim news.
 2126: 	 */
 2127: 	vp->v_op = dead_vnodeop_p;
 2128: 	vn_pollgone(vp);
 2129: 	vp->v_tag = VT_NON;
 2130: 	vp->v_flag &= ~VXLOCK;
 2131: 	vp->v_vxthread = NULL;
 2132: 	if (vp->v_flag & VXWANT) {
 2133: 		vp->v_flag &= ~VXWANT;
 2134: 		wakeup((caddr_t) vp);
 2135: 	}
 2136: 	lwkt_reltoken(vlock);
 2137: }
 2138: 
 2139: /*
 2140:  * Eliminate all activity associated with the requested vnode
 2141:  * and with all vnodes aliased to the requested vnode.
 2142:  */
 2143: int
 2144: vop_revoke(ap)
 2145: 	struct vop_revoke_args /* {
 2146: 		struct vnode *a_vp;
 2147: 		int a_flags;
 2148: 	} */ *ap;
 2149: {
 2150: 	struct vnode *vp, *vq;
 2151: 	lwkt_tokref ilock;
 2152: 	dev_t dev;
 2153: 
 2154: 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
 2155: 
 2156: 	vp = ap->a_vp;
 2157: 	/*
 2158: 	 * If a vgone (or vclean) is already in progress,
 2159: 	 * wait until it is done and return.
 2160: 	 */
 2161: 	if (vp->v_flag & VXLOCK) {
 2162: 		vp->v_flag |= VXWANT;
 2163: 		/*lwkt_reltoken(vlock); ZZZ */
 2164: 		tsleep((caddr_t)vp, 0, "vop_revokeall", 0);
 2165: 		return (0);
 2166: 	}
 2167: 
 2168: 	/*
 2169: 	 * If the vnode has a device association, scrap all vnodes associated
 2170: 	 * with the device.  Don't let the device disappear on us while we
 2171: 	 * are scrapping the vnodes.
 2172: 	 */
 2173: 	if (vp->v_type != VCHR && vp->v_type != VBLK)
 2174: 		return(0);
 2175: 	if ((dev = vp->v_rdev) == NULL) {
 2176: 		if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV)
 2177: 			return(0);
 2178: 	}
 2179: 	reference_dev(dev);
 2180: 	for (;;) {
 2181: 		lwkt_gettoken(&ilock, &spechash_token);
 2182: 		vq = SLIST_FIRST(&dev->si_hlist);
 2183: 		lwkt_reltoken(&ilock);
 2184: 		if (vq == NULL)
 2185: 			break;
 2186: 		vgone(vq);
 2187: 	}
 2188: 	release_dev(dev);
 2189: 	return (0);
 2190: }
 2191: 
 2192: /*
 2193:  * Recycle an unused vnode to the front of the free list.
 2194:  * Release the passed interlock if the vnode will be recycled.
 2195:  */
 2196: int
 2197: vrecycle(struct vnode *vp, lwkt_tokref_t inter_lkp, struct thread *td)
 2198: {
 2199: 	lwkt_tokref vlock;
 2200: 
 2201: 	lwkt_gettoken(&vlock, vp->v_interlock);
 2202: 	if (vp->v_usecount == 0) {
 2203: 		if (inter_lkp)
 2204: 			lwkt_reltoken(inter_lkp);
 2205: 		vgonel(vp, &vlock, td);
 2206: 		return (1);
 2207: 	}
 2208: 	lwkt_reltoken(&vlock);
 2209: 	return (0);
 2210: }
 2211: 
 2212: /*
 2213:  * Eliminate all activity associated with a vnode
 2214:  * in preparation for reuse.
 2215:  */
 2216: void
 2217: vgone(struct vnode *vp)
 2218: {
 2219: 	struct thread *td = curthread;	/* XXX */
 2220: 	lwkt_tokref vlock;
 2221: 
 2222: 	lwkt_gettoken(&vlock, vp->v_interlock);
 2223: 	vgonel(vp, &vlock, td);
 2224: }
 2225: 
 2226: /*
 2227:  * vgone, with the vp interlock held.
 2228:  */
 2229: void
 2230: vgonel(struct vnode *vp, lwkt_tokref_t vlock, struct thread *td)
 2231: {
 2232: 	lwkt_tokref ilock;
 2233: 	int s;
 2234: 
 2235: 	/*
 2236: 	 * If a vgone (or vclean) is already in progress,
 2237: 	 * wait until it is done and return.
 2238: 	 */
 2239: 	if (vp->v_flag & VXLOCK) {
 2240: 		vp->v_flag |= VXWANT;
 2241: 		lwkt_reltoken(vlock);
 2242: 		tsleep((caddr_t)vp, 0, "vgone", 0);
 2243: 		return;
 2244: 	}
 2245: 
 2246: 	/*
 2247: 	 * Clean out the filesystem specific data.
 2248: 	 */
 2249: 	vclean(vp, vlock, DOCLOSE, td);
 2250: 	lwkt_gettokref(vlock);
 2251: 
 2252: 	/*
 2253: 	 * Delete from old mount point vnode list, if on one.
 2254: 	 */
 2255: 	if (vp->v_mount != NULL)
 2256: 		insmntque(vp, (struct mount *)0);
 2257: 	/*
 2258: 	 * If special device, remove it from special device alias list
 2259: 	 * if it is on one.
 2260: 	 */
 2261: 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
 2262: 		v_release_rdev(vp);
 2263: 	}
 2264: 
 2265: 	/*
 2266: 	 * If it is on the freelist and not already at the head,
 2267: 	 * move it to the head of the list. The test of the
 2268: 	 * VDOOMED flag and the reference count of zero is because
 2269: 	 * it will be removed from the free list by getnewvnode,
 2270: 	 * but will not have its reference count incremented until
 2271: 	 * after calling vgone. If the reference count were
 2272: 	 * incremented first, vgone would (incorrectly) try to
 2273: 	 * close the previous instance of the underlying object.
 2274: 	 */
 2275: 	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
 2276: 		s = splbio();
 2277: 		lwkt_gettoken(&ilock, &vnode_free_list_token);
 2278: 		if (vp->v_flag & VFREE)
 2279: 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 2280: 		else
 2281: 			freevnodes++;
 2282: 		vp->v_flag |= VFREE;
 2283: 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 2284: 		lwkt_reltoken(&ilock);
 2285: 		splx(s);
 2286: 	}
 2287: 	vp->v_type = VBAD;
 2288: 	lwkt_reltoken(vlock);
 2289: }
 2290: 
 2291: /*
 2292:  * Lookup a vnode by device number.
 2293:  */
 2294: int
 2295: vfinddev(dev, type, vpp)
 2296: 	dev_t dev;
 2297: 	enum vtype type;
 2298: 	struct vnode **vpp;
 2299: {
 2300: 	lwkt_tokref ilock;
 2301: 	struct vnode *vp;
 2302: 
 2303: 	lwkt_gettoken(&ilock, &spechash_token);
 2304: 	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
 2305: 		if (type == vp->v_type) {
 2306: 			*vpp = vp;
 2307: 			lwkt_reltoken(&ilock);
 2308: 			return (1);
 2309: 		}
 2310: 	}
 2311: 	lwkt_reltoken(&ilock);
 2312: 	return (0);
 2313: }
 2314: 
 2315: /*
 2316:  * Calculate the total number of references to a special device.  This
 2317:  * routine may only be called for VBLK and VCHR vnodes since v_rdev is
 2318:  * an overloaded field.  Since udev2dev can now return NODEV, we have
 2319:  * to check for a NULL v_rdev.
 2320:  */
 2321: int
 2322: count_dev(dev_t dev)
 2323: {
 2324: 	lwkt_tokref ilock;
 2325: 	struct vnode *vp;
 2326: 	int count = 0;
 2327: 
 2328: 	if (SLIST_FIRST(&dev->si_hlist)) {
 2329: 		lwkt_gettoken(&ilock, &spechash_token);
 2330: 		SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
 2331: 			count += vp->v_usecount;
 2332: 		}
 2333: 		lwkt_reltoken(&ilock);
 2334: 	}
 2335: 	return(count);
 2336: }
 2337: 
 2338: int
 2339: count_udev(udev_t udev)
 2340: {
 2341: 	dev_t dev;
 2342: 
 2343: 	if ((dev = udev2dev(udev, 0)) == NODEV)
 2344: 		return(0);
 2345: 	return(count_dev(dev));
 2346: }
 2347: 
 2348: int
 2349: vcount(struct vnode *vp)
 2350: {
 2351: 	if (vp->v_rdev == NULL)
 2352: 		return(0);
 2353: 	return(count_dev(vp->v_rdev));
 2354: }
 2355: 
 2356: /*
 2357:  * Print out a description of a vnode.
 2358:  */
 2359: static char *typename[] =
 2360: {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
 2361: 
 2362: void
 2363: vprint(label, vp)
 2364: 	char *label;
 2365: 	struct vnode *vp;
 2366: {
 2367: 	char buf[96];
 2368: 
 2369: 	if (label != NULL)
 2370: 		printf("%s: %p: ", label, (void *)vp);
 2371: 	else
 2372: 		printf("%p: ", (void *)vp);
 2373: 	printf("type %s, usecount %d, writecount %d, refcount %d,",
 2374: 	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
 2375: 	    vp->v_holdcnt);
 2376: 	buf[0] = '\0';
 2377: 	if (vp->v_flag & VROOT)
 2378: 		strcat(buf, "|VROOT");
 2379: 	if (vp->v_flag & VTEXT)
 2380: 		strcat(buf, "|VTEXT");
 2381: 	if (vp->v_flag & VSYSTEM)
 2382: 		strcat(buf, "|VSYSTEM");
 2383: 	if (vp->v_flag & VXLOCK)
 2384: 		strcat(buf, "|VXLOCK");
 2385: 	if (vp->v_flag & VXWANT)
 2386: 		strcat(buf, "|VXWANT");
 2387: 	if (vp->v_flag & VBWAIT)
 2388: 		strcat(buf, "|VBWAIT");
 2389: 	if (vp->v_flag & VDOOMED)
 2390: 		strcat(buf, "|VDOOMED");
 2391: 	if (vp->v_flag & VFREE)
 2392: 		strcat(buf, "|VFREE");
 2393: 	if (vp->v_flag & VOBJBUF)
 2394: 		strcat(buf, "|VOBJBUF");
 2395: 	if (buf[0] != '\0')
 2396: 		printf(" flags (%s)", &buf[1]);
 2397: 	if (vp->v_data == NULL) {
 2398: 		printf("\n");
 2399: 	} else {
 2400: 		printf("\n\t");
 2401: 		VOP_PRINT(vp);
 2402: 	}
 2403: }
 2404: 
 2405: #ifdef DDB
 2406: #include <ddb/ddb.h>
 2407: /*
 2408:  * List all of the locked vnodes in the system.
 2409:  * Called when debugging the kernel.
 2410:  */
 2411: DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
 2412: {
 2413: 	struct thread *td = curthread;	/* XXX */
 2414: 	lwkt_tokref ilock;
 2415: 	struct mount *mp, *nmp;
 2416: 	struct vnode *vp;
 2417: 
 2418: 	printf("Locked vnodes\n");
 2419: 	lwkt_gettoken(&ilock, &mountlist_token);
 2420: 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 2421: 		if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
 2422: 			nmp = TAILQ_NEXT(mp, mnt_list);
 2423: 			continue;
 2424: 		}
 2425: 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 2426: 			if (VOP_ISLOCKED(vp, NULL))
 2427: 				vprint((char *)0, vp);
 2428: 		}
 2429: 		lwkt_gettokref(&ilock);
 2430: 		nmp = TAILQ_NEXT(mp, mnt_list);
 2431: 		vfs_unbusy(mp, td);
 2432: 	}
 2433: 	lwkt_reltoken(&ilock);
 2434: }
 2435: #endif
 2436: 
 2437: /*
 2438:  * Top level filesystem related information gathering.
 2439:  */
 2440: static int	sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);
 2441: 
 2442: static int
 2443: vfs_sysctl(SYSCTL_HANDLER_ARGS)
 2444: {
 2445: 	int *name = (int *)arg1 - 1;	/* XXX */
 2446: 	u_int namelen = arg2 + 1;	/* XXX */
 2447: 	struct vfsconf *vfsp;
 2448: 
 2449: #if 1 || defined(COMPAT_PRELITE2)
 2450: 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 2451: 	if (namelen == 1)
 2452: 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 2453: #endif
 2454: 
 2455: #ifdef notyet
 2456: 	/* all sysctl names at this level are at least name and field */
 2457: 	if (namelen < 2)
 2458: 		return (ENOTDIR);		/* overloaded */
 2459: 	if (name[0] != VFS_GENERIC) {
 2460: 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 2461: 			if (vfsp->vfc_typenum == name[0])
 2462: 				break;
 2463: 		if (vfsp == NULL)
 2464: 			return (EOPNOTSUPP);
 2465: 		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
 2466: 		    oldp, oldlenp, newp, newlen, p));
 2467: 	}
 2468: #endif
 2469: 	switch (name[1]) {
 2470: 	case VFS_MAXTYPENUM:
 2471: 		if (namelen != 2)
 2472: 			return (ENOTDIR);
 2473: 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 2474: 	case VFS_CONF:
 2475: 		if (namelen != 3)
 2476: 			return (ENOTDIR);	/* overloaded */
 2477: 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 2478: 			if (vfsp->vfc_typenum == name[2])
 2479: 				break;
 2480: 		if (vfsp == NULL)
 2481: 			return (EOPNOTSUPP);
 2482: 		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
 2483: 	}
 2484: 	return (EOPNOTSUPP);
 2485: }
 2486: 
 2487: SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
 2488: 	"Generic filesystem");
 2489: 
 2490: #if 1 || defined(COMPAT_PRELITE2)
 2491: 
 2492: static int
 2493: sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 2494: {
 2495: 	int error;
 2496: 	struct vfsconf *vfsp;
 2497: 	struct ovfsconf ovfs;
 2498: 
 2499: 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
 2500: 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 2501: 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 2502: 		ovfs.vfc_index = vfsp->vfc_typenum;
 2503: 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 2504: 		ovfs.vfc_flags = vfsp->vfc_flags;
 2505: 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 2506: 		if (error)
 2507: 			return error;
 2508: 	}
 2509: 	return 0;
 2510: }
 2511: 
 2512: #endif /* 1 || COMPAT_PRELITE2 */
 2513: 
 2514: #if 0
 2515: #define KINFO_VNODESLOP	10
 2516: /*
 2517:  * Dump vnode list (via sysctl).
 2518:  * Copyout address of vnode followed by vnode.
 2519:  */
 2520: /* ARGSUSED */
 2521: static int
 2522: sysctl_vnode(SYSCTL_HANDLER_ARGS)
 2523: {
 2524: 	struct proc *p = curproc;	/* XXX */
 2525: 	struct mount *mp, *nmp;
 2526: 	struct vnode *nvp, *vp;
 2527: 	lwkt_tokref ilock;
 2528: 	lwkt_tokref jlock;
 2529: 	int error;
 2530: 
 2531: #define VPTRSZ	sizeof (struct vnode *)
 2532: #define VNODESZ	sizeof (struct vnode)
 2533: 
 2534: 	req->lock = 0;
 2535: 	if (!req->oldptr) /* Make an estimate */
 2536: 		return (SYSCTL_OUT(req, 0,
 2537: 			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
 2538: 
 2539: 	lwkt_gettoken(&ilock, &mountlist_token);
 2540: 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 2541: 		if (vfs_busy(mp, LK_NOWAIT, &ilock, p)) {
 2542: 			nmp = TAILQ_NEXT(mp, mnt_list);
 2543: 			continue;
 2544: 		}
 2545: 		lwkt_gettoken(&jlock, &mntvnode_token);
 2546: again:
 2547: 		for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 2548: 		     vp != NULL;
 2549: 		     vp = nvp) {
 2550: 			/*
 2551: 			 * Check that the vp is still associated with
 2552: 			 * this filesystem.  RACE: could have been
 2553: 			 * recycled onto the same filesystem.
 2554: 			 */
 2555: 			if (vp->v_mount != mp)
 2556: 				goto again;
 2557: 			nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 2558: 			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
 2559: 			    (error = SYSCTL_OUT(req, vp, VNODESZ))) {
 2560: 				lwkt_reltoken(&jlock);
 2561: 				return (error);
 2562: 			}
 2563: 		}
 2564: 		lwkt_reltoken(&jlock);
 2565: 		lwkt_gettokref(&ilock);
 2566: 		nmp = TAILQ_NEXT(mp, mnt_list);	/* ZZZ */
 2567: 		vfs_unbusy(mp, p);
 2568: 	}
 2569: 	lwkt_reltoken(&ilock);
 2570: 
 2571: 	return (0);
 2572: }
 2573: #endif
 2574: 
 2575: /*
 2576:  * XXX
 2577:  * Exporting the vnode list on large systems causes them to crash.
 2578:  * Exporting the vnode list on medium systems causes sysctl to coredump.
 2579:  */
 2580: #if 0
 2581: SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
 2582: 	0, 0, sysctl_vnode, "S,vnode", "");
 2583: #endif
 2584: 
 2585: /*
 2586:  * Check to see if a filesystem is mounted on a block device.
 2587:  */
 2588: int
 2589: vfs_mountedon(struct vnode *vp)
 2590: {
 2591: 	dev_t dev;
 2592: 
 2593: 	if ((dev = vp->v_rdev) == NULL)
 2594: 		dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
 2595: 	if (dev != NODEV && dev->si_mountpoint)
 2596: 		return (EBUSY);
 2597: 	return (0);
 2598: }
 2599: 
 2600: /*
 2601:  * Unmount all filesystems. The list is traversed in reverse order
 2602:  * of mounting to avoid dependencies.
 2603:  */
 2604: void
 2605: vfs_unmountall()
 2606: {
 2607: 	struct mount *mp;
 2608: 	struct thread *td = curthread;
 2609: 	int error;
 2610: 
 2611: 	if (td->td_proc == NULL)
 2612: 		td = initproc->p_thread;	/* XXX XXX use proc0 instead? */
 2613: 
 2614: 	/*
 2615: 	 * Since this only runs when rebooting, it is not interlocked.
 2616: 	 */
 2617: 	while(!TAILQ_EMPTY(&mountlist)) {
 2618: 		mp = TAILQ_LAST(&mountlist, mntlist);
 2619: 		error = dounmount(mp, MNT_FORCE, td);
 2620: 		if (error) {
 2621: 			TAILQ_REMOVE(&mountlist, mp, mnt_list);
 2622: 			printf("unmount of %s failed (",
 2623: 			    mp->mnt_stat.f_mntonname);
 2624: 			if (error == EBUSY)
 2625: 				printf("BUSY)\n");
 2626: 			else
 2627: 				printf("%d)\n", error);
 2628: 		} else {
 2629: 			/* The unmount has removed mp from the mountlist */
 2630: 		}
 2631: 	}
 2632: }
 2633: 
 2634: /*
 2635:  * Build hash lists of net addresses and hang them off the mount point.
 2636:  * Called by ufs_mount() to set up the lists of export addresses.
 2637:  */
 2638: static int
 2639: vfs_hang_addrlist(mp, nep, argp)
 2640: 	struct mount *mp;
 2641: 	struct netexport *nep;
 2642: 	struct export_args *argp;
 2643: {
 2644: 	struct netcred *np;
 2645: 	struct radix_node_head *rnh;
 2646: 	int i;
 2647: 	struct radix_node *rn;
 2648: 	struct sockaddr *saddr, *smask = 0;
 2649: 	struct domain *dom;
 2650: 	int error;
 2651: 
 2652: 	if (argp->ex_addrlen == 0) {
 2653: 		if (mp->mnt_flag & MNT_DEFEXPORTED)
 2654: 			return (EPERM);
 2655: 		np = &nep->ne_defexported;
 2656: 		np->netc_exflags = argp->ex_flags;
 2657: 		np->netc_anon = argp->ex_anon;
 2658: 		np->netc_anon.cr_ref = 1;
 2659: 		mp->mnt_flag |= MNT_DEFEXPORTED;
 2660: 		return (0);
 2661: 	}
 2662: 
 2663: 	if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)
 2664: 		return (EINVAL);
 2665: 	if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)
 2666: 		return (EINVAL);
 2667: 
 2668: 	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
 2669: 	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
 2670: 	bzero((caddr_t) np, i);
 2671: 	saddr = (struct sockaddr *) (np + 1);
 2672: 	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
 2673: 		goto out;
 2674: 	if (saddr->sa_len > argp->ex_addrlen)
 2675: 		saddr->sa_len = argp->ex_addrlen;
 2676: 	if (argp->ex_masklen) {
 2677: 		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
 2678: 		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
 2679: 		if (error)
 2680: 			goto out;
 2681: 		if (smask->sa_len > argp->ex_masklen)
 2682: 			smask->sa_len = argp->ex_masklen;
 2683: 	}
 2684: 	i = saddr->sa_family;
 2685: 	if ((rnh = nep->ne_rtable[i]) == 0) {
 2686: 		/*
 2687: 		 * Seems silly to initialize every AF when most are not used;
 2688: 		 * do so on demand here.
 2689: 		 */
 2690: 		for (dom = domains; dom; dom = dom->dom_next)
 2691: 			if (dom->dom_family == i && dom->dom_rtattach) {
 2692: 				dom->dom_rtattach((void **) &nep->ne_rtable[i],
 2693: 				    dom->dom_rtoffset);
 2694: 				break;
 2695: 			}
 2696: 		if ((rnh = nep->ne_rtable[i]) == 0) {
 2697: 			error = ENOBUFS;
 2698: 			goto out;
 2699: 		}
 2700: 	}
 2701: 	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
 2702: 	    np->netc_rnodes);
 2703: 	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
 2704: 		error = EPERM;
 2705: 		goto out;
 2706: 	}
 2707: 	np->netc_exflags = argp->ex_flags;
 2708: 	np->netc_anon = argp->ex_anon;
 2709: 	np->netc_anon.cr_ref = 1;
 2710: 	return (0);
 2711: out:
 2712: 	free(np, M_NETADDR);
 2713: 	return (error);
 2714: }
 2715: 
 2716: /* ARGSUSED */
 2717: static int
 2718: vfs_free_netcred(rn, w)
 2719: 	struct radix_node *rn;
 2720: 	void *w;
 2721: {
 2722: 	struct radix_node_head *rnh = (struct radix_node_head *) w;
 2723: 
 2724: 	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
 2725: 	free((caddr_t) rn, M_NETADDR);
 2726: 	return (0);
 2727: }
 2728: 
 2729: /*
 2730:  * Free the net address hash lists that are hanging off the mount points.
 2731:  */
 2732: static void
 2733: vfs_free_addrlist(nep)
 2734: 	struct netexport *nep;
 2735: {
 2736: 	int i;
 2737: 	struct radix_node_head *rnh;
 2738: 
 2739: 	for (i = 0; i <= AF_MAX; i++)
 2740: 		if ((rnh = nep->ne_rtable[i])) {
 2741: 			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
 2742: 			    (caddr_t) rnh);
 2743: 			free((caddr_t) rnh, M_RTABLE);
 2744: 			nep->ne_rtable[i] = 0;
 2745: 		}
 2746: }
 2747: 
 2748: int
 2749: vfs_export(mp, nep, argp)
 2750: 	struct mount *mp;
 2751: 	struct netexport *nep;
 2752: 	struct export_args *argp;
 2753: {
 2754: 	int error;
 2755: 
 2756: 	if (argp->ex_flags & MNT_DELEXPORT) {
 2757: 		if (mp->mnt_flag & MNT_EXPUBLIC) {
 2758: 			vfs_setpublicfs(NULL, NULL, NULL);
 2759: 			mp->mnt_flag &= ~MNT_EXPUBLIC;
 2760: 		}
 2761: 		vfs_free_addrlist(nep);
 2762: 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
 2763: 	}
 2764: 	if (argp->ex_flags & MNT_EXPORTED) {
 2765: 		if (argp->ex_flags & MNT_EXPUBLIC) {
 2766: 			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
 2767: 				return (error);
 2768: 			mp->mnt_flag |= MNT_EXPUBLIC;
 2769: 		}
 2770: 		if ((error = vfs_hang_addrlist(mp, nep, argp)))
 2771: 			return (error);
 2772: 		mp->mnt_flag |= MNT_EXPORTED;
 2773: 	}
 2774: 	return (0);
 2775: }
 2776: 
 2777: 
 2778: /*
 2779:  * Set the publicly exported filesystem (WebNFS). Currently, only
 2780:  * one public filesystem is possible in the spec (RFC 2054 and 2055)
 2781:  * one public filesystem is possible in the spec (RFC 2054 and 2055).
 2782: int
 2783: vfs_setpublicfs(mp, nep, argp)
 2784: 	struct mount *mp;
 2785: 	struct netexport *nep;
 2786: 	struct export_args *argp;
 2787: {
 2788: 	int error;
 2789: 	struct vnode *rvp;
 2790: 	char *cp;
 2791: 
 2792: 	/*
 2793: 	 * mp == NULL -> invalidate the current info, the FS is
 2794: 	 * no longer exported. May be called from either vfs_export
 2795: 	 * or unmount, so check if it hasn't already been done.
 2796: 	 */
 2797: 	if (mp == NULL) {
 2798: 		if (nfs_pub.np_valid) {
 2799: 			nfs_pub.np_valid = 0;
 2800: 			if (nfs_pub.np_index != NULL) {
 2801: 				FREE(nfs_pub.np_index, M_TEMP);
 2802: 				nfs_pub.np_index = NULL;
 2803: 			}
 2804: 		}
 2805: 		return (0);
 2806: 	}
 2807: 
 2808: 	/*
 2809: 	 * Only one allowed at a time.
 2810: 	 */
 2811: 	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
 2812: 		return (EBUSY);
 2813: 
 2814: 	/*
 2815: 	 * Get real filehandle for root of exported FS.
 2816: 	 */
 2817: 	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
 2818: 	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
 2819: 
 2820: 	if ((error = VFS_ROOT(mp, &rvp)))
 2821: 		return (error);
 2822: 
 2823: 	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
 2824: 		return (error);
 2825: 
 2826: 	vput(rvp);
 2827: 
 2828: 	/*
 2829: 	 * If an indexfile was specified, pull it in.
 2830: 	 */
 2831: 	if (argp->ex_indexfile != NULL) {
 2832: 		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
 2833: 		    M_WAITOK);
 2834: 		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
 2835: 		    MAXNAMLEN, (size_t *)0);
 2836: 		if (!error) {
 2837: 			/*
 2838: 			 * Check for illegal filenames.
 2839: 			 */
 2840: 			for (cp = nfs_pub.np_index; *cp; cp++) {
 2841: 				if (*cp == '/') {
 2842: 					error = EINVAL;
 2843: 					break;
 2844: 				}
 2845: 			}
 2846: 		}
 2847: 		if (error) {
 2848: 			FREE(nfs_pub.np_index, M_TEMP);
 2849: 			return (error);
 2850: 		}
 2851: 	}
 2852: 
 2853: 	nfs_pub.np_mount = mp;
 2854: 	nfs_pub.np_valid = 1;
 2855: 	return (0);
 2856: }
 2857: 
 2858: struct netcred *
 2859: vfs_export_lookup(mp, nep, nam)
 2860: 	struct mount *mp;
 2861: 	struct netexport *nep;
 2862: 	struct sockaddr *nam;
 2863: {
 2864: 	struct netcred *np;
 2865: 	struct radix_node_head *rnh;
 2866: 	struct sockaddr *saddr;
 2867: 
 2868: 	np = NULL;
 2869: 	if (mp->mnt_flag & MNT_EXPORTED) {
 2870: 		/*
 2871: 		 * Lookup in the export list first.
 2872: 		 */
 2873: 		if (nam != NULL) {
 2874: 			saddr = nam;
 2875: 			rnh = nep->ne_rtable[saddr->sa_family];
 2876: 			if (rnh != NULL) {
 2877: 				np = (struct netcred *)
 2878: 					(*rnh->rnh_matchaddr)((caddr_t)saddr,
 2879: 							      rnh);
 2880: 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
 2881: 					np = NULL;
 2882: 			}
 2883: 		}
 2884: 		/*
 2885: 		 * If no address match, use the default if it exists.
 2886: 		 */
 2887: 		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
 2888: 			np = &nep->ne_defexported;
 2889: 	}
 2890: 	return (np);
 2891: }
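
/*
 * Illustrative sketch (not compiled): an NFS export check would
 * typically consult vfs_export_lookup() as follows, rejecting clients
 * that match no export entry.  The 'ump->um_export' member is a
 * hypothetical per-filesystem netexport.
 */
#if 0
	struct netcred *np;

	np = vfs_export_lookup(mp, &ump->um_export, nam);
	if (np == NULL)
		return (EACCES);
	/* np->netc_exflags and np->netc_anon describe the export */
#endif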
 2892: 
 2893: /*
 2894:  * Perform msync on all vnodes under a mount point.  The mount point must
 2895:  * be locked.  This code is also responsible for lazy-freeing unreferenced
 2896:  * vnodes whose VM objects no longer contain pages.
 2897:  *
 2898:  * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
 2899:  */
 2900: static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
 2901: static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, 
 2902: 				lwkt_tokref_t vlock, void *data);
 2903: 
 2904: void
 2905: vfs_msync(struct mount *mp, int flags) 
 2906: {
 2907: 	vmntvnodescan(mp, vfs_msync_scan1, vfs_msync_scan2, (void *)flags);
 2908: }
 2909: 
 2910: /*
 2911:  * scan1 is a fast pre-check.  There could be hundreds of thousands of
 2912:  * vnodes, so we cannot afford to do anything heavyweight until we have a
 2913:  * fairly good indication that there is work to do.
 2914:  */
 2915: static
 2916: int
 2917: vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
 2918: {
 2919: 	int flags = (int)data;
 2920: 
 2921: 	if ((vp->v_flag & VXLOCK) == 0) {
 2922: 		if (VSHOULDFREE(vp))
 2923: 			return(0);
 2924: 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 2925: 		    (vp->v_flag & VOBJDIRTY) &&
 2926: 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 2927: 			return(0);
 2928: 		}
 2929: 	}
 2930: 	return(-1);
 2931: }
 2932: 
 2933: static
 2934: int
 2935: vfs_msync_scan2(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data)
 2936: {
 2937: 	vm_object_t obj;
 2938: 	int error;
 2939: 	int flags = (int)data;
 2940: 
 2941: 	if (vp->v_flag & VXLOCK)
 2942: 		return(0);
 2943: 
 2944: 	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 2945: 	    (vp->v_flag & VOBJDIRTY) &&
 2946: 	    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 2947: 		error = vget(vp, vlock, LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ | LK_INTERLOCK, curthread);
 2948: 		if (error == 0) {
 2949: 			if (VOP_GETVOBJECT(vp, &obj) == 0) {
 2950: 				vm_object_page_clean(obj, 0, 0, 
 2951: 				 flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
 2952: 			}
 2953: 			vput(vp);
 2954: 		}
 2955: 		return(0);
 2956: 	}
 2957: 	vmaybefree(vp);
 2958: 	lwkt_reltoken(vlock);
 2959: 	return(0);
 2960: }
 2961: 
 2962: /*
 2963:  * Create the VM object needed for VMIO and mmap support.  This
 2964:  * is done for all VREG files in the system.  Some filesystems can take
 2965:  * advantage of the additional metadata buffering capability of the
 2966:  * VMIO code by making their device node VMIO-backed as well.
 2967:  *
 2968:  * vp must be locked when vfs_object_create is called.
 2969:  */
 2970: int
 2971: vfs_object_create(struct vnode *vp, struct thread *td)
 2972: {
 2973: 	return (VOP_CREATEVOBJECT(vp, td));
 2974: }
 2975: 
 2976: /*
 2977:  * NOTE: the vnode interlock must be held during the call.  We have to recheck
 2978:  * the VFREE flag since the vnode may have been removed from the free list
 2979:  * while we were blocked on vnode_free_list_token.  The use or hold count
 2980:  * must have already been bumped by the caller.
 2981:  */
 2982: static void
 2983: vbusy(struct vnode *vp)
 2984: {
 2985: 	lwkt_tokref ilock;
 2986: 
 2987: 	lwkt_gettoken(&ilock, &vnode_free_list_token);
 2988: 	if ((vp->v_flag & VFREE) != 0) {
 2989: 	    TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 2990: 	    freevnodes--;
 2991: 	    vp->v_flag &= ~(VFREE|VAGE);
 2992: 	}
 2993: 	lwkt_reltoken(&ilock);
 2994: }
 2995: 
 2996: /*
 2997:  * NOTE: the vnode interlock must be held during the call.  The use or hold
 2998:  * count must have already been bumped by the caller.  We use a VINFREE to
 2999:  * interlock against other calls to vfree() which might occur while we 
 3000:  * are blocked.  The vnode cannot be reused until it has actually been
 3001:  * placed on the free list, so there are no other races even though the
 3002:  * use and hold counts are 0.
 3003:  */
 3004: static void
 3005: vfree(struct vnode *vp)
 3006: {
 3007: 	lwkt_tokref ilock;
 3008: 
 3009: 	if ((vp->v_flag & VINFREE) == 0) {
 3010: 		vp->v_flag |= VINFREE;
 3011: 		lwkt_gettoken(&ilock, &vnode_free_list_token); /* can block */
 3012: 		KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
 3013: 		if (vp->v_flag & VAGE) {
 3014: 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 3015: 		} else {
 3016: 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 3017: 		}
 3018: 		freevnodes++;
 3019: 		vp->v_flag &= ~(VAGE|VINFREE);
 3020: 		vp->v_flag |= VFREE;
 3021: 		lwkt_reltoken(&ilock);	/* can block */
 3022: 	}
 3023: }
 3024: 
 3025: 
 3026: /*
 3027:  * Record a process's interest in events which might happen to
 3028:  * a vnode.  Because poll uses the historic select-style interface
 3029:  * internally, this routine serves as both the ``check for any
 3030:  * pending events'' and the ``record my interest in future events''
 3031:  * functions.  (These are done together, while the lock is held,
 3032:  * to avoid race conditions.)
 3033:  */
 3034: int
 3035: vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 3036: {
 3037: 	lwkt_tokref ilock;
 3038: 
 3039: 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
 3040: 	if (vp->v_pollinfo.vpi_revents & events) {
 3041: 		/*
 3042: 		 * This leaves events we are not interested
 3043: 		 * in available for the other process which
 3044: 		 * presumably had requested them
 3045: 		 * (otherwise they would never have been
 3046: 		 * recorded).
 3047: 		 */
 3048: 		events &= vp->v_pollinfo.vpi_revents;
 3049: 		vp->v_pollinfo.vpi_revents &= ~events;
 3050: 
 3051: 		lwkt_reltoken(&ilock);
 3052: 		return events;
 3053: 	}
 3054: 	vp->v_pollinfo.vpi_events |= events;
 3055: 	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
 3056: 	lwkt_reltoken(&ilock);
 3057: 	return 0;
 3058: }
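
/*
 * Illustrative sketch (not compiled): a filesystem's poll VOP usually
 * just defers to vn_pollrecord(), while code that changes the vnode's
 * state reports it through vn_pollevent() (below).  The vop_poll_args
 * field names are assumed here.
 */
#if 0
	/* in a VOP_POLL implementation */
	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));

	/* in code generating an event, e.g. after appending data */
	vn_pollevent(vp, POLLIN | POLLRDNORM);
#endif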
 3059: 
 3060: /*
 3061:  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 3062:  * it is possible for us to miss an event due to race conditions, but
 3063:  * that condition is expected to be rare, so for the moment it is the
 3064:  * preferred interface.
 3065:  */
 3066: void
 3067: vn_pollevent(vp, events)
 3068: 	struct vnode *vp;
 3069: 	short events;
 3070: {
 3071: 	lwkt_tokref ilock;
 3072: 
 3073: 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
 3074: 	if (vp->v_pollinfo.vpi_events & events) {
 3075: 		/*
 3076: 		 * We clear vpi_events so that we don't
 3077: 		 * call selwakeup() twice if two events are
 3078: 		 * posted before the polling process(es) is
 3079: 		 * awakened.  This also ensures that we take at
 3080: 		 * most one selwakeup() if the polling process
 3081: 		 * is no longer interested.  However, it does
 3082: 		 * mean that only one event can be noticed at
 3083: 		 * a time.  (Perhaps we should only clear those
 3084: 		 * event bits which we note?) XXX
 3085: 		 */
 3086: 		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
 3087: 		vp->v_pollinfo.vpi_revents |= events;
 3088: 		selwakeup(&vp->v_pollinfo.vpi_selinfo);
 3089: 	}
 3090: 	lwkt_reltoken(&ilock);
 3091: }
 3092: 
 3093: /*
 3094:  * Wake up anyone polling on vp because it is being revoked.
 3095:  * This depends on dead_poll() returning POLLHUP for correct
 3096:  * behavior.
 3097:  */
 3098: void
 3099: vn_pollgone(vp)
 3100: 	struct vnode *vp;
 3101: {
 3102: 	lwkt_tokref ilock;
 3103: 
 3104: 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
 3105: 	if (vp->v_pollinfo.vpi_events) {
 3106: 		vp->v_pollinfo.vpi_events = 0;
 3107: 		selwakeup(&vp->v_pollinfo.vpi_selinfo);
 3108: 	}
 3109: 	lwkt_reltoken(&ilock);
 3110: }
 3111: 
 3112: 
 3113: 
 3114: /*
 3115:  * Routine to create and manage a filesystem syncer vnode.
 3116:  */
 3117: #define sync_close ((int (*) (struct  vop_close_args *))nullop)
 3118: static int	sync_fsync (struct  vop_fsync_args *);
 3119: static int	sync_inactive (struct  vop_inactive_args *);
 3120: static int	sync_reclaim  (struct  vop_reclaim_args *);
 3121: #define sync_lock ((int (*) (struct  vop_lock_args *))vop_nolock)
 3122: #define sync_unlock ((int (*) (struct  vop_unlock_args *))vop_nounlock)
 3123: static int	sync_print (struct vop_print_args *);
 3124: #define sync_islocked ((int(*) (struct vop_islocked_args *))vop_noislocked)
 3125: 
 3126: static vop_t **sync_vnodeop_p;
 3127: static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
 3128: 	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
 3129: 	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
 3130: 	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
 3131: 	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
 3132: 	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
 3133: 	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
 3134: 	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
 3135: 	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
 3136: 	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
 3137: 	{ NULL, NULL }
 3138: };
 3139: static struct vnodeopv_desc sync_vnodeop_opv_desc =
 3140: 	{ &sync_vnodeop_p, sync_vnodeop_entries };
 3141: 
 3142: VNODEOP_SET(sync_vnodeop_opv_desc);
 3143: 
 3144: /*
 3145:  * Create a new filesystem syncer vnode for the specified mount point.
 3146:  * This vnode is placed on the worklist and is responsible for sync'ing
 3147:  * the filesystem.
 3148:  *
 3149:  * NOTE: read-only mounts are also placed on the worklist.  The filesystem
 3150:  * sync code is also responsible for cleaning up vnodes.
 3151:  */
 3152: int
 3153: vfs_allocate_syncvnode(struct mount *mp)
 3154: {
 3155: 	struct vnode *vp;
 3156: 	static long start, incr, next;
 3157: 	int error;
 3158: 
 3159: 	/* Allocate a new vnode */
 3160: 	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
 3161: 		mp->mnt_syncer = NULL;
 3162: 		return (error);
 3163: 	}
 3164: 	vp->v_type = VNON;
 3165: 	/*
 3166: 	 * Place the vnode onto the syncer worklist. We attempt to
 3167: 	 * scatter them about on the list so that they will go off
 3168: 	 * at evenly distributed times even if all the filesystems
 3169: 	 * are mounted at once.
 3170: 	 */
 3171: 	next += incr;
 3172: 	if (next == 0 || next > syncer_maxdelay) {
 3173: 		start /= 2;
 3174: 		incr /= 2;
 3175: 		if (start == 0) {
 3176: 			start = syncer_maxdelay / 2;
 3177: 			incr = syncer_maxdelay;
 3178: 		}
 3179: 		next = start;
 3180: 	}
 3181: 	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
 3182: 	mp->mnt_syncer = vp;
 3183: 	return (0);
 3184: }
 3185: 
 3186: /*
 3187:  * Do a lazy sync of the filesystem.
 3188:  */
 3189: static int
 3190: sync_fsync(ap)
 3191: 	struct vop_fsync_args /* {
 3192: 		struct vnode *a_vp;
 3193: 		struct ucred *a_cred;
 3194: 		int a_waitfor;
 3195: 		struct thread *a_td;
 3196: 	} */ *ap;
 3197: {
 3198: 	struct vnode *syncvp = ap->a_vp;
 3199: 	struct mount *mp = syncvp->v_mount;
 3200: 	struct thread *td = ap->a_td;
 3201: 	lwkt_tokref ilock;
 3202: 	int asyncflag;
 3203: 
 3204: 	/*
 3205: 	 * We only need to do something if this is a lazy evaluation.
 3206: 	 */
 3207: 	if (ap->a_waitfor != MNT_LAZY)
 3208: 		return (0);
 3209: 
 3210: 	/*
 3211: 	 * Move ourselves to the back of the sync list.
 3212: 	 */
 3213: 	vn_syncer_add_to_worklist(syncvp, syncdelay);
 3214: 
 3215: 	/*
 3216: 	 * Walk the list of vnodes pushing all that are dirty and
 3217: 	 * not already on the sync list, and freeing vnodes which have
 3218: 	 * no refs and whose VM objects are empty.  vfs_msync() handles
 3219: 	 * the VM issues and must be called whether the mount is readonly
 3220: 	 * or not.
 3221: 	 */
 3222: 	lwkt_gettoken(&ilock, &mountlist_token);
 3223: 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &ilock, td) != 0) {
 3224: 		lwkt_reltoken(&ilock);
 3225: 		return (0);
 3226: 	}
 3227: 	if (mp->mnt_flag & MNT_RDONLY) {
 3228: 		vfs_msync(mp, MNT_NOWAIT);
 3229: 	} else {
 3230: 		asyncflag = mp->mnt_flag & MNT_ASYNC;
 3231: 		mp->mnt_flag &= ~MNT_ASYNC;	/* ZZZ hack */
 3232: 		vfs_msync(mp, MNT_NOWAIT);
 3233: 		VFS_SYNC(mp, MNT_LAZY, td);
 3234: 		if (asyncflag)
 3235: 			mp->mnt_flag |= MNT_ASYNC;
 3236: 	}
 3237: 	vfs_unbusy(mp, td);
 3238: 	return (0);
 3239: }
 3240: 
 3241: /*
 3242:  * The syncer vnode is no longer referenced.
 3243:  */
 3244: static int
 3245: sync_inactive(ap)
 3246: 	struct vop_inactive_args /* {
 3247: 		struct vnode *a_vp;
 3248: 		struct proc *a_p;
 3249: 	} */ *ap;
 3250: {
 3251: 
 3252: 	vgone(ap->a_vp);
 3253: 	return (0);
 3254: }
 3255: 
 3256: /*
 3257:  * The syncer vnode is no longer needed and is being decommissioned.
 3258:  *
 3259:  * Modifications to the worklist must be protected at splbio().
 3260:  */
 3261: static int
 3262: sync_reclaim(ap)
 3263: 	struct vop_reclaim_args /* {
 3264: 		struct vnode *a_vp;
 3265: 	} */ *ap;
 3266: {
 3267: 	struct vnode *vp = ap->a_vp;
 3268: 	int s;
 3269: 
 3270: 	s = splbio();
 3271: 	vp->v_mount->mnt_syncer = NULL;
 3272: 	if (vp->v_flag & VONWORKLST) {
 3273: 		LIST_REMOVE(vp, v_synclist);
 3274: 		vp->v_flag &= ~VONWORKLST;
 3275: 	}
 3276: 	splx(s);
 3277: 
 3278: 	return (0);
 3279: }
 3280: 
 3281: /*
 3282:  * Print out a syncer vnode.
 3283:  */
 3284: static int
 3285: sync_print(ap)
 3286: 	struct vop_print_args /* {
 3287: 		struct vnode *a_vp;
 3288: 	} */ *ap;
 3289: {
 3290: 	struct vnode *vp = ap->a_vp;
 3291: 
 3292: 	printf("syncer vnode");
 3293: 	if (vp->v_vnlock != NULL)
 3294: 		lockmgr_printinfo(vp->v_vnlock);
 3295: 	printf("\n");
 3296: 	return (0);
 3297: }
 3298: 
 3299: /*
 3300:  * Extract the dev_t from a VBLK or VCHR.  The vnode must have been opened
 3301:  * (or v_rdev might be NULL).
 3302:  */
 3303: dev_t
 3304: vn_todev(struct vnode *vp)
 3305: {
 3306: 	if (vp->v_type != VBLK && vp->v_type != VCHR)
 3307: 		return (NODEV);
 3308: 	KKASSERT(vp->v_rdev != NULL);
 3309: 	return (vp->v_rdev);
 3310: }
 3311: 
 3312: /*
 3313:  * Check if vnode represents a disk device.  The vnode does not need to be
 3314:  * opened.
 3315:  */
 3316: int
 3317: vn_isdisk(struct vnode *vp, int *errp)
 3318: {
 3319: 	dev_t dev;
 3320: 
 3321: 	if (vp->v_type != VBLK && vp->v_type != VCHR) {
 3322: 		if (errp != NULL)
 3323: 			*errp = ENOTBLK;
 3324: 		return (0);
 3325: 	}
 3326: 
 3327: 	if ((dev = vp->v_rdev) == NULL)
 3328: 		dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
 3329: 	if (dev == NULL || dev == NODEV) {
 3330: 		if (errp != NULL)
 3331: 			*errp = ENXIO;
 3332: 		return (0);
 3333: 	}
 3334: 	if (dev_is_good(dev) == 0) {
 3335: 		if (errp != NULL)
 3336: 			*errp = ENXIO;
 3337: 		return (0);
 3338: 	}
 3339: 	if ((dev_dflags(dev) & D_DISK) == 0) {
 3340: 		if (errp != NULL)
 3341: 			*errp = ENOTBLK;
 3342: 		return (0);
 3343: 	}
 3344: 	if (errp != NULL)
 3345: 		*errp = 0;
 3346: 	return (1);
 3347: }
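
/*
 * Illustrative sketch (not compiled): disk-based filesystems typically
 * validate their device vnode with vn_isdisk() early in the mount
 * path, before opening the device.  'devvp' is hypothetical.
 */
#if 0
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);
#endif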
 3348: 
 3349: void
 3350: NDFREE(ndp, flags)
 3351:      struct nameidata *ndp;
 3352:      const uint flags;
 3353: {
 3354: 	if (!(flags & NDF_NO_FREE_PNBUF) &&
 3355: 	    (ndp->ni_cnd.cn_flags & CNP_HASBUF)) {
 3356: 		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 3357: 		ndp->ni_cnd.cn_flags &= ~CNP_HASBUF;
 3358: 	}
 3359: 	if (!(flags & NDF_NO_DNCP_RELE) &&
 3360: 	    (ndp->ni_cnd.cn_flags & CNP_WANTDNCP) &&
 3361: 	    ndp->ni_dncp) {
 3362: 		cache_drop(ndp->ni_dncp);
 3363: 		ndp->ni_dncp = NULL;
 3364: 	}
 3365: 	if (!(flags & NDF_NO_NCP_RELE) &&
 3366: 	    (ndp->ni_cnd.cn_flags & CNP_WANTNCP) &&
 3367: 	    ndp->ni_ncp) {
 3368: 		cache_drop(ndp->ni_ncp);
 3369: 		ndp->ni_ncp = NULL;
 3370: 	}
 3371: 	if (!(flags & NDF_NO_DVP_UNLOCK) &&
 3372: 	    (ndp->ni_cnd.cn_flags & CNP_LOCKPARENT) &&
 3373: 	    ndp->ni_dvp != ndp->ni_vp) {
 3374: 		VOP_UNLOCK(ndp->ni_dvp, NULL, 0, ndp->ni_cnd.cn_td);
 3375: 	}
 3376: 	if (!(flags & NDF_NO_DVP_RELE) &&
 3377: 	    (ndp->ni_cnd.cn_flags & (CNP_LOCKPARENT|CNP_WANTPARENT))) {
 3378: 		vrele(ndp->ni_dvp);
 3379: 		ndp->ni_dvp = NULL;
 3380: 	}
 3381: 	if (!(flags & NDF_NO_VP_UNLOCK) &&
 3382: 	    (ndp->ni_cnd.cn_flags & CNP_LOCKLEAF) && ndp->ni_vp) {
 3383: 		VOP_UNLOCK(ndp->ni_vp, NULL, 0, ndp->ni_cnd.cn_td);
 3384: 	}
 3385: 	if (!(flags & NDF_NO_VP_RELE) &&
 3386: 	    ndp->ni_vp) {
 3387: 		vrele(ndp->ni_vp);
 3388: 		ndp->ni_vp = NULL;
 3389: 	}
 3390: 	if (!(flags & NDF_NO_STARTDIR_RELE) &&
 3391: 	    (ndp->ni_cnd.cn_flags & CNP_SAVESTART)) {
 3392: 		vrele(ndp->ni_startdir);
 3393: 		ndp->ni_startdir = NULL;
 3394: 	}
 3395: }
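
/*
 * Illustrative sketch (not compiled): callers release whatever a
 * successful namei() left referenced or locked by masking off the
 * pieces they want to keep.  Passing 0 releases and unlocks everything
 * the nameidata flags say is still held.
 */
#if 0
	/* ... after a successful namei() on 'nd', use nd.ni_vp ... */
	NDFREE(&nd, 0);
#endif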
 3396: 
 3397: #ifdef DEBUG_VFS_LOCKS
 3398: 
 3399: void
 3400: assert_vop_locked(struct vnode *vp, const char *str)
 3401: {
 3402: 
 3403: 	if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) {
 3404: 		panic("%s: %p is not locked shared but should be", str, vp);
 3405: 	}
 3406: }
 3407: 
 3408: void
 3409: assert_vop_unlocked(struct vnode *vp, const char *str)
 3410: {
 3411: 
 3412: 	if (vp && IS_LOCKING_VFS(vp)) {
 3413: 		if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) {
 3414: 			panic("%s: %p is locked but should not be", str, vp);
 3415: 		}
 3416: 	}
 3417: }
 3418: 
 3419: #endif