File:  [DragonFly] / src / sys / vfs / mfs / mfs_vfsops.c
Revision 1.15: download - view: text, annotated - select for diffs
Wed May 19 22:53:04 2004 UTC (10 years, 5 months ago) by dillon
Branches: MAIN
CVS tags: HEAD, DragonFly_1_0_REL, DragonFly_1_0_RC1, DragonFly_1_0A_REL
Device layer rollup commit.

* cdevsw_add() is now required.  cdevsw_add() and cdevsw_remove() may specify
  a mask/match indicating the range of supported minor numbers.  Multiple
  cdevsw_add()'s using the same major number, but distinctly different
  ranges, may be issued.  All devices that failed to call cdevsw_add() before
  now do.

* cdevsw_remove() now automatically marks all devices within its supported
  range as being destroyed.

* vnode->v_rdev is no longer resolved when the vnode is created.  Instead,
  only v_udev (a newly added field) is resolved.  v_rdev is resolved when
  the vnode is opened and cleared on the last close.

* A great deal of code was making rather dubious assumptions with regards
  to the validity of devices associated with vnodes, primarily due to
  the persistence of a device structure due to being indexed by (major, minor)
  instead of by (cdevsw, major, minor).  In particular, if you run a program
  which connects to a USB device and then you pull the USB device and plug
  it back in, the vnode subsystem will continue to believe that the device
  is open when, in fact, it isn't (because it was destroyed and recreated).

  In particular, note that all the VFS mount procedures now check devices
  via v_udev instead of v_rdev prior to calling VOP_OPEN(), since v_rdev
  is NULL prior to the first open.

* The disk layer's device interaction has been rewritten.  The disk layer
  (i.e. the slice and disklabel management layer) no longer overloads
  its data onto the device structure representing the underlying physical
  disk.  Instead, the disk layer uses the new cdevsw_add() functionality
  to register its own cdevsw using the underlying device's major number,
  and simply does NOT register the underlying device's cdevsw.  No
  confusion is created because the device hash is now based on
  (cdevsw,major,minor) rather than (major,minor).

  NOTE: This also means that underlying raw disk devices may use the entire
  device minor number instead of having to reserve the bits used by the disk
  layer, and also means that we can (theoretically) stack a fully
  disklabel-supported 'disk' on top of any block device.

* The new reference counting scheme prevents this by associating a device
  with a cdevsw and disconnecting the device from its cdevsw when the cdevsw
  is removed.  Additionally, all udev2dev() lookups run through the cdevsw
  mask/match and only successfully find devices still associated with an
  active cdevsw.

* Major work on MFS:  MFS no longer shortcuts vnode and device creation.  It
  now creates a real vnode and a real device and implements real open and
  close VOPs.  Additionally, due to the disk layer changes, MFS is no longer
  limited to 255 mounts.  The new limit is 16 million.  Since MFS creates a
  real device node, mount_mfs will now create a real /dev/mfs<PID> device
  that can be read from userland (e.g. so you can dump an MFS filesystem).

* BUF AND DEVICE STRATEGY changes.  The struct buf contains a b_dev field.
  In order to properly handle stacked devices we now require that the b_dev
  field be initialized before the device strategy routine is called.  This
  required some additional work in various VFS implementations.  To enforce
  this requirement, biodone() now sets b_dev to NODEV.  The new disk layer
  will adjust b_dev before forwarding a request to the actual physical
  device.

* A bug in the ISO CD boot sequence which resulted in a panic has been fixed.

Testing by: lots of people, but David Rhodus found the most egregious bugs.

    1: /*
    2:  * Copyright (c) 1989, 1990, 1993, 1994
    3:  *	The Regents of the University of California.  All rights reserved.
    4:  *
    5:  * Redistribution and use in source and binary forms, with or without
    6:  * modification, are permitted provided that the following conditions
    7:  * are met:
    8:  * 1. Redistributions of source code must retain the above copyright
    9:  *    notice, this list of conditions and the following disclaimer.
   10:  * 2. Redistributions in binary form must reproduce the above copyright
   11:  *    notice, this list of conditions and the following disclaimer in the
   12:  *    documentation and/or other materials provided with the distribution.
   13:  * 3. All advertising materials mentioning features or use of this software
   14:  *    must display the following acknowledgement:
   15:  *	This product includes software developed by the University of
   16:  *	California, Berkeley and its contributors.
   17:  * 4. Neither the name of the University nor the names of its contributors
   18:  *    may be used to endorse or promote products derived from this software
   19:  *    without specific prior written permission.
   20:  *
   21:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   22:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   23:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   24:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   25:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   26:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   27:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   28:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   30:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   31:  * SUCH DAMAGE.
   32:  *
   33:  *	@(#)mfs_vfsops.c	8.11 (Berkeley) 6/19/95
   34:  * $FreeBSD: src/sys/ufs/mfs/mfs_vfsops.c,v 1.81.2.3 2001/07/04 17:35:21 tegge Exp $
   35:  * $DragonFly: src/sys/vfs/mfs/mfs_vfsops.c,v 1.15 2004/05/19 22:53:04 dillon Exp $
   36:  */
   37: 
   38: 
   39: #include "opt_mfs.h"
   40: 
   41: #include <sys/param.h>
   42: #include <sys/systm.h>
   43: #include <sys/conf.h>
   44: #include <sys/kernel.h>
   45: #include <sys/proc.h>
   46: #include <sys/buf.h>
   47: #include <sys/mount.h>
   48: #include <sys/signalvar.h>
   49: #include <sys/vnode.h>
   50: #include <sys/malloc.h>
   51: #include <sys/linker.h>
   52: #include <sys/fcntl.h>
   53: 
   54: #include <sys/buf2.h>
   55: 
   56: #include <vfs/ufs/quota.h>
   57: #include <vfs/ufs/inode.h>
   58: #include <vfs/ufs/ufsmount.h>
   59: #include <vfs/ufs/ufs_extern.h>
   60: #include <vfs/ufs/fs.h>
   61: #include <vfs/ufs/ffs_extern.h>
   62: 
   63: #include "mfsnode.h"
   64: #include "mfs_extern.h"
   65: 
   66: MALLOC_DEFINE(M_MFSNODE, "MFS node", "MFS vnode private part");
   67: 
   68: 
   69: extern vop_t **mfs_vnodeop_p;
   70: 
   71: static int	mfs_mount (struct mount *mp,
   72: 			char *path, caddr_t data, struct nameidata *ndp, 
   73: 			struct thread *td);
   74: static int	mfs_start (struct mount *mp, int flags, struct thread *td);
   75: static int	mfs_statfs (struct mount *mp, struct statfs *sbp, 
   76: 			struct thread *td);
   77: static int	mfs_init (struct vfsconf *);
   78: 
   79: d_open_t	mfsopen;
   80: d_close_t	mfsclose;
   81: d_strategy_t	mfsstrategy;
   82: 
   83: #define MFS_CDEV_MAJOR	253
   84: 
   85: static struct cdevsw mfs_cdevsw = {
   86: 	/* name */      "MFS",
   87: 	/* maj */       MFS_CDEV_MAJOR,
   88: 	/* flags */     D_DISK,
   89: 	/* port */	NULL,
   90: 	/* clone */	NULL,
   91: 
   92: 	/* open */      mfsopen,
   93: 	/* close */     mfsclose,
   94: 	/* read */      physread,
   95: 	/* write */     physwrite,
   96: 	/* ioctl */     noioctl,
   97: 	/* poll */      nopoll,
   98: 	/* mmap */      nommap,
   99: 	/* strategy */  mfsstrategy,
  100: 	/* dump */      nodump,
  101: 	/* psize */     nopsize
  102: };
  103: 
  104: /*
  105:  * mfs vfs operations.
  106:  */
  107: static struct vfsops mfs_vfsops = {
  108: 	mfs_mount,
  109: 	mfs_start,
  110: 	ffs_unmount,
  111: 	ufs_root,
  112: 	ufs_quotactl,
  113: 	mfs_statfs,
  114: 	ffs_sync,
  115: 	ffs_vget,
  116: 	ffs_fhtovp,
  117: 	ufs_check_export,
  118: 	ffs_vptofh,
  119: 	mfs_init,
  120: 	vfs_stduninit,
  121: 	vfs_stdextattrctl,
  122: };
  123: 
  124: VFS_SET(mfs_vfsops, mfs, 0);
  125: 
  126: /*
  127:  * We allow the underlying MFS block device to be opened and read.
  128:  */
  129: int
  130: mfsopen(dev_t dev, int flags, int mode, struct thread *td)
  131: {
  132: 	if (flags & FWRITE)
  133: 		return(EROFS);
  134: 	if (dev->si_drv1)
  135: 		return(0);
  136: 	return(ENXIO);
  137: }
  138: 
  139: int
  140: mfsclose(dev_t dev, int flags, int mode, struct thread *td)
  141: {
  142: 	return(0);
  143: }
  144: 
  145: void
  146: mfsstrategy(struct buf *bp)
  147: {
  148: 	struct mfsnode *mfsp;
  149: 
  150: 	if ((mfsp = bp->b_dev->si_drv1) != NULL) {
  151: 		off_t boff = (off_t)bp->b_blkno << DEV_BSHIFT;
  152: 		off_t eoff = boff + bp->b_bcount;
  153: 
  154: 		if (eoff <= mfsp->mfs_size) {
  155: 			bufq_insert_tail(&mfsp->buf_queue, bp);
  156: 			wakeup((caddr_t)mfsp);
  157: 		} else if (boff < mfsp->mfs_size) {
  158: 			bp->b_bcount = mfsp->mfs_size - boff;
  159: 			bufq_insert_tail(&mfsp->buf_queue, bp);
  160: 			wakeup((caddr_t)mfsp);
  161: 		} else if (boff == mfsp->mfs_size) {
  162: 			bp->b_resid = bp->b_bcount;
  163: 			biodone(bp);
  164: 		} else {
  165: 			bp->b_error = EINVAL;
  166: 			biodone(bp);
  167: 		}
  168: 	} else {
  169: 		bp->b_error = ENXIO;
  170: 		bp->b_flags |= B_ERROR;
  171: 		biodone(bp);
  172: 	}
  173: }
  174: 
  175: /*
  176:  * mfs_mount
  177:  *
  178:  * Called when mounting local physical media
  179:  *
  180:  * PARAMETERS:
  181:  *		mountroot
  182:  *			mp	mount point structure
  183:  *			path	NULL (flag for root mount!!!)
  184:  *			data	<unused>
  185:  *			ndp	<unused>
  186:  *			p	process (user credentials check [statfs])
  187:  *
  188:  *		mount
  189:  *			mp	mount point structure
  190:  *			path	path to mount point
  191:  *			data	pointer to argument struct in user space
  192:  *			ndp	mount point namei() return (used for
  193:  *				credentials on reload), reused to look
  194:  *				up block device.
  195:  *			p	process (user credentials check)
  196:  *
  197:  * RETURNS:	0	Success
  198:  *		!0	error number (errno.h)
  199:  *
  200:  * LOCK STATE:
  201:  *
  202:  *		ENTRY
  203:  *			mount point is locked
  204:  *		EXIT
  205:  *			mount point is locked
  206:  *
  207:  * NOTES:
  208:  *		A NULL path can be used for a flag since the mount
  209:  *		system call will fail with EFAULT in copyinstr in
  210:  *		namei() if it is a genuine NULL from the user.
  211:  */
  212: /* ARGSUSED */
  213: static int
  214: mfs_mount(struct mount *mp, char *path, caddr_t data, struct nameidata *ndp,
  215: 	  struct thread *td)
  216: {
  217: 	struct vnode *devvp;
  218: 	struct mfs_args args;
  219: 	struct ufsmount *ump;
  220: 	struct fs *fs;
  221: 	struct mfsnode *mfsp;
  222: 	size_t size;
  223: 	int flags, err;
  224: 	int minnum;
  225: 	dev_t dev;
  226: 
  227: 	/*
  228: 	 * Use NULL path to flag a root mount
  229: 	 */
  230: 	if( path == NULL) {
  231: 		/*
  232: 		 ***
  233: 		 * Mounting root file system
  234: 		 ***
  235: 		 */
  236: 
  237: 		/* you lose */
  238: 		panic("mfs_mount: mount MFS as root: not configured!");
  239: 	}
  240: 
  241: 	/*
  242: 	 ***
  243: 	 * Mounting non-root file system or updating a file system
  244: 	 ***
  245: 	 */
  246: 
  247: 	/* copy in user arguments*/
  248: 	if ((err = copyin(data, (caddr_t)&args, sizeof (struct mfs_args))) != 0)
  249: 		goto error_1;
  250: 
  251: 	/*
  252: 	 * If updating, check whether changing from read-only to
  253: 	 * read/write; if there is no device name, that's all we do.
  254: 	 */
  255: 	if (mp->mnt_flag & MNT_UPDATE) {
  256: 		/*
  257: 		 ********************
  258: 		 * UPDATE
  259: 		 ********************
  260: 		 */
  261: 		ump = VFSTOUFS(mp);
  262: 		fs = ump->um_fs;
  263: 		if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
  264: 			flags = WRITECLOSE;
  265: 			if (mp->mnt_flag & MNT_FORCE)
  266: 				flags |= FORCECLOSE;
  267: 			err = ffs_flushfiles(mp, flags, td);
  268: 			if (err)
  269: 				goto error_1;
  270: 		}
  271: 		if (fs->fs_ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR))
  272: 			fs->fs_ronly = 0;
  273: 		/* if not updating name...*/
  274: 		if (args.fspec == 0) {
  275: 			/*
  276: 			 * Process export requests.  Jumping to "success"
  277: 			 * will return the vfs_export() error code. 
  278: 			 */
  279: 			err = vfs_export(mp, &ump->um_export, &args.export);
  280: 			goto success;
  281: 		}
  282: 
  283: 		/* XXX MFS does not support name updating*/
  284: 		goto success;
  285: 	}
  286: 	/*
  287: 	 * Do the MALLOC before the getnewvnode since doing so afterward
  288: 	 * might cause a bogus v_data pointer to get dereferenced
  289: 	 * elsewhere if MALLOC should block.
  290: 	 */
  291: 	MALLOC(mfsp, struct mfsnode *, sizeof *mfsp, M_MFSNODE, M_WAITOK);
  292: 
  293: 	err = getnewvnode(VT_MFS, (struct mount *)0, mfs_vnodeop_p, &devvp);
  294: 	if (err) {
  295: 		FREE(mfsp, M_MFSNODE);
  296: 		goto error_1;
  297: 	}
  298: 
  299: 	minnum = (curproc->p_pid & 0xFF) |
  300: 		((curproc->p_pid & ~0xFF) << 8);
  301: 
  302: 	devvp->v_type = VCHR;
  303: 	dev = make_dev(&mfs_cdevsw, minnum, UID_ROOT, GID_WHEEL, 0600,
  304: 			"MFS%d", minnum >> 16);
  305: 	/* It is not clear that these will get initialized otherwise */
  306: 	dev->si_bsize_phys = DEV_BSIZE;
  307: 	dev->si_iosize_max = DFLTPHYS;
  308: 	dev->si_drv1 = mfsp;
  309: 	addaliasu(devvp, makeudev(MFS_CDEV_MAJOR, minnum));
  310: 	devvp->v_data = mfsp;
  311: 	mfsp->mfs_baseoff = args.base;
  312: 	mfsp->mfs_size = args.size;
  313: 	mfsp->mfs_vnode = devvp;
  314: 	mfsp->mfs_dev = reference_dev(dev);
  315: 	mfsp->mfs_td = td;
  316: 	mfsp->mfs_active = 1;
  317: 	bufq_init(&mfsp->buf_queue);
  318: 
  319: 	/*
  320: 	 * Since this is a new mount, we want the names for
  321: 	 * the device and the mount point copied in.  If an
  322: 	 * error occurs,  the mountpoint is discarded by the
  323: 	 * upper level code.
  324: 	 */
  325: 	/* Save "last mounted on" info for mount point (NULL pad)*/
  326: 	copyinstr(	path,				/* mount point*/
  327: 			mp->mnt_stat.f_mntonname,	/* save area*/
  328: 			MNAMELEN - 1,			/* max size*/
  329: 			&size);				/* real size*/
  330: 	bzero( mp->mnt_stat.f_mntonname + size, MNAMELEN - size);
  331: 
  332: 	/* Save "mounted from" info for mount point (NULL pad)*/
  333: 	copyinstr(	args.fspec,			/* device name*/
  334: 			mp->mnt_stat.f_mntfromname,	/* save area*/
  335: 			MNAMELEN - 1,			/* max size*/
  336: 			&size);				/* real size*/
  337: 	bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
  338: 
  339: 	if ((err = ffs_mountfs(devvp, mp, td, M_MFSNODE)) != 0) { 
  340: 		mfsp->mfs_active = 0;
  341: 		goto error_2;
  342: 	}
  343: 
  344: 	/*
  345: 	 * Initialize FS stat information in mount struct; uses both
  346: 	 * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname
  347: 	 *
  348: 	 * This code is common to root and non-root mounts
  349: 	 */
  350: 	(void) VFS_STATFS(mp, &mp->mnt_stat, td);
  351: 
  352: 	goto success;
  353: 
  354: error_2:	/* error with devvp held*/
  355: 
  356: 	/* release devvp before failing*/
  357: 	vrele(devvp);
  358: 
  359: error_1:	/* no state to back out*/
  360: 
  361: success:
  362: 	return( err);
  363: }
  364: 
  365: /*
  366:  * Used to grab the process and keep it in the kernel to service
  367:  * memory filesystem I/O requests.
  368:  *
  369:  * Loop servicing I/O requests.
  370:  * Copy the requested data into or out of the memory filesystem
  371:  * address space.
  372:  */
  373: /* ARGSUSED */
  374: static int
  375: mfs_start(struct mount *mp, int flags, struct thread *td)
  376: {
  377: 	struct vnode *vp = VFSTOUFS(mp)->um_devvp;
  378: 	struct mfsnode *mfsp = VTOMFS(vp);
  379: 	struct buf *bp;
  380: 	int gotsig = 0, sig;
  381: 
  382: 	/*
  383: 	 * We must prevent the system from trying to swap
  384: 	 * out or kill ( when swap space is low, see vm/pageout.c ) the
  385: 	 * process.  A deadlock can occur if the process is swapped out,
  386: 	 * and the system can loop trying to kill the unkillable ( while
  387: 	 * references exist ) MFS process when swap space is low.
  388: 	 */
  389: 	KKASSERT(curproc);
  390: 	PHOLD(curproc);
  391: 
  392: 	while (mfsp->mfs_active) {
  393: 		int s;
  394: 
  395: 		s = splbio();
  396: 
  397: 		while ((bp = bufq_first(&mfsp->buf_queue)) != NULL) {
  398: 			bufq_remove(&mfsp->buf_queue, bp);
  399: 			splx(s);
  400: 			mfs_doio(bp, mfsp);
  401: 			wakeup((caddr_t)bp);
  402: 			s = splbio();
  403: 		}
  404: 
  405: 		splx(s);
  406: 
  407: 		/*
  408: 		 * If a non-ignored signal is received, try to unmount.
  409: 		 * If that fails, clear the signal (it has been "processed"),
  410: 		 * otherwise we will loop here, as tsleep will always return
  411: 		 * EINTR/ERESTART.
  412: 		 */
  413: 		/*
  414: 		 * Note that dounmount() may fail if work was queued after
  415: 		 * we slept. We have to jump hoops here to make sure that we
  416: 		 * process any buffers after the sleep, before we dounmount()
  417: 		 */
  418: 		if (gotsig) {
  419: 			gotsig = 0;
  420: 			if (dounmount(mp, 0, td) != 0) {
  421: 				KKASSERT(td->td_proc);
  422: 				sig = CURSIG(td->td_proc);
  423: 				if (sig)
  424: 					SIGDELSET(td->td_proc->p_siglist, sig);
  425: 			}
  426: 		}
  427: 		else if (tsleep((caddr_t)mfsp, PCATCH, "mfsidl", 0))
  428: 			gotsig++;	/* try to unmount in next pass */
  429: 	}
  430: 	PRELE(curproc);
  431: 	v_release_rdev(vp);	/* hack because we do not implement CLOSE */
  432: 	/* XXX destroy/release devvp */
  433: 	return (0);
  434: }
  435: 
  436: /*
  437:  * Get file system statistics.
  438:  */
  439: static int
  440: mfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
  441: {
  442: 	int error;
  443: 
  444: 	error = ffs_statfs(mp, sbp, td);
  445: 	sbp->f_type = mp->mnt_vfc->vfc_typenum;
  446: 	return (error);
  447: }
  448: 
  449: /*
  450:  * Memory based filesystem initialization.
  451:  */
  452: static int
  453: mfs_init(struct vfsconf *vfsp)
  454: {
  455: 	cdevsw_add(&mfs_cdevsw, 0, 0);
  456: 	return (0);
  457: }