raw I/O enhancement

Chris Torek chris at trantor.umd.edu
Mon Feb 29 15:47:57 AEST 1988


In article <3070 at voder.UUCP> jeff at voder.UUCP (Jeff Gilliam) writes:
>... physio() is changed to allocate a buffer from the pool of swap
>buffers if no buffer pointer is passed to it.

His implementation has one problem, however.  At entry to physio,
the pages involved in the raw transfer may not yet be in core.  If
this is the case, vslock() calls pagein to bring them in, and pagein
itself uses swap buffer headers for its I/O.  Hence, if physio takes
the last swap buffer, the system may deadlock in vslock().  This
is extremely unlikely on a system with `many' swap buffers (physio
has to take *all* the swap buffers), but on Suns with small physical
memories I have seen nswbuf as low as 4.

My own version of the generic raw code appears below.  (It is based
on SCCS rev 7.3, but that revision has MAXPHYS defined in
../machine/machparam.h, not directly in vm_swp.c, which is why I put
the `-' after the revision number, as `7.3-', in this diff listing.)
I also hang on to the swap buffer for all IO vectors (not that many
people use more than one), and removed the gotos from physio().
(And yes, I generate one unnecessary `movl' :-) )

RCS file: /sys/sys/RCS/vm_swp.c,v
retrieving revision 1.1
diff -c2 -r1.1 vm_swp.c
*** /tmp/,RCSt1007514	Mon Feb 29 00:14:46 1988
--- vm_swp.c	Mon Feb 29 00:13:27 1988
***************
*** 4,8 ****
   * specifies the terms and conditions for redistribution.
   *
!  *	@(#)vm_swp.c	7.1 (Berkeley) 6/5/86
   */
  
--- 4,8 ----
   * specifies the terms and conditions for redistribution.
   *
!  *	@(#)vm_swp.c	7.3- (Berkeley) 4/2/87
   */
  
***************
*** 43,47 ****
   * We simply initialize the header and queue the I/O but
   * do not wait for completion. When the I/O completes,
!  * iodone() will link the header to a list of cleaned
   * pages to be processed by the pageout daemon.
   */
--- 43,47 ----
   * We simply initialize the header and queue the I/O but
   * do not wait for completion. When the I/O completes,
!  * biodone() will link the header to a list of cleaned
   * pages to be processed by the pageout daemon.
   */
***************
*** 106,110 ****
  			if (c < nbytes)
  				panic("big push");
! 			return (error);
  		}
  		bp->b_un.b_addr += c;
--- 106,110 ----
  			if (c < nbytes)
  				panic("big push");
! 			return (0);
  		}
  		bp->b_un.b_addr += c;
***************
*** 178,183 ****
   * Raw I/O. The arguments are
   *	The strategy routine for the device
!  *	A buffer, which will always be a special buffer
!  *	  header owned exclusively by the device for this purpose
   *	The device number
   *	Read/write flag
--- 178,184 ----
   * Raw I/O. The arguments are
   *	The strategy routine for the device
!  *	A buffer, which will either be a special buffer
!  *	  header owned exclusively by the device for this purpose,
!  *	  or NULL, indicating that we should find one
   *	The device number
   *	Read/write flag
***************
*** 200,253 ****
  	register int c;
  	char *a;
! 	int s, error = 0;
  
! nextiov:
! 	if (uio->uio_iovcnt == 0)
! 		return (0);
! 	iov = uio->uio_iov;
! 	if (useracc(iov->iov_base,(u_int)iov->iov_len,rw==B_READ?B_WRITE:B_READ) == NULL)
! 		return (EFAULT);
! 	s = splbio();
! 	while (bp->b_flags&B_BUSY) {
! 		bp->b_flags |= B_WANTED;
! 		sleep((caddr_t)bp, PRIBIO+1);
! 	}
! 	splx(s);
! 	bp->b_error = 0;
! 	bp->b_proc = u.u_procp;
! 	bp->b_un.b_addr = iov->iov_base;
! 	while (iov->iov_len > 0) {
! 		bp->b_flags = B_BUSY | B_PHYS | rw;
! 		bp->b_dev = dev;
! 		bp->b_blkno = btodb(uio->uio_offset);
! 		bp->b_bcount = iov->iov_len;
! 		(*mincnt)(bp);
! 		c = bp->b_bcount;
! 		u.u_procp->p_flag |= SPHYSIO;
! 		vslock(a = bp->b_un.b_addr, c);
! 		physstrat(bp, strat, PRIBIO);
! 		(void) splbio();
! 		vsunlock(a, c, rw);
! 		u.u_procp->p_flag &= ~SPHYSIO;
! 		if (bp->b_flags&B_WANTED)
! 			wakeup((caddr_t)bp);
  		splx(s);
! 		c -= bp->b_resid;
! 		bp->b_un.b_addr += c;
! 		iov->iov_len -= c;
! 		uio->uio_resid -= c;
! 		uio->uio_offset += c;
  		/* temp kludge for tape drives */
! 		if (bp->b_resid || (bp->b_flags&B_ERROR))
  			break;
  	}
! 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
! 	error = geterror(bp);
! 	/* temp kludge for tape drives */
! 	if (bp->b_resid || error)
! 		return (error);
! 	uio->uio_iov++;
! 	uio->uio_iovcnt--;
! 	goto nextiov;
  }
  
--- 201,281 ----
  	register int c;
  	char *a;
! 	int s, release = 0, error = 0;
  
! 	if (bp == NULL) {
! 		/*
! 		 * Steal a swap I/O buffer header.  Make sure we
! 		 * will not deadlock in vslock().
! 		 */
! 		release = 1;
! 		s = splbio();
! 		while ((bp = bswlist.av_forw) == NULL || bp->av_forw == NULL) {
! 			bswlist.b_flags |= B_WANTED;
! 			sleep((caddr_t)&bswlist, PRIBIO+1);
! 		}
! 		bswlist.av_forw = bp->av_forw;
  		splx(s);
! 	}
! 	for (; uio->uio_iovcnt; uio->uio_iov++, uio->uio_iovcnt--) {
! 		iov = uio->uio_iov;
! 		if (!useracc(iov->iov_base, (u_int)iov->iov_len,
! 		    rw == B_READ ? B_WRITE : B_READ)) {
! 			error = EFAULT;
! 			break;
! 		}
! 		if (!release) {	/* only if sharing caller's buffer */
! 			s = splbio();
! 			while (bp->b_flags&B_BUSY) {
! 				bp->b_flags |= B_WANTED;
! 				sleep((caddr_t)bp, PRIBIO+1);
! 			}
! 			splx(s);
! 		}
! 		bp->b_error = 0;
! 		bp->b_proc = u.u_procp;
! 		bp->b_un.b_addr = iov->iov_base;
! 		while (iov->iov_len > 0) {
! 			bp->b_flags = B_BUSY | B_PHYS | rw;
! 			bp->b_dev = dev;
! 			bp->b_blkno = btodb(uio->uio_offset);
! 			bp->b_bcount = iov->iov_len;
! 			(*mincnt)(bp);
! 			c = bp->b_bcount;
! 			u.u_procp->p_flag |= SPHYSIO;
! 			vslock(a = bp->b_un.b_addr, c);
! 			physstrat(bp, strat, PRIBIO);
! 			(void) splbio();
! 			vsunlock(a, c, rw);
! 			u.u_procp->p_flag &= ~SPHYSIO;
! 			if (bp->b_flags&B_WANTED)	/* rare */
! 				wakeup((caddr_t)bp);
! 			splx(s);
! 			c -= bp->b_resid;
! 			bp->b_un.b_addr += c;
! 			iov->iov_len -= c;
! 			uio->uio_resid -= c;
! 			uio->uio_offset += c;
! 			/* temp kludge for tape drives */
! 			if (bp->b_resid || (bp->b_flags&B_ERROR))
! 				break;
! 		}
! 		bp->b_flags &= ~(B_BUSY | B_WANTED | B_PHYS);
! 		error = geterror(bp);
  		/* temp kludge for tape drives */
! 		if (bp->b_resid || error)
  			break;
  	}
! 	if (release) {
! 		s = splbio();
! 		bp->av_forw = bswlist.av_forw;
! 		bswlist.av_forw = bp;
! 		if (bswlist.b_flags & B_WANTED) {
! 			bswlist.b_flags &= ~B_WANTED;
! 			wakeup((caddr_t)&bswlist);
! 			wakeup((caddr_t)&proc[2]);
! 		}
! 		splx(s);
! 	}
! 	return (error);
  }
  
***************
*** 261,263 ****
--- 289,308 ----
  	if (bp->b_bcount > MAXPHYS)
  		bp->b_bcount = MAXPHYS;
+ }
+ 
+ rawread(dev, uio)
+ 	dev_t dev;
+ 	struct uio *uio;
+ {
+ 
+ 	return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL,
+ 		dev, B_READ, minphys, uio));
+ }
+ 
+ rawwrite(dev, uio)
+ 	dev_t dev;
+ 	struct uio *uio;
+ {
+ 	return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL,
+ 		dev, B_WRITE, minphys, uio));
  }
-- 
In-Real-Life: Chris Torek, Univ of MD Computer Science, +1 301 454 7163
(still on trantor.umd.edu because mimsy is not yet re-news-networked)
Domain: chris at mimsy.umd.edu		Path: ...!uunet!mimsy!chris



More information about the Comp.bugs.4bsd.ucb-fixes mailing list