File:  [DragonFly] / src / sys / i386 / include / Attic / globaldata.h
Revision 1.24: download - view: text, annotated - select for diffs
Thu Apr 29 17:25:00 2004 UTC (9 years, 11 months ago) by dillon
Branches: MAIN
CVS tags: HEAD
Rewrite the optimized memcpy/bcopy/bzero support subsystem.  Rip out the
old FreeBSD code almost entirely.

* Add support for stacked ONFAULT routines, allowing copyin and copyout to
  call the general memcpy entry point instead of rolling their own.

* Split memcpy/bcopy and bzero into their own files

* Add support for XMM (128 bit) and MMX (64 bit) media instruction copies

* Rewrite the integer code.  Also note that most of the previous integer
  and FP special case support had been ripped out of DragonFly long ago
  in that the assembly was no longer being referenced.  It doesn't make
  sense to have a dozen different zeroing/copying routines so focus on
  the ones that work well with recent (last ~5 years) cpus.

* Rewrite the FP state handling code.  Instead of restoring the FP state
  let it hang, which allows userland to make multiple syscalls and/or for
  the system to make multiple bcopy()/memcpy() calls without having to
  save/restore the FP state on each call.  Userland will take a fault when
  it needs the FP again.

  Note that FP optimized copies only occur for block sizes >= 2048 bytes,
  so this is not something that userland, or the kernel, will trip up on
  every time it tries to do a bcopy().

* LWKT threads need to be able to save the FP state, add the simple
  conditional and 5 lines of assembly required to do that.

AMD Athlon notes: 64 bit media instructions will get us 90% of the way
there.  It is possible to squeeze out slightly more memory bandwidth from
the 128 bit XMM instructions (SSE2).  While it does not exist in this commit
there are two additional features that can be used:  prefetching and
non-temporal writes.  Prefetching is a 3dNOW instruction and can squeeze
out significant additional performance if you fetch ~128 bytes ahead of
the game, but I believe it is AMD-only.  Non-temporal writes can double
UNCACHED memory bandwidth, but they have a horrible effect on L1/L2
performance and you can't mix non-temporal writes with normal writes without
completely destroying memory performance (e.g. multiple GB/s -> less than
100 MBytes/sec).

Neither prefetching nor non-temporal writes are implemented in this commit.

    1: /*-
    2:  * Copyright (c) Peter Wemm <peter@netplex.com.au>
    3:  * All rights reserved.
    4:  *
    5:  * Redistribution and use in source and binary forms, with or without
    6:  * modification, are permitted provided that the following conditions
    7:  * are met:
    8:  * 1. Redistributions of source code must retain the above copyright
    9:  *    notice, this list of conditions and the following disclaimer.
   10:  * 2. Redistributions in binary form must reproduce the above copyright
   11:  *    notice, this list of conditions and the following disclaimer in the
   12:  *    documentation and/or other materials provided with the distribution.
   13:  *
   14:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24:  * SUCH DAMAGE.
   25:  *
   26:  *	Only machine-dependent code should ever include this file.  MI
   27:  *	code and header files do NOT include this file.  e.g. sys/globaldata.h
   28:  *	should not include this file.
   29:  *
   30:  * $FreeBSD: src/sys/i386/include/globaldata.h,v 1.11.2.1 2000/05/16 06:58:10 dillon Exp $
   31:  * $DragonFly: src/sys/i386/include/globaldata.h,v 1.24 2004/04/29 17:25:00 dillon Exp $
   32:  */
   33: 
   34: #ifndef _MACHINE_GLOBALDATA_H_
   35: #define _MACHINE_GLOBALDATA_H_
   36: 
   37: #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
   38: 
   39: #ifndef _SYS_GLOBALDATA_H_
   40: #include <sys/globaldata.h>	/* struct globaldata */
   41: #endif
   42: #ifndef _SYS_THREAD_H_
   43: #include <sys/thread.h>		/* struct thread */
   44: #endif
   45: #ifndef _MACHINE_SEGMENTS_H_
   46: #include "segments.h"	/* struct segment_descriptor */
   47: #endif
   48: #ifndef _MACHINE_TSS_H_
   49: #include "tss.h"	/* struct i386tss */
   50: #endif
   51: 
   52: /*
   53:  * Note on interrupt control.  Pending interrupts not yet dispatched are
   54:  * marked in gd_fpending or gd_ipending.  Once dispatched the interrupt's
   55:  * pending bit is cleared and the interrupt is masked.  Upon completion
   56:  * the interrupt is unmasked.
   57:  *
   58:  * For edge-triggered interrupts, interrupts may be enabled again at this
   59:  * point and if they occur before the interrupt service routine is complete
   60:  * the service routine will loop.
   61:  *
   62:  * The current thread's cpl is stored in the thread structure.
   63:  */
   64: struct mdglobaldata {	/* machine-dependent per-cpu data, wrapping the MI globaldata */
   65: 	struct globaldata mi;	/* MI portion; it is the first member and mdcpu casts _get_mycpu() to this struct */
   66: 	struct segment_descriptor gd_common_tssd;	/* NOTE(review): presumably the descriptor for gd_common_tss -- confirm */
   67: 	struct segment_descriptor *gd_tss_gdt;	/* NOTE(review): looks like this cpu's TSS slot in the GDT -- confirm */
   68: 	struct thread   *gd_npxthread;	/* NOTE(review): presumably the thread owning the FPU (npx) state -- confirm */
   69: 	struct i386tss  gd_common_tss;
   70: 	int		gd_kernel_fpu_lock;	/* fast bcopy/zero cpu lock */
   71: 	int		gd_fpending;	/* fast interrupt pending */
   72: 	int		gd_ipending;	/* normal interrupt pending */
   73: 	int		gd_idelayed;	/* delayed software ints */
   74: 	int		gd_currentldt;
   75: 	int		gd_private_tss;
   76: 	u_int		unused001;	/* placeholder; unused judging by its name */
   77: 	u_int		gd_other_cpus;
   78: 	u_int		gd_ss_eflags;
   79: 	pt_entry_t	*gd_CMAP1;	/* NOTE(review): gd_CMAPn/gd_CADDRn presumably pair with CPAGEn in struct privatespace -- confirm */
   80: 	pt_entry_t	*gd_CMAP2;
   81: 	pt_entry_t	*gd_CMAP3;
   82: 	pt_entry_t	*gd_PMAP1;	/* NOTE(review): gd_PMAP1/gd_PADDR1 presumably pair with PPAGE1 -- confirm */
   83: 	caddr_t		gd_CADDR1;
   84: 	caddr_t		gd_CADDR2;
   85: 	caddr_t		gd_CADDR3;
   86: 	unsigned	*gd_PADDR1;
   87: 	u_int		gd_acpi_id;
   88: 	u_int		gd_apic_id;
   89: };
   90: 
   91: /*
   92:  * This is the upper (0xff800000) address space layout that is per-cpu.
   93:  * It is set up in locore.s and pmap.c for the BSP and in mp_machdep.c for
   94:  * each AP.  genassym helps export this to the assembler code.
   95:  *
   96:  * WARNING!  page-bounded fields are hardwired for SMPpt[] setup in
   97:  * i386/i386/mp_machdep.c and locore.s.
   98:  */
   99: struct privatespace {	/* per-cpu private region; every field group below occupies whole pages */
  100: 	/* page 0 - data page */
  101: 	struct mdglobaldata mdglobaldata;
  102: 	char		__filler0[PAGE_SIZE - sizeof(struct mdglobaldata)];	/* pads page 0 out to PAGE_SIZE; the array size goes negative (compile error) if mdglobaldata outgrows one page */
  103: 
  104: 	/* page 1..4 - CPAGE1,CPAGE2,CPAGE3,PPAGE1 */
  105: 	char		CPAGE1[PAGE_SIZE];		/* SMPpt[1] */
  106: 	char		CPAGE2[PAGE_SIZE];		/* SMPpt[2] */
  107: 	char		CPAGE3[PAGE_SIZE];		/* SMPpt[3] */
  108: 	char		PPAGE1[PAGE_SIZE];		/* SMPpt[4] */
  109: 
  110: 	/* page 5..4+UPAGES - idle stack (UPAGES pages) */
  111: 	char		idlestack[UPAGES * PAGE_SIZE];	/* SMPpt[5..] */
  112: };
  113: #define mdcpu  		((struct mdglobaldata *)_get_mycpu())	/* _get_mycpu() result viewed as the MD struct; relies on 'mi' being the first member of struct mdglobaldata */
  114: 
  115: #endif
  116: 
  117: #ifdef _KERNEL
  118: 
  119: extern struct privatespace CPU_prvspace[];	/* defined elsewhere; presumably one entry per cpu -- confirm */
  120: 
  121: #endif
  122: 
  123: #endif