File:  [DragonFly] / src / sys / i386 / i386 / Attic / globals.s
Revision 1.21: download - view: text, annotated - select for diffs
Thu Apr 29 17:24:58 2004 UTC (10 years, 6 months ago) by dillon
Branches: MAIN
CVS tags: HEAD
Rewrite the optimized memcpy/bcopy/bzero support subsystem.  Rip out the
old FreeBSD code almost entirely.

* Add support for stacked ONFAULT routines, allowing copyin and copyout to
  call the general memcpy entry point instead of rolling their own.

* Split memcpy/bcopy and bzero into their own files

* Add support for XMM (128 bit) and MMX (64 bit) media instruction copies

* Rewrite the integer code.  Also note that most of the previous integer
  and FP special case support had been ripped out of DragonFly long ago
  in that the assembly was no longer being referenced.  It doesn't make
  sense to have a dozen different zeroing/copying routines so focus on
  the ones that work well with recent (last ~5 years) cpus.

* Rewrite the FP state handling code.  Instead of restoring the FP state
  let it hang, which allows userland to make multiple syscalls and/or for
  the system to make multiple bcopy()/memcpy() calls without having to
  save/restore the FP state on each call.  Userland will take a fault when
  it needs the FP again.

  Note that FP optimized copies only occur for block sizes >= 2048 bytes,
  so this is not something that userland, or the kernel, will trip up on
  every time it tries to do a bcopy().

* LWKT threads need to be able to save the FP state, add the simple
  conditional and 5 lines of assembly required to do that.

AMD Athlon notes: 64 bit media instructions will get us 90% of the way
there.  It is possible to squeeze out slightly more memory bandwidth from
the 128 bit XMM instructions (SSE2).  While it does not exist in this commit
there are two additional features that can be used:  prefetching and
non-temporal writes.  Prefetching is a 3dNOW instruction and can squeeze
out significant additional performance if you fetch ~128 bytes ahead of
the game, but I believe it is AMD-only.  Non-temporal writes can double
UNCACHED memory bandwidth, but they have a horrible effect on L1/L2
performance and you can't mix non-temporal writes with normal writes without
completely destroying memory performance (e.g. multiple GB/s -> less than
100 MBytes/sec).

Neither prefetching nor non-temporal writes are implemented in this commit.

    1: /*-
    2:  * Copyright (c) Peter Wemm <peter@netplex.com.au>
    3:  * All rights reserved.
    4:  *
    5:  * Redistribution and use in source and binary forms, with or without
    6:  * modification, are permitted provided that the following conditions
    7:  * are met:
    8:  * 1. Redistributions of source code must retain the above copyright
    9:  *    notice, this list of conditions and the following disclaimer.
   10:  * 2. Redistributions in binary form must reproduce the above copyright
   11:  *    notice, this list of conditions and the following disclaimer in the
   12:  *    documentation and/or other materials provided with the distribution.
   13:  *
   14:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24:  * SUCH DAMAGE.
   25:  *
   26:  * $FreeBSD: src/sys/i386/i386/globals.s,v 1.13.2.1 2000/05/16 06:58:06 dillon Exp $
   27:  * $DragonFly: src/sys/i386/i386/globals.s,v 1.21 2004/04/29 17:24:58 dillon Exp $
   28:  */
   29: 
   30: #include <machine/asmacros.h>
   31: #include <machine/pmap.h>
   32: 
   33: #include "assym.s"
   34: 
   35: 	/*
   36: 	 * Define the layout of the per-cpu address space.  This is
   37: 	 * "constructed" in locore.s on the BSP and in mp_machdep.c for
   38: 	 * each AP.  DO NOT REORDER THESE WITHOUT UPDATING THE REST!
   39: 	 *
   40: 	 * On UP the per-cpu address space is simply placed in the data
   41: 	 * segment.
   42: 	 */
   43: 	.data
   44: 	.globl	CPU_prvspace, lapic
   45: 	.set	CPU_prvspace,(MPPTDI << PDRSHIFT)	/* assemble-time constant; MPPTDI/PDRSHIFT come from the included headers (presumably machine/pmap.h -- confirm) */
   46: 	.set	lapic,CPU_prvspace + (NPTEPG-1) * PAGE_SIZE	/* (NPTEPG-1) pages above CPU_prvspace */
   47: 
   48: 	.globl  gd_idlestack,gd_idlestack_top
   49: 	.set    gd_idlestack,PS_IDLESTACK	/* PS_* values are not defined in this file (presumably from assym.s -- confirm) */
   50: 	.set    gd_idlestack_top,PS_IDLESTACK_TOP
   51: 
   52: 	.globl	globaldata
   53: 	.set	globaldata,0	/* base used by every gd_* symbol in this file; being 0, each gd_* equals its bare GD_* value */
   54: 
   55: 	/*
   56: 	 * Define layout of the global data.  On SMP this lives in
   57: 	 * the per-cpu address space, otherwise it's in the data segment.
   58: 	 */
   59: 	.globl	gd_curthread, gd_npxthread, gd_reqflags, gd_common_tss
   60: 	.set	gd_curthread,globaldata + GD_CURTHREAD	/* pattern for all entries: gd_<field> = globaldata + GD_<FIELD> */
   61: 	.set	gd_npxthread,globaldata + GD_NPXTHREAD
   62: 	.set	gd_reqflags,globaldata + GD_REQFLAGS
   63: 	.set	gd_common_tss,globaldata + GD_COMMON_TSS
   64: 
   65: 	.globl	gd_common_tssd, gd_tss_gdt
   66: 	.set	gd_common_tssd,globaldata + GD_COMMON_TSSD
   67: 	.set	gd_tss_gdt,globaldata + GD_TSS_GDT
   68: 
   69: 	.globl	gd_currentldt
   70: 	.set	gd_currentldt,globaldata + GD_CURRENTLDT
   71: 
   72: 	.globl	gd_kernel_fpu_lock
   73: 	.set	gd_kernel_fpu_lock, globaldata + GD_KERNEL_FPU_LOCK	/* GD_* offsets are not defined here (presumably generated into assym.s -- confirm) */
   74: 
   75: 	/*
   76: 	 * The BSP versions of these get set up in locore.s and pmap.c, while
   77: 	 * the AP versions are set up in mp_machdep.c.
   78: 	 */
   79: 	.globl  gd_cpuid, gd_other_cpus
   80: 	.globl	gd_ss_eflags, gd_intr_nesting_level
   81: 	.globl  gd_CMAP1, gd_CMAP2, gd_CMAP3, gd_PMAP1
   82: 	.globl  gd_CADDR1, gd_CADDR2, gd_CADDR3, gd_PADDR1
   83: 	.globl  gd_ipending, gd_fpending, gd_cnt, gd_private_tss
   84: 
   85: 	.set    gd_cpuid,globaldata + GD_CPUID	/* every symbol exported above is defined below as globaldata + an offset constant */
   86: 	.set    gd_private_tss,globaldata + GD_PRIVATE_TSS
   87: 	.set    gd_other_cpus,globaldata + GD_OTHER_CPUS
   88: 	.set    gd_ss_eflags,globaldata + GD_SS_EFLAGS
   89: 	.set    gd_intr_nesting_level,globaldata + GD_INTR_NESTING_LEVEL
   90: 	.set    gd_CMAP1,globaldata + GD_PRV_CMAP1	/* note: CMAPn/PMAP1/CADDRn/PADDR1 use GD_PRV_* offset names, unlike the other fields */
   91: 	.set    gd_CMAP2,globaldata + GD_PRV_CMAP2
   92: 	.set    gd_CMAP3,globaldata + GD_PRV_CMAP3
   93: 	.set    gd_PMAP1,globaldata + GD_PRV_PMAP1
   94: 	.set    gd_CADDR1,globaldata + GD_PRV_CADDR1
   95: 	.set    gd_CADDR2,globaldata + GD_PRV_CADDR2
   96: 	.set    gd_CADDR3,globaldata + GD_PRV_CADDR3
   97: 	.set    gd_PADDR1,globaldata + GD_PRV_PADDR1
   98: 	.set	gd_fpending,globaldata + GD_FPENDING
   99: 	.set	gd_ipending,globaldata + GD_IPENDING
  100: 	.set	gd_cnt,globaldata + GD_CNT
  101: 
  102: #if defined(APIC_IO)
  103: 	.globl	lapic_eoi, lapic_svr, lapic_tpr, lapic_irr1, lapic_ver
  104: 	.globl	lapic_icr_lo,lapic_icr_hi,lapic_isr1
  105: /*
  106:  * Do not clutter our namespace with these unless we need them in other
  107:  * assembler code.  The C code uses different definitions.
  108:  */
  109: #if 0	/* full export list, intentionally compiled out; only the two .globl lines above are active */
  110: 	.globl	lapic_id,lapic_ver,lapic_tpr,lapic_apr,lapic_ppr,lapic_eoi
  111: 	.globl	lapic_ldr,lapic_dfr,lapic_svr,lapic_isr,lapic_isr0
  112: 	.globl	lapic_isr2,lapic_isr3,lapic_isr4,lapic_isr5,lapic_isr6
  113: 	.globl	lapic_isr7,lapic_tmr,lapic_tmr0,lapic_tmr1,lapic_tmr2
  114: 	.globl	lapic_tmr3,lapic_tmr4,lapic_tmr5,lapic_tmr6,lapic_tmr7
  115: 	.globl	lapic_irr,lapic_irr0,lapic_irr1,lapic_irr2,lapic_irr3
  116: 	.globl	lapic_irr4,lapic_irr5,lapic_irr6,lapic_irr7,lapic_esr
  117: 	.globl	lapic_lvtt,lapic_pcint,lapic_lvt1
  118: 	.globl	lapic_lvt2,lapic_lvt3,lapic_ticr,lapic_tccr,lapic_tdcr
  119: #endif	/* 0 */
  120: 	.set	lapic_id,	lapic + 0x020	/* all values below are byte offsets from the lapic symbol; presumably the hardware local APIC register map -- confirm against the CPU manual */
  121: 	.set	lapic_ver,	lapic + 0x030
  122: 	.set	lapic_tpr,	lapic + 0x080
  123: 	.set	lapic_apr,	lapic + 0x090
  124: 	.set	lapic_ppr,	lapic + 0x0a0
  125: 	.set	lapic_eoi,	lapic + 0x0b0
  126: 	.set	lapic_ldr,	lapic + 0x0d0
  127: 	.set	lapic_dfr,	lapic + 0x0e0
  128: 	.set	lapic_svr,	lapic + 0x0f0
  129: 	.set	lapic_isr,	lapic + 0x100	/* same address as lapic_isr0 */
  130: 	.set	lapic_isr0,	lapic + 0x100
  131: 	.set	lapic_isr1,	lapic + 0x110
  132: 	.set	lapic_isr2,	lapic + 0x120
  133: 	.set	lapic_isr3,	lapic + 0x130
  134: 	.set	lapic_isr4,	lapic + 0x140
  135: 	.set	lapic_isr5,	lapic + 0x150
  136: 	.set	lapic_isr6,	lapic + 0x160
  137: 	.set	lapic_isr7,	lapic + 0x170
  138: 	.set	lapic_tmr,	lapic + 0x180	/* same address as lapic_tmr0 */
  139: 	.set	lapic_tmr0,	lapic + 0x180
  140: 	.set	lapic_tmr1,	lapic + 0x190
  141: 	.set	lapic_tmr2,	lapic + 0x1a0
  142: 	.set	lapic_tmr3,	lapic + 0x1b0
  143: 	.set	lapic_tmr4,	lapic + 0x1c0
  144: 	.set	lapic_tmr5,	lapic + 0x1d0
  145: 	.set	lapic_tmr6,	lapic + 0x1e0
  146: 	.set	lapic_tmr7,	lapic + 0x1f0
  147: 	.set	lapic_irr,	lapic + 0x200	/* same address as lapic_irr0 */
  148: 	.set	lapic_irr0,	lapic + 0x200
  149: 	.set	lapic_irr1,	lapic + 0x210
  150: 	.set	lapic_irr2,	lapic + 0x220
  151: 	.set	lapic_irr3,	lapic + 0x230
  152: 	.set	lapic_irr4,	lapic + 0x240
  153: 	.set	lapic_irr5,	lapic + 0x250
  154: 	.set	lapic_irr6,	lapic + 0x260
  155: 	.set	lapic_irr7,	lapic + 0x270
  156: 	.set	lapic_esr,	lapic + 0x280
  157: 	.set	lapic_icr_lo,	lapic + 0x300
  158: 	.set	lapic_icr_hi,	lapic + 0x310
  159: 	.set	lapic_lvtt,	lapic + 0x320
  160: 	.set	lapic_pcint,	lapic + 0x340
  161: 	.set	lapic_lvt1,	lapic + 0x350
  162: 	.set	lapic_lvt2,	lapic + 0x360
  163: 	.set	lapic_lvt3,	lapic + 0x370
  164: 	.set	lapic_ticr,	lapic + 0x380
  165: 	.set	lapic_tccr,	lapic + 0x390
  166: 	.set	lapic_tdcr,	lapic + 0x3e0
  167: #endif	/* APIC_IO */
  168: