File:
[DragonFly] /
src /
sys /
i386 /
i386 /
Attic /
globals.s
Revision
1.21:
download - view:
text,
annotated -
select for diffs
Thu Apr 29 17:24:58 2004 UTC (9 years, 1 month ago) by
dillon
Branches:
MAIN
CVS tags:
HEAD
Rewrite the optimized memcpy/bcopy/bzero support subsystem. Rip out the
old FreeBSD code almost entirely.
* Add support for stacked ONFAULT routines, allowing copyin and copyout to
call the general memcpy entry point instead of rolling their own.
* Split memcpy/bcopy and bzero into their own files
* Add support for XMM (128 bit) and MMX (64 bit) media instruction copies
* Rewrite the integer code. Also note that most of the previous integer
and FP special case support had been ripped out of DragonFly long ago
in that the assembly was no longer being referenced. It doesn't make
sense to have a dozen different zeroing/copying routines so focus on
the ones that work well with recent (last ~5 years) cpus.
* Rewrite the FP state handling code. Instead of restoring the FP state
let it hang, which allows userland to make multiple syscalls and/or for
the system to make multiple bcopy()/memcpy() calls without having to
save/restore the FP state on each call. Userland will take a fault when
it needs the FP again.
Note that FP optimized copies only occur for block sizes >= 2048 bytes,
so this is not something that userland, or the kernel, will trip up on
every time it tries to do a bcopy().
* LWKT threads need to be able to save the FP state, add the simple
conditional and 5 lines of assembly required to do that.
AMD Athlon notes: 64 bit media instructions will get us 90% of the way
there. It is possible to squeeze out slightly more memory bandwidth from
the 128 bit XMM instructions (SSE2). While it does not exist in this commit
there are two additional features that can be used: prefetching and
non-temporal writes. Prefetching is a 3dNOW instruction and can squeeze
out significant additional performance if you fetch ~128 bytes ahead of
the game, but I believe it is AMD-only. Non-temporal writes can double
UNCACHED memory bandwidth, but they have a horrible effect on L1/L2
performance and you can't mix non-temporal writes with normal writes without
completely destroying memory performance (e.g. multiple GB/s -> less than
100 MBytes/sec).
Neither prefetching nor non-temporal writes are implemented in this commit.
1: /*-
2: * Copyright (c) Peter Wemm <peter@netplex.com.au>
3: * All rights reserved.
4: *
5: * Redistribution and use in source and binary forms, with or without
6: * modification, are permitted provided that the following conditions
7: * are met:
8: * 1. Redistributions of source code must retain the above copyright
9: * notice, this list of conditions and the following disclaimer.
10: * 2. Redistributions in binary form must reproduce the above copyright
11: * notice, this list of conditions and the following disclaimer in the
12: * documentation and/or other materials provided with the distribution.
13: *
14: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24: * SUCH DAMAGE.
25: *
26: * $FreeBSD: src/sys/i386/i386/globals.s,v 1.13.2.1 2000/05/16 06:58:06 dillon Exp $
27: * $DragonFly: src/sys/i386/i386/globals.s,v 1.21 2004/04/29 17:24:58 dillon Exp $
28: */
29:
30: #include <machine/asmacros.h>
31: #include <machine/pmap.h>
32:
33: #include "assym.s"
34:
35: /*
36: * Define the layout of the per-cpu address space. This is
37: * "constructed" in locore.s on the BSP and in mp_machdep.c for
38: * each AP. DO NOT REORDER THESE WITHOUT UPDATING THE REST!
39: *
40: * On UP the per-cpu address space is simply placed in the data
41: * segment.
42: */
43: .data
44: .globl CPU_prvspace, lapic
45: .set CPU_prvspace,(MPPTDI << PDRSHIFT) /* absolute address: MPPTDI shifted left by PDRSHIFT */
46: .set lapic,CPU_prvspace + (NPTEPG-1) * PAGE_SIZE /* page number NPTEPG-1 counted from CPU_prvspace */
47:
48: .globl gd_idlestack,gd_idlestack_top
49: .set gd_idlestack,PS_IDLESTACK /* PS_IDLESTACK* are not defined in this file; presumably from assym.s -- confirm */
50: .set gd_idlestack_top,PS_IDLESTACK_TOP
51:
52: .globl globaldata
53: .set globaldata,0 /* absolute 0; the gd_* field symbols defined after this point are globaldata + a constant */
54:
55: /*
56: * Define the layout of the global data: each gd_* symbol is globaldata plus a
57: * GD_* constant. On SMP this lives in the per-cpu address space, otherwise it's in the data segment.
58: */
59: .globl gd_curthread, gd_npxthread, gd_reqflags, gd_common_tss
60: .set gd_curthread,globaldata + GD_CURTHREAD /* GD_* constants are not defined here; presumably field offsets from assym.s -- confirm */
61: .set gd_npxthread,globaldata + GD_NPXTHREAD
62: .set gd_reqflags,globaldata + GD_REQFLAGS
63: .set gd_common_tss,globaldata + GD_COMMON_TSS
64:
65: .globl gd_common_tssd, gd_tss_gdt
66: .set gd_common_tssd,globaldata + GD_COMMON_TSSD
67: .set gd_tss_gdt,globaldata + GD_TSS_GDT
68:
69: .globl gd_currentldt
70: .set gd_currentldt,globaldata + GD_CURRENTLDT
71:
72: .globl gd_kernel_fpu_lock
73: .set gd_kernel_fpu_lock, globaldata + GD_KERNEL_FPU_LOCK
74:
75: /*
76: * The BSP versions of these are set up in locore.s and pmap.c, while
77: * the AP versions are set up in mp_machdep.c.
78: */
79: .globl gd_cpuid, gd_other_cpus
80: .globl gd_ss_eflags, gd_intr_nesting_level
81: .globl gd_CMAP1, gd_CMAP2, gd_CMAP3, gd_PMAP1
82: .globl gd_CADDR1, gd_CADDR2, gd_CADDR3, gd_PADDR1
83: .globl gd_ipending, gd_fpending, gd_cnt, gd_private_tss
84:
85: .set gd_cpuid,globaldata + GD_CPUID /* every symbol exported above is defined below as globaldata + a constant */
86: .set gd_private_tss,globaldata + GD_PRIVATE_TSS
87: .set gd_other_cpus,globaldata + GD_OTHER_CPUS
88: .set gd_ss_eflags,globaldata + GD_SS_EFLAGS
89: .set gd_intr_nesting_level,globaldata + GD_INTR_NESTING_LEVEL
90: .set gd_CMAP1,globaldata + GD_PRV_CMAP1 /* note: the CMAPn/PMAP1/CADDRn/PADDR1 symbols use GD_PRV_* constants, not GD_* */
91: .set gd_CMAP2,globaldata + GD_PRV_CMAP2
92: .set gd_CMAP3,globaldata + GD_PRV_CMAP3
93: .set gd_PMAP1,globaldata + GD_PRV_PMAP1
94: .set gd_CADDR1,globaldata + GD_PRV_CADDR1
95: .set gd_CADDR2,globaldata + GD_PRV_CADDR2
96: .set gd_CADDR3,globaldata + GD_PRV_CADDR3
97: .set gd_PADDR1,globaldata + GD_PRV_PADDR1
98: .set gd_fpending,globaldata + GD_FPENDING
99: .set gd_ipending,globaldata + GD_IPENDING
100: .set gd_cnt,globaldata + GD_CNT
101:
102: #if defined(APIC_IO) /* everything up to the matching #endif is assembled only when APIC_IO is defined */
103: .globl lapic_eoi, lapic_svr, lapic_tpr, lapic_irr1, lapic_ver /* only the symbols on this line and the next are exported */
104: .globl lapic_icr_lo,lapic_icr_hi,lapic_isr1
105: /*
106: * Do not clutter our namespace with these unless we need them in other
107: * assembler code. The C code uses different definitions.
108: */
109: #if 0 /* disabled: wider .globl list, kept for reference */
110: .globl lapic_id,lapic_ver,lapic_tpr,lapic_apr,lapic_ppr,lapic_eoi
111: .globl lapic_ldr,lapic_dfr,lapic_svr,lapic_isr,lapic_isr0
112: .globl lapic_isr2,lapic_isr3,lapic_isr4,lapic_isr5,lapic_isr6
113: .globl lapic_isr7,lapic_tmr,lapic_tmr0,lapic_tmr1,lapic_tmr2
114: .globl lapic_tmr3,lapic_tmr4,lapic_tmr5,lapic_tmr6,lapic_tmr7
115: .globl lapic_irr,lapic_irr0,lapic_irr1,lapic_irr2,lapic_irr3
116: .globl lapic_irr4,lapic_irr5,lapic_irr6,lapic_irr7,lapic_esr
117: .globl lapic_lvtt,lapic_pcint,lapic_lvt1
118: .globl lapic_lvt2,lapic_lvt3,lapic_ticr,lapic_tccr,lapic_tdcr
119: #endif /* 0 */
120: .set lapic_id, lapic + 0x020 /* each symbol below is lapic plus a fixed byte offset; all offsets are multiples of 0x10 */
121: .set lapic_ver, lapic + 0x030
122: .set lapic_tpr, lapic + 0x080
123: .set lapic_apr, lapic + 0x090
124: .set lapic_ppr, lapic + 0x0a0
125: .set lapic_eoi, lapic + 0x0b0
126: .set lapic_ldr, lapic + 0x0d0
127: .set lapic_dfr, lapic + 0x0e0
128: .set lapic_svr, lapic + 0x0f0
129: .set lapic_isr, lapic + 0x100 /* same address as lapic_isr0 */
130: .set lapic_isr0, lapic + 0x100
131: .set lapic_isr1, lapic + 0x110
132: .set lapic_isr2, lapic + 0x120
133: .set lapic_isr3, lapic + 0x130
134: .set lapic_isr4, lapic + 0x140
135: .set lapic_isr5, lapic + 0x150
136: .set lapic_isr6, lapic + 0x160
137: .set lapic_isr7, lapic + 0x170
138: .set lapic_tmr, lapic + 0x180 /* same address as lapic_tmr0 */
139: .set lapic_tmr0, lapic + 0x180
140: .set lapic_tmr1, lapic + 0x190
141: .set lapic_tmr2, lapic + 0x1a0
142: .set lapic_tmr3, lapic + 0x1b0
143: .set lapic_tmr4, lapic + 0x1c0
144: .set lapic_tmr5, lapic + 0x1d0
145: .set lapic_tmr6, lapic + 0x1e0
146: .set lapic_tmr7, lapic + 0x1f0
147: .set lapic_irr, lapic + 0x200 /* same address as lapic_irr0 */
148: .set lapic_irr0, lapic + 0x200
149: .set lapic_irr1, lapic + 0x210
150: .set lapic_irr2, lapic + 0x220
151: .set lapic_irr3, lapic + 0x230
152: .set lapic_irr4, lapic + 0x240
153: .set lapic_irr5, lapic + 0x250
154: .set lapic_irr6, lapic + 0x260
155: .set lapic_irr7, lapic + 0x270
156: .set lapic_esr, lapic + 0x280
157: .set lapic_icr_lo, lapic + 0x300
158: .set lapic_icr_hi, lapic + 0x310
159: .set lapic_lvtt, lapic + 0x320
160: .set lapic_pcint, lapic + 0x340 /* no symbol is defined for offset 0x330 */
161: .set lapic_lvt1, lapic + 0x350
162: .set lapic_lvt2, lapic + 0x360
163: .set lapic_lvt3, lapic + 0x370
164: .set lapic_ticr, lapic + 0x380
165: .set lapic_tccr, lapic + 0x390
166: .set lapic_tdcr, lapic + 0x3e0
167: #endif /* APIC_IO */
168: