/******************************************************************************
Copyright (c) 2001 Advanced Micro Devices, Inc.
LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
NOT APPLY TO YOU.
AMD does not assume any responsibility for any errors which may appear in the
Materials nor any responsibility to support or update the Materials. AMD retains
the right to make changes to its test specifications at any time, without notice.
NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
further information, software, technical information, know-how, or show-how
available to you.
So that all may benefit from your experience, please report any problems
or suggestions about this software to 3dsdk.support@amd.com
AMD Developer Technologies, M/S 585
Advanced Micro Devices, Inc.
5900 E. Ben White Blvd.
Austin, TX 78741
3dsdk.support@amd.com
******************************************************************************/
#pragma once
#include "memcpy_amd.h"
/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/
// Very optimized memcpy() routine for the AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store") and the software prefetch instruction prefetchnta,
// be sure you're running on an Athlon/Duron or another CPU that supports
// these instructions before calling!
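// ---------------------------------------------------------------------------
// Illustrative caller-side guard (not part of the original AMD code): a
// minimal sketch of the NOTE above.  The dispatcher below also falls back to
// a plain rep movsd copy internally, but a caller can still gate the call on
// the detected CPU type.  MEMCPY_AMD_EXAMPLES is a hypothetical macro that
// keeps this sketch out of normal builds; the prototypes simply repeat the
// functions defined later in this file.
// ---------------------------------------------------------------------------
#ifdef MEMCPY_AMD_EXAMPLES
#include <string.h>
unsigned long get_cpu_type();
void * _stdcall memcpy_optimized(void *dest, const void *src, size_t n);
static void *copy_with_fallback(void *dest, const void *src, size_t n)
{
	if (get_cpu_type() >= 2)               /* 2 = MMX or better, see the table below */
		return memcpy_optimized(dest, src, n);
	return memcpy(dest, src, n);           /* pre-MMX CPU: plain CRT copy */
}
#endif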
#define IN_CACHE_COPY 64 * 1024   // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024  // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch (documentation only; never referenced below)
#define CACHEBLOCK 80h               // number of 64-byte blocks (cache lines) for block prefetch (MASM hex; used only inside the __asm blocks)
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
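// ---------------------------------------------------------------------------
// Sketch of the size-based tier selection the thresholds above describe,
// following the MMX/AMD code path (illustrative only; the assembly below
// performs the equivalent comparisons on the 64-byte block count, and the
// SSE path uses a larger "tiny" cutoff).  Same hypothetical
// MEMCPY_AMD_EXAMPLES guard as above.
// ---------------------------------------------------------------------------
#ifdef MEMCPY_AMD_EXAMPLES
#include <stddef.h>
static const char *copy_tier_for_size(size_t n)
{
	if (n < 128)           return "tiny: rep movsd / movsb";
	if (n < IN_CACHE_COPY) return "in-cache: movq loads/stores + prefetchnta";
	if (n < UNCACHED_COPY) return "uncached: movq + movntq streaming stores";
	return "huge: block prefetch, then movq + movntq";
}
#endif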
// Inline assembly syntax for use with Visual C++
///////////////////////////////////////////////////////////////////////////////
// katsyonak:     Added MMX & SSE optimized memcpy       - October 8, 2003
// katsyonak:     Added AMD, MMX & SSE optimized memset  - October 12, 2003
// Aw3/katsyonak: Added AMD, MMX & SSE optimized memzero - February 11, 2004
///////////////////////////////////////////////////////////////////////////////
static unsigned long CPU_Type = 0;
// 0 = CPU check not performed yet (auto-detected on first call)
// 1 = No optimization
// 2 = MMX
// 3 = MMX2 for AMD Athlon/Duron and above (may also work on Intel MMX2/Katmai machines)
// 4 = SSE
// 5 = SSE2 (detected only to identify the Pentium 4; the SSE code path is used)
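// ---------------------------------------------------------------------------
// The same table as a C helper (illustrative; the assembly works directly
// with the numeric values stored in CPU_Type).
// ---------------------------------------------------------------------------
#ifdef MEMCPY_AMD_EXAMPLES
static const char *cpu_type_name(unsigned long t)
{
	switch (t)
	{
	case 1:  return "no optimization";
	case 2:  return "MMX";
	case 3:  return "MMX2 (AMD Athlon/Duron)";
	case 4:  return "SSE";
	case 5:  return "SSE2 (SSE code path used)";
	default: return "not detected yet";
	}
}
#endif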
unsigned long get_cpu_type()
{
__asm
{
mov eax, [CPU_Type]
cmp eax, 5
ja do_detect
or eax, eax
jne ret_eax
do_detect:
xor eax, eax
cpuid
or eax, eax
mov eax, 1 ;No optimization
je cpu_done
xor esi, esi
cmp ebx, 68747541h ;Auth
jne not_amd
cmp edx, 69746E65h ;enti
jne not_amd
cmp ecx, 444D4163h ;cAMD
jne not_amd
inc esi
not_amd:
;mov eax,1 (not needed: eax already holds 1 from above, so the next CPUID reads leaf 1)
cpuid
mov al, 1 ;No optimization
bt edx, 23 ;MMX Feature Bit
jnb ret_al
or esi, esi
je check_sse
and ah, 1111b ;isolate the family field
cmp ah, 6 ;family 6 (K7) = Athlon, Duron
jb cpu_mmx
mov eax, 80000000h
cpuid
cmp eax, 80000000h
jbe cpu_mmx
mov eax, 80000001h
cpuid
bt edx, 31 ;AMD Feature Bit
jnb cpu_mmx
mov al, 3 ;AMD
jmp ret_al
check_sse:
bt edx, 25 ;SSE Feature Bit
jb cpu_sse
cpu_mmx:
mov al, 2
jmp ret_al
cpu_sse:
mov al, 4 ;SSE
bt edx, 26 ;SSE2 Feature Bit
adc al, 0
ret_al:
movzx eax,al
cpu_done:
mov [CPU_Type], eax
ret_eax:
}
}
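// ---------------------------------------------------------------------------
// Roughly equivalent detection written with the MSVC __cpuid intrinsic, as an
// illustrative sketch (assumes <intrin.h>; the inline-assembly version above
// is what the copy routines actually call, and it also caches its result in
// CPU_Type).
// ---------------------------------------------------------------------------
#ifdef MEMCPY_AMD_EXAMPLES
#include <intrin.h>
static unsigned long get_cpu_type_c(void)
{
	int r[4];                                   /* EAX, EBX, ECX, EDX */
	int is_amd;
	__cpuid(r, 0);
	if (r[0] == 0)
		return 1;                               /* no CPUID leaf 1: no optimization */
	is_amd = (r[1] == 0x68747541 &&             /* "Auth" */
	          r[3] == 0x69746E65 &&             /* "enti" */
	          r[2] == 0x444D4163);              /* "cAMD" */
	__cpuid(r, 1);
	if (!(r[3] & (1 << 23)))                    /* MMX feature bit */
		return 1;
	if (is_amd)
	{
		if (((r[0] >> 8) & 0xF) >= 6)           /* family 6 (K7) = Athlon, Duron */
		{
			int x[4];
			__cpuid(x, 0x80000000);
			if ((unsigned)x[0] > 0x80000000u)
			{
				__cpuid(x, 0x80000001);
				if (x[3] & (1u << 31))          /* the extended "AMD feature bit" tested above */
					return 3;
			}
		}
		return 2;                               /* AMD without the extension: plain MMX */
	}
	if (r[3] & (1 << 25))                       /* SSE feature bit */
		return (r[3] & (1 << 26)) ? 5 : 4;      /* SSE2 bumps the value to 5, as above */
	return 2;                                   /* non-AMD, MMX only */
}
#endif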
static unsigned long memcpyProc = 0;
static unsigned long memsetProc = 0;
static unsigned long memzeroProc = 0;
void * _stdcall memcpy_optimized(void *dest, const void *src, size_t n)
{
__asm
{
mov ebx, [n] ; number of bytes to copy
mov edi, [dest] ; destination
mov esi, [src] ; source
push edi
mov ecx, [memcpyProc]
jecxz $memcpy_detect
jmp ecx
$memcpy_detect:
push ebx
push esi
push edi
call get_cpu_type
mov ecx, offset copy_sse
cmp al, 3
ja addr_done
mov ecx, offset copy_amd
je addr_done
mov ecx, offset copy_mmx
cmp al, 1
ja addr_done
mov ecx, offset copy_rep
addr_done:
mov [memcpyProc], ecx
pop edi
pop esi
pop ebx
jmp ecx
align 16
copy_sse:
cmp ebx, 512
jb copy_mmx ; tiny? skip optimized copy
mov ecx, 16 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 1111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_sse_align_done
jmp ecx ; jump to array of movsb's
align 16
$memcpy_sse_ic_1_a: ; 64-byte block copies, in-cache copy
prefetchnta [esi + 320] ; start reading ahead
movaps xmm0, [esi] ; read 128 bits
movaps xmm1, [esi+16]
movaps xmm2, [esi+32]
movaps xmm3, [esi+48]
add esi, 64 ; update source pointer
movntps [edi], xmm0 ; write 128 bits
movntps [edi+16], xmm1
movntps [edi+32], xmm2
movntps [edi+48], xmm3
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_sse_ic_1_a ; last 64-byte block?
sfence ; flush the write buffer
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_sse_align_done: ; destination is double quadword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
test esi, 1111b ; Is the source address aligned?
je $memcpy_sse_ic_1_a
// This is a small-block copy that uses the SSE registers to copy 16 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_sse_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + 320] ; start reading ahead
movups xmm0, [esi] ; read 128 bits
movups xmm1, [esi+16]
movups xmm2, [esi+32]
movups xmm3, [esi+48]
add esi, 64 ; update source pointer
movntps [edi], xmm0 ; write 128 bits
movntps [edi+16], xmm1
movntps [edi+32], xmm2
movntps [edi+48], xmm3
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_sse_ic_1 ; last 64-byte block?
sfence ; flush the write buffer
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
align 16
copy_amd:
cmp ebx, 128
jb copy_rep ; tiny? skip optimized copy
cmp ebx, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_amd_do_align ; it appears to be slower
cmp ebx, 64*1024
jbe $memcpy_amd_align_done
$memcpy_amd_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_amd_align_done
jmp ecx ; jump to array of movsb's
$memcpy_amd_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_amd_bp_1
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_amd_uc_1: ; 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi] ; read 64 bits
add edi, 64 ; update destination pointer
movq mm1, [esi+8]
add esi, 64 ; update source pointer
movq mm2, [esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0, [esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1, [esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2, [esi-24]
movntq [edi-40], mm0
movq mm0, [esi-16]
movntq [edi-32], mm1
movq mm1, [esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $memcpy_amd_uc_1 ; last 64-byte block?
sfence ; flush the write buffer
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_amd_bp_1: ; large blocks, block prefetch copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_amd_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order
dec eax ; count down the cache lines
jnz $memcpy_amd_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_amd_bp_3:
movq mm0, [esi] ; read 64 bits
movq mm1, [esi+8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi], mm0 ; write 64 bits, bypassing cache
movntq [edi+8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_amd_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jbe $memcpy_done ; no more 64-byte blocks left
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jae $memcpy_amd_bp_1 ; yes, keep processing chunks
jmp $memcpy_amd_uc_1 ; 64-byte blocks, uncached copy
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_amd_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
cmp ecx, IN_CACHE_COPY/64 ; too big for cache? use uncached copy
jae $memcpy_amd_uc_test
// This is a small-block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_amd_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi] ; read 64 bits
movq mm1, [esi+8]
movq [edi], mm0 ; write 64 bits
movq [edi+8], mm1 ; note: the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_amd_ic_1 ; last 64-byte block?
$memcpy_done:
sfence ; flush the write buffer
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
align 16
copy_mmx:
cmp ebx, 128
jb copy_rep ; tiny? skip optimized copy
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_mmx_align_done
jmp ecx ; jump to array of movsb's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_mmx_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
align 16
$memcpy_mmx_ic_1:
movq mm0, [esi] ; read 64 bits
movq mm1, [esi+8]
movq [edi], mm0 ; write 64 bits
movq [edi+8], mm1
movq mm2, [esi+16]
movq mm3, [esi+24]
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_mmx_ic_1 ; last 64-byte block?
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
align 16
copy_rep:
mov ecx, ebx
shr ecx, 2
and ebx, 11b ; ebx isn't required any more
rep movsd
mov ecx, ebx
rep movsb
jmp $memcpy_exit
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
rep movsb ; the last 1, 2, or 3 bytes
emms
$memcpy_exit:
pop eax // [dest] ; ret value = destination pointer
}
}
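// ---------------------------------------------------------------------------
// memcpyProc above caches the address of the routine chosen on the first call
// (copy_sse / copy_amd / copy_mmx / copy_rep), so every later call is a single
// indirect jump.  The same resolve-once idea in portable C, as a sketch with
// hypothetical helpers standing in for those labels:
// ---------------------------------------------------------------------------
#ifdef MEMCPY_AMD_EXAMPLES
#include <string.h>
typedef void *(*copy_fn)(void *, const void *, size_t);
static void *copy_small(void *d, const void *s, size_t n) { return memcpy(d, s, n); }
static void *copy_wide (void *d, const void *s, size_t n) { return memcpy(d, s, n); } /* stand-in body */
static copy_fn resolved_copy = 0;                   /* plays the role of memcpyProc */
static void *dispatching_copy(void *d, const void *s, size_t n)
{
	if (!resolved_copy)                             /* first call: detect and pick once */
		resolved_copy = (get_cpu_type() >= 2) ? copy_wide : copy_small;
	return resolved_copy(d, s, n);                  /* later calls: one indirect call */
}
#endif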
void* _stdcall memset_optimized(void *dest, int c, size_t n)
{
__asm
{
mov ebx, [n] ; number of bytes to fill
mov edi, [dest] ; destination
movzx eax, byte ptr [c] ; character (memset keeps only the low byte of c)
mov ah, al
mov ecx, eax
shl ecx, 16
push edi
or eax, ecx
mov ecx,[memsetProc]
jecxz $memset_detect
jmp ecx
$memset_detect:
push eax
push ebx
push edi
call get_cpu_type
mov ecx, offset fill_sse
cmp al, 3
ja addr_done
mov ecx, offset fill_amd
je addr_done
mov ecx, offset fill_mmx
cmp al, 1
ja addr_done
mov ecx, offset fill_rep
addr_done:
mov [memsetProc], ecx
pop edi
pop ebx
pop eax
jmp ecx
align 16
fill_sse:
cmp ebx, 2048
jb fill_mmx ; tiny? skip optimized fill
mov ecx, 16 ; a trick that's faster than rep stosb...
sub ecx, edi ; align destination to qword
and ecx, 1111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memset_sse_align_done
jmp ecx ; jump to array of stosb's
align 4
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
$memset_sse_align_done: ; destination is double quadword aligned
mov ecx, ebx ; number of bytes left to fill
shr ecx, 6 ; get 64-byte block count
push eax
push eax
push eax
push eax
movups xmm0, [esp]
add esp, 16
align 16
$memset_sse_ic_1:
movntps [edi], xmm0 ; write 128 bits
movntps [edi+16], xmm0
movntps [edi+32], xmm0
movntps [edi+48], xmm0
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memset_sse_ic_1 ; last 64-byte block?
sfence ; flush the write buffer
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memset_last_few
jmp ecx ; jump to array of stosd's
align 16
fill_amd:
cmp ebx, 128
jb fill_rep ; tiny? skip optimized fill
mov ecx, 8 ; a trick that's faster than rep stosb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update fill count
neg ecx ; set up to jump into the array
add ecx, offset $memset_amd_align_done
jmp ecx ; jump to array of stosb's
align 4
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
$memset_amd_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to fill
shr ecx, 6 ; get 64-byte block count
movd mm0, eax
punpckldq mm0, mm0
align 16
$memset_amd_ic_1:
movntq [edi], mm0 ; write 64 bits
movntq [edi+8], mm0
movntq [edi+16], mm0
movntq [edi+24], mm0
movntq [edi+32], mm0
movntq [edi+40], mm0
movntq [edi+48], mm0
movntq [edi+56], mm0
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memset_amd_ic_1 ; last 64-byte block?
sfence ; flush the write buffer
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memset_last_few
jmp ecx ; jump to array of stosd's
align 16
fill_mmx:
cmp ebx, 192
jb fill_rep ; tiny? skip optimized fill
mov ecx, 8 ; a trick that's faster than rep stosb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update fill count
neg ecx ; set up to jump into the array
add ecx, offset $memset_mmx_align_done
jmp ecx ; jump to array of stosb's
align 4
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
$memset_mmx_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to fill
shr ecx, 6 ; get 64-byte block count
movd mm0, eax
punpckldq mm0, mm0
align 16
$memset_mmx_ic_1:
movq [edi], mm0 ; write 64 bits
movq [edi+8], mm0
movq [edi+16], mm0
movq [edi+24], mm0
movq [edi+32], mm0
movq [edi+40], mm0
movq [edi+48], mm0
movq [edi+56], mm0
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memset_mmx_ic_1 ; last 64-byte block?
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memset_last_few
jmp ecx ; jump to array of stosd's
align 16
fill_rep:
mov ecx, ebx
shr ecx, 2
and ebx, 11b ; ebx isn't required any more
rep stosd
mov ecx, ebx
rep stosb
jmp $memset_exit
align 4
stosd
stosd ; perform last 1-15 dword fills
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
$memset_last_few: ; dword aligned from before stosd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
rep stosb ; the last 1, 2, or 3 bytes
emms
$memset_exit:
pop eax // [dest] ; ret value = destination pointer
}
}
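// ---------------------------------------------------------------------------
// Note on the fill value: the code above keeps only the low byte of c (memset
// semantics) and replicates it across eax with mov ah, al / shl / or before
// the wide stores.  The same byte splat in C, as a sketch:
// ---------------------------------------------------------------------------
#ifdef MEMCPY_AMD_EXAMPLES
static unsigned long splat_fill_byte(int c)
{
	unsigned long b = (unsigned char)c;              /* keep only the low byte */
	return b | (b << 8) | (b << 16) | (b << 24);     /* e.g. 0xAB -> 0xABABABABul */
}
#endif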
void _stdcall memzero_optimized(void *dest, size_t n)
{
__asm
{
mov ebx, [n] ; number of bytes to fill
mov edi, [dest] ; destination
xor eax, eax
mov ecx,[memzeroProc]
jecxz $memzero_detect
jmp ecx
$memzero_detect:
push ebx
push edi
call get_cpu_type
mov ecx, offset fill_sse
cmp al, 3
ja addr_done
mov ecx, offset fill_amd
je addr_done
mov ecx, offset fill_mmx
cmp al, 1
ja addr_done
mov ecx, offset fill_rep
addr_done:
mov [memzeroProc], ecx
pop edi
pop ebx
xor eax, eax
jmp ecx
align 16
fill_sse:
cmp ebx, 2048
jb fill_mmx ; tiny? skip optimized fill
mov ecx, 16 ; a trick that's faster than rep stosb...
sub ecx, edi ; align destination to qword
and ecx, 1111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memzero_sse_align_done
jmp ecx ; jump to array of stosb's
align 4
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
$memzero_sse_align_done: ; destination is double quadword aligned
mov ecx, ebx ; number of bytes left to fill
shr ecx, 6 ; get 64-byte block count
xorps xmm0, xmm0
align 16
$memzero_sse_ic_1:
movntps [edi], xmm0 ; write 128 bits
movntps [edi+16], xmm0
movntps [edi+32], xmm0
movntps [edi+48], xmm0
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memzero_sse_ic_1 ; last 64-byte block?
sfence ; flush the write buffer
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memzero_last_few
jmp ecx ; jump to array of stosd's
align 16
fill_amd:
cmp ebx, 128
jb fill_rep ; tiny? skip optimized fill
mov ecx, 8 ; a trick that's faster than rep stosb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update fill count
neg ecx ; set up to jump into the array
add ecx, offset $memzero_amd_align_done
jmp ecx ; jump to array of stosb's
align 4
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
$memzero_amd_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to fill
shr ecx, 6 ; get 64-byte block count
pxor mm0, mm0
align 16
$memzero_amd_ic_1:
movntq [edi], mm0 ; write 64 bits
movntq [edi+8], mm0
movntq [edi+16], mm0
movntq [edi+24], mm0
movntq [edi+32], mm0
movntq [edi+40], mm0
movntq [edi+48], mm0
movntq [edi+56], mm0
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memzero_amd_ic_1 ; last 64-byte block?
sfence ; flush the write buffer
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memzero_last_few
jmp ecx ; jump to array of stosd's
align 16
fill_mmx:
cmp ebx, 192
jb fill_rep ; tiny? skip optimized fill
mov ecx, 8 ; a trick that's faster than rep stosb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update fill count
neg ecx ; set up to jump into the array
add ecx, offset $memzero_mmx_align_done
jmp ecx ; jump to array of stosb's
align 4
stosb
stosb
stosb
stosb
stosb
stosb
stosb
stosb
$memzero_mmx_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to fill
shr ecx, 6 ; get 64-byte block count
pxor mm0, mm0
align 16
$memzero_mmx_ic_1:
movq [edi], mm0 ; write 64 bits
movq [edi+8], mm0
movq [edi+16], mm0
movq [edi+24], mm0
movq [edi+32], mm0
movq [edi+40], mm0
movq [edi+48], mm0
movq [edi+56], mm0
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memzero_mmx_ic_1 ; last 64-byte block?
mov ecx, ebx ; has valid low 6 bits of the byte count
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memzero_last_few
jmp ecx ; jump to array of stosd's
align 16
fill_rep:
mov ecx, ebx
shr ecx, 2
and ebx, 11b ; ebx isn't required any more
rep stosd
mov ecx, ebx
rep stosb
jmp $memzero_exit
align 4
stosd
stosd ; perform last 1-15 dword fills
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
stosd
$memzero_last_few: ; dword aligned from before stosd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
rep stosb ; the last 1, 2, or 3 bytes
emms
$memzero_exit:
}
}
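// ---------------------------------------------------------------------------
// Small self-check sketch (illustrative): compares the three optimized
// routines against the CRT equivalents on a scratch buffer.  Returns 1 on
// success, 0 on the first mismatch.  Same hypothetical MEMCPY_AMD_EXAMPLES
// guard as the other sketches.
// ---------------------------------------------------------------------------
#ifdef MEMCPY_AMD_EXAMPLES
#include <string.h>
#include <stdio.h>
static int memcpy_amd_selftest(void)
{
	static unsigned char a[4096], b[4096], c[4096];
	size_t i;
	for (i = 0; i < sizeof(a); ++i)
		a[i] = (unsigned char)(i * 7 + 3);           /* arbitrary test pattern */
	memcpy_optimized(b, a, sizeof(a));
	memcpy(c, a, sizeof(c));
	if (memcmp(b, c, sizeof(b)) != 0) { printf("memcpy_optimized mismatch\n"); return 0; }
	memset_optimized(b, 0x5A, sizeof(b));
	memset(c, 0x5A, sizeof(c));
	if (memcmp(b, c, sizeof(b)) != 0) { printf("memset_optimized mismatch\n"); return 0; }
	memzero_optimized(b, sizeof(b));
	memset(c, 0, sizeof(c));
	if (memcmp(b, c, sizeof(b)) != 0) { printf("memzero_optimized mismatch\n"); return 0; }
	return 1;
}
#endif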