Mirror of https://github.com/snowie2000/mactype.git (synced 2025-01-07 03:07:01 +08:00)
Commit 07a1eee62c. Dependency not included. Deps: EasyHook, FreeType
984 lines, 26 KiB, C

/******************************************************************************

 Copyright (c) 2001 Advanced Micro Devices, Inc.

 LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
 EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
 NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
 PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
 DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
 BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
 INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
 OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
 OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
 NOT APPLY TO YOU.

 AMD does not assume any responsibility for any errors which may appear in the
 Materials nor any responsibility to support or update the Materials. AMD retains
 the right to make changes to its test specifications at any time, without notice.

 NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
 further information, software, technical information, know-how, or show-how
 available to you.

 So that all may benefit from your experience, please report any problems
 or suggestions about this software to 3dsdk.support@amd.com

 AMD Developer Technologies, M/S 585
 Advanced Micro Devices, Inc.
 5900 E. Ben White Blvd.
 Austin, TX 78741
 3dsdk.support@amd.com
******************************************************************************/
#pragma once

#include "memcpy_amd.h"

/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/

// Very optimized memcpy() routine for all AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define IN_CACHE_COPY        64 * 1024   // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY        197 * 1024  // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY  infinity    // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK           80h         // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
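
// The three #defines above, together with the small-size cutoffs used later in
// copy_amd, effectively pick one of the four methods by transfer size. As a rough
// reference only, a simplified C sketch of the copy_amd (MMX/Athlon) selection,
// ignoring the destination-alignment adjustment; the names below are illustrative
// and not used anywhere in this file:
//
//     enum copy_method { COPY_REP, COPY_IN_CACHE, COPY_UNCACHED, COPY_BLOCK_PREFETCH };
//
//     static enum copy_method choose_amd_method(size_t n)
//     {
//         if (n < 128)            return COPY_REP;        // rep movsd / movsb
//         if (n < IN_CACHE_COPY)  return COPY_IN_CACHE;   // movq loads + movq stores
//         if (n < UNCACHED_COPY)  return COPY_UNCACHED;   // movq loads + movntq stores
//         return COPY_BLOCK_PREFETCH;                     // block prefetch + movntq stores
//     }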

// Inline assembly syntax for use with Visual C++

///////////////////////////////////////////////////////////////////////////////
// katsyonak: Added MMX & SSE optimized memcpy - October 8, 2003              //
//                                                                            //
// katsyonak: Added AMD, MMX & SSE optimized memset - October 12, 2003        //
//                                                                            //
// Aw3/katsyonak: Added AMD, MMX & SSE optimized memzero - February 11, 2004  //
///////////////////////////////////////////////////////////////////////////////

static unsigned long CPU_Type = 0;
// 0 = CPU check not performed yet (Auto detect)
// 1 = No optimization
// 2 = MMX
// 3 = MMX2 for AMD Athlon/Duron and above (might also work on MMX2 (KATMAI) Intel machines)
// 4 = SSE
// 5 = SSE2 (only for Pentium 4 detection, the optimization used is SSE)
unsigned long get_cpu_type()
{
    __asm
    {
        mov eax, [CPU_Type]
        cmp eax, 5
        ja do_detect
        or eax, eax
        jne ret_eax
    do_detect:
        xor eax, eax
        cpuid
        or eax, eax
        mov eax, 1 ; No optimization
        je cpu_done
        xor esi, esi
        cmp ebx, 68747541h ; Auth
        jne not_amd
        cmp edx, 69746E65h ; enti
        jne not_amd
        cmp ecx, 444D4163h ; cAMD
        jne not_amd
        inc esi
    not_amd:
        ;mov eax, 1
        cpuid
        mov al, 1 ; No optimization
        bt edx, 23 ; MMX Feature Bit
        jnb ret_al
        or esi, esi
        je check_sse
        and ah, 1111b
        cmp ah, 6 ; model 6 (K7) = Athlon, Duron
        jb cpu_mmx
        mov eax, 80000000h
        cpuid
        cmp eax, 80000000h
        jbe cpu_mmx
        mov eax, 80000001h
        cpuid
        bt edx, 31 ; AMD Feature Bit
        jnb cpu_mmx
        mov al, 3 ; AMD
        jmp ret_al
    check_sse:
        bt edx, 25 ; SSE Feature Bit
        jb cpu_sse
    cpu_mmx:
        mov al, 2
        jmp ret_al
    cpu_sse:
        mov al, 4 ; SSE
        bt edx, 26 ; SSE2 Feature Bit
        adc al, 0
    ret_al:
        movzx eax, al
    cpu_done:
        mov [CPU_Type], eax
    ret_eax:
    }
}
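
/*
   For reference only: the same detection can be written with the MSVC __cpuid
   intrinsic from <intrin.h> instead of raw CPUID. The sketch below mirrors the
   bit tests in get_cpu_type() above (leaf 1 EDX: bit 23 = MMX, bit 25 = SSE,
   bit 26 = SSE2; extended leaf 80000001h EDX bit 31 is the AMD feature bit
   tested above). It omits the family check and is illustrative, not part of
   the build; get_cpu_type_intrinsic is a hypothetical name.

       #include <intrin.h>

       static unsigned long get_cpu_type_intrinsic(void)
       {
           int r[4];                                 // EAX, EBX, ECX, EDX
           __cpuid(r, 0);
           if (r[0] == 0)
               return 1;                             // no leaf 1: no optimization
           int is_amd = (r[1] == 0x68747541 &&       // "Auth"
                         r[3] == 0x69746E65 &&       // "enti"
                         r[2] == 0x444D4163);        // "cAMD"
           __cpuid(r, 1);
           unsigned int edx = (unsigned int)r[3];
           if (!(edx & (1u << 23)))                  // MMX feature bit
               return 1;
           if (is_amd) {
               __cpuid(r, 0x80000000);
               if ((unsigned int)r[0] > 0x80000000u) {
                   __cpuid(r, 0x80000001);
                   if ((unsigned int)r[3] & (1u << 31))
                       return 3;                     // AMD extended-MMX path
               }
               return 2;                             // plain MMX
           }
           if (edx & (1u << 25))                     // SSE feature bit
               return (edx & (1u << 26)) ? 5 : 4;    // SSE2 : SSE
           return 2;                                 // MMX only
       }
*/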

static unsigned long memcpyProc = 0;
static unsigned long memsetProc = 0;
static unsigned long memzeroProc = 0;

void * _stdcall memcpy_optimized(void *dest, const void *src, size_t n)
{
    __asm
    {
        mov ebx, [n] ; number of bytes to copy
        mov edi, [dest] ; destination
        mov esi, [src] ; source
        push edi

        mov ecx, [memcpyProc]
        jecxz $memcpy_detect
        jmp ecx

    $memcpy_detect:
        push ebx
        push esi
        push edi
        call get_cpu_type
        mov ecx, offset copy_sse
        cmp al, 3
        ja addr_done
        mov ecx, offset copy_amd
        je addr_done
        mov ecx, offset copy_mmx
        cmp al, 1
        ja addr_done
        mov ecx, offset copy_rep
    addr_done:
        mov [memcpyProc], ecx
        pop edi
        pop esi
        pop ebx
        jmp ecx

    align 16
    copy_sse:
        cmp ebx, 512
        jb copy_mmx ; tiny? skip optimized copy

        mov ecx, 16 ; a trick that's faster than rep movsb...
        sub ecx, edi ; align destination to a 16-byte boundary
        and ecx, 1111b ; get the low bits
        sub ebx, ecx ; update copy count
        neg ecx ; set up to jump into the array
        add ecx, offset $memcpy_sse_align_done
        jmp ecx ; jump to array of movsb's

    align 16
    $memcpy_sse_ic_1_a: ; 64-byte block copies, in-cache copy
        prefetchnta [esi + 320] ; start reading ahead

        movaps xmm0, [esi] ; read 128 bits
        movaps xmm1, [esi+16]
        movaps xmm2, [esi+32]
        movaps xmm3, [esi+48]
        add esi, 64 ; update source pointer
        movntps [edi], xmm0 ; write 128 bits
        movntps [edi+16], xmm1
        movntps [edi+32], xmm2
        movntps [edi+48], xmm3
        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memcpy_sse_ic_1_a ; last 64-byte block?
        sfence ; flush the write buffer
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx ; jump to array of movsd's

    align 4
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb

    $memcpy_sse_align_done: ; destination is double quadword aligned
        mov ecx, ebx ; number of bytes left to copy
        shr ecx, 6 ; get 64-byte block count
        test esi, 1111b ; Is the source address aligned?
        je $memcpy_sse_ic_1_a

        // This is a small block copy that uses the SSE registers to copy 16 bytes
        // at a time. It uses the "unrolled loop" optimization, and also uses
        // the software prefetch instruction to get the data into the cache.
    align 16
    $memcpy_sse_ic_1: ; 64-byte block copies, in-cache copy
        prefetchnta [esi + 320] ; start reading ahead

        movups xmm0, [esi] ; read 128 bits
        movups xmm1, [esi+16]
        movups xmm2, [esi+32]
        movups xmm3, [esi+48]
        add esi, 64 ; update source pointer
        movntps [edi], xmm0 ; write 128 bits
        movntps [edi+16], xmm1
        movntps [edi+32], xmm2
        movntps [edi+48], xmm3
        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memcpy_sse_ic_1 ; last 64-byte block?
        sfence ; flush the write buffer
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx ; jump to array of movsd's

    align 16
    copy_amd:
        cmp ebx, 128
        jb copy_rep ; tiny? skip optimized copy
        cmp ebx, 32*1024 ; don't align between 32k-64k because
        jbe $memcpy_amd_do_align ; it appears to be slower
        cmp ebx, 64*1024
        jbe $memcpy_amd_align_done
    $memcpy_amd_do_align:
        mov ecx, 8 ; a trick that's faster than rep movsb...
        sub ecx, edi ; align destination to qword
        and ecx, 111b ; get the low bits
        sub ebx, ecx ; update copy count
        neg ecx ; set up to jump into the array
        add ecx, offset $memcpy_amd_align_done
        jmp ecx ; jump to array of movsb's

    $memcpy_amd_uc_test:
        cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
        jae $memcpy_amd_bp_1

        // For larger blocks, which will spill beyond the cache, it's faster to
        // use the Streaming Store instruction MOVNTQ. This write instruction
        // bypasses the cache and writes straight to main memory. This code also
        // uses the software prefetch instruction to pre-read the data.
    align 16
    $memcpy_amd_uc_1: ; 64-byte blocks, uncached copy
        prefetchnta [esi + (200*64/34+192)] ; start reading ahead

        movq mm0, [esi] ; read 64 bits
        add edi, 64 ; update destination pointer
        movq mm1, [esi+8]
        add esi, 64 ; update source pointer
        movq mm2, [esi-48]
        movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
        movq mm0, [esi-40] ; note: movntq also prevents the CPU
        movntq [edi-56], mm1 ; from READING the destination address
        movq mm1, [esi-32] ; into the cache, only to be over-written
        movntq [edi-48], mm2 ; so that also helps performance
        movq mm2, [esi-24]
        movntq [edi-40], mm0
        movq mm0, [esi-16]
        movntq [edi-32], mm1
        movq mm1, [esi-8]
        movntq [edi-24], mm2
        movntq [edi-16], mm0
        dec ecx
        movntq [edi-8], mm1
        jnz $memcpy_amd_uc_1 ; last 64-byte block?
        sfence ; flush the write buffer
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx ; jump to array of movsd's

        // For the largest size blocks, a special technique called Block Prefetch
        // can be used to accelerate the read operations. Block Prefetch reads
        // one address per cache line, for a series of cache lines, in a short loop.
        // This is faster than using software prefetch, in this case.
        // The technique is great for getting maximum read bandwidth,
        // especially in DDR memory systems.
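        //
        // As a rough C-level sketch only (64-byte cache lines assumed; the real
        // work is done by the assembly below):
        //
        //     for (int line = CACHEBLOCK - 1; line >= 0; --line)      // walk the chunk backwards
        //         (void)*(volatile char *)(src + line * 64);          // touch one address per cache line
        //     // the chunk is now in cache; copy it with movq loads and movntq stores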
    $memcpy_amd_bp_1: ; large blocks, block prefetch copy
        mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
        add esi, CACHEBLOCK * 64 ; move to the top of the block
    align 16
    $memcpy_amd_bp_2:
        mov edx, [esi-64] ; grab one address per cache line
        mov edx, [esi-128] ; grab one address per cache line
        sub esi, 128 ; go reverse order
        dec eax ; count down the cache lines
        jnz $memcpy_amd_bp_2 ; keep grabbing more lines into cache

        mov eax, CACHEBLOCK ; now that it's in cache, do the copy
    align 16
    $memcpy_amd_bp_3:
        movq mm0, [esi] ; read 64 bits
        movq mm1, [esi+8]
        movq mm2, [esi+16]
        movq mm3, [esi+24]
        movq mm4, [esi+32]
        movq mm5, [esi+40]
        movq mm6, [esi+48]
        movq mm7, [esi+56]
        add esi, 64 ; update source pointer
        movntq [edi], mm0 ; write 64 bits, bypassing cache
        movntq [edi+8], mm1 ; note: movntq also prevents the CPU
        movntq [edi+16], mm2 ; from READING the destination address
        movntq [edi+24], mm3 ; into the cache, only to be over-written,
        movntq [edi+32], mm4 ; so that also helps performance
        movntq [edi+40], mm5
        movntq [edi+48], mm6
        movntq [edi+56], mm7
        add edi, 64 ; update dest pointer
        dec eax ; count down
        jnz $memcpy_amd_bp_3 ; keep copying
        sub ecx, CACHEBLOCK ; update the 64-byte block count
        jbe $memcpy_done ; no more 64-byte blocks left
        cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
        jae $memcpy_amd_bp_1 ; yes, keep processing chunks
        jmp $memcpy_amd_uc_1 ; 64-byte blocks, uncached copy

    align 4
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb

    $memcpy_amd_align_done: ; destination is dword aligned
        mov ecx, ebx ; number of bytes left to copy
        shr ecx, 6 ; get 64-byte block count
        cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
        jae $memcpy_amd_uc_test

        // This is a small block copy that uses the MMX registers to copy 8 bytes
        // at a time. It uses the "unrolled loop" optimization, and also uses
        // the software prefetch instruction to get the data into the cache.
    align 16
    $memcpy_amd_ic_1: ; 64-byte block copies, in-cache copy
        prefetchnta [esi + (200*64/34+192)] ; start reading ahead

        movq mm0, [esi] ; read 64 bits
        movq mm1, [esi+8]
        movq [edi], mm0 ; write 64 bits
        movq [edi+8], mm1 ; note: the normal movq writes the
        movq mm2, [esi+16] ; data to cache; a cache line will be
        movq mm3, [esi+24] ; allocated as needed, to store the data
        movq [edi+16], mm2
        movq [edi+24], mm3
        movq mm0, [esi+32]
        movq mm1, [esi+40]
        movq [edi+32], mm0
        movq [edi+40], mm1
        movq mm2, [esi+48]
        movq mm3, [esi+56]
        movq [edi+48], mm2
        movq [edi+56], mm3

        add esi, 64 ; update source pointer
        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memcpy_amd_ic_1 ; last 64-byte block?

    $memcpy_done:
        sfence ; flush the write buffer
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx ; jump to array of movsd's

    align 16
    copy_mmx:
        cmp ebx, 128
        jb copy_rep ; tiny? skip optimized copy

        mov ecx, 8 ; a trick that's faster than rep movsb...
        sub ecx, edi ; align destination to qword
        and ecx, 111b ; get the low bits
        sub ebx, ecx ; update copy count
        neg ecx ; set up to jump into the array
        add ecx, offset $memcpy_mmx_align_done
        jmp ecx ; jump to array of movsb's

    align 4
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb

    $memcpy_mmx_align_done: ; destination is dword aligned
        mov ecx, ebx ; number of bytes left to copy
        shr ecx, 6 ; get 64-byte block count

    align 16
    $memcpy_mmx_ic_1:
        movq mm0, [esi] ; read 64 bits
        movq mm1, [esi+8]
        movq [edi], mm0 ; write 64 bits
        movq [edi+8], mm1
        movq mm2, [esi+16]
        movq mm3, [esi+24]
        movq [edi+16], mm2
        movq [edi+24], mm3
        movq mm0, [esi+32]
        movq mm1, [esi+40]
        movq [edi+32], mm0
        movq [edi+40], mm1
        movq mm2, [esi+48]
        movq mm3, [esi+56]
        movq [edi+48], mm2
        movq [edi+56], mm3

        add esi, 64 ; update source pointer
        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memcpy_mmx_ic_1 ; last 64-byte block?
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx ; jump to array of movsd's

    align 16
    copy_rep:
        mov ecx, ebx
        shr ecx, 2
        and ebx, 11b ; ebx isn't required any more
        rep movsd
        mov ecx, ebx
        rep movsb
        jmp $memcpy_exit

        // The smallest copy uses the X86 "movsd" instruction, in an optimized
        // form which is an "unrolled loop". Then it handles the last few bytes.
    align 4
        movsd
        movsd ; perform last 1-15 dword copies
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd ; perform last 1-7 dword copies
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd

    $memcpy_last_few: ; dword aligned from before movsd's
        mov ecx, ebx ; has valid low 2 bits of the byte count
        and ecx, 11b ; the last few cows must come home
        rep movsb ; the last 1, 2, or 3 bytes
        emms

    $memcpy_exit:
        pop eax // [dest] ; ret value = destination pointer
    }
}
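
/*
   Usage note (illustrative only; the names and sizes below are arbitrary):
   memcpy_optimized() behaves like memcpy(): the regions should not overlap and
   the destination pointer is returned. The buffers need no special alignment;
   the routine aligns the destination itself, and the first call runs
   get_cpu_type() and caches the selected code path in memcpyProc.

       #include <stdlib.h>
       #include <string.h>

       void copy_example(void)
       {
           size_t size = 1024 * 1024;
           char *src = (char *)malloc(size);
           char *dst = (char *)malloc(size);
           if (!src || !dst) { free(src); free(dst); return; }

           memset(src, 0x5A, size);                        // fill the source
           void *ret = memcpy_optimized(dst, src, size);   // returns dst

           // here ret == dst and memcmp(dst, src, size) == 0
           free(src);
           free(dst);
       }
*/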

void* _stdcall memset_optimized(void *dest, int c, size_t n)
{
    __asm
    {
        mov ebx, [n] ; number of bytes to fill
        mov edi, [dest] ; destination
        movzx eax, [c] ; character
        mov ah, al
        mov ecx, eax
        shl ecx, 16
        push edi
        or eax, ecx

        mov ecx, [memsetProc]
        jecxz $memset_detect
        jmp ecx

    $memset_detect:
        push eax
        push ebx
        push edi
        call get_cpu_type
        mov ecx, offset fill_sse
        cmp al, 3
        ja addr_done
        mov ecx, offset fill_amd
        je addr_done
        mov ecx, offset fill_mmx
        cmp al, 1
        ja addr_done
        mov ecx, offset fill_rep
    addr_done:
        mov [memsetProc], ecx
        pop edi
        pop ebx
        pop eax
        jmp ecx

    align 16
    fill_sse:
        cmp ebx, 2048
        jb fill_mmx ; tiny? skip optimized fill

        mov ecx, 16 ; a trick that's faster than rep stosb...
        sub ecx, edi ; align destination to a 16-byte boundary
        and ecx, 1111b ; get the low bits
        sub ebx, ecx ; update fill count
        neg ecx ; set up to jump into the array
        add ecx, offset $memset_sse_align_done
        jmp ecx ; jump to array of stosb's

    align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb

    $memset_sse_align_done: ; destination is double quadword aligned
        mov ecx, ebx ; number of bytes left to fill
        shr ecx, 6 ; get 64-byte block count
        push eax
        push eax
        push eax
        push eax
        movups xmm0, [esp]
        add esp, 16

    align 16
    $memset_sse_ic_1:
        movntps [edi], xmm0 ; write 128 bits
        movntps [edi+16], xmm0
        movntps [edi+32], xmm0
        movntps [edi+48], xmm0

        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memset_sse_ic_1 ; last 64-byte block?
        sfence ; flush the write buffer
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memset_last_few
        jmp ecx ; jump to array of stosd's

    align 16
    fill_amd:
        cmp ebx, 128
        jb fill_rep ; tiny? skip optimized fill

        mov ecx, 8 ; a trick that's faster than rep stosb...
        sub ecx, edi ; align destination to qword
        and ecx, 111b ; get the low bits
        sub ebx, ecx ; update fill count
        neg ecx ; set up to jump into the array
        add ecx, offset $memset_amd_align_done
        jmp ecx ; jump to array of stosb's

    align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb

    $memset_amd_align_done: ; destination is dword aligned
        mov ecx, ebx ; number of bytes left to fill
        shr ecx, 6 ; get 64-byte block count
        movd mm0, eax
        punpckldq mm0, mm0

    align 16
    $memset_amd_ic_1:
        movntq [edi], mm0 ; write 64 bits
        movntq [edi+8], mm0
        movntq [edi+16], mm0
        movntq [edi+24], mm0
        movntq [edi+32], mm0
        movntq [edi+40], mm0
        movntq [edi+48], mm0
        movntq [edi+56], mm0

        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memset_amd_ic_1 ; last 64-byte block?
        sfence ; flush the write buffer
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memset_last_few
        jmp ecx ; jump to array of stosd's

    align 16
    fill_mmx:
        cmp ebx, 192
        jb fill_rep ; tiny? skip optimized fill

        mov ecx, 8 ; a trick that's faster than rep stosb...
        sub ecx, edi ; align destination to qword
        and ecx, 111b ; get the low bits
        sub ebx, ecx ; update fill count
        neg ecx ; set up to jump into the array
        add ecx, offset $memset_mmx_align_done
        jmp ecx ; jump to array of stosb's

    align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb

    $memset_mmx_align_done: ; destination is dword aligned
        mov ecx, ebx ; number of bytes left to fill
        shr ecx, 6 ; get 64-byte block count
        movd mm0, eax
        punpckldq mm0, mm0

    align 16
    $memset_mmx_ic_1:
        movq [edi], mm0 ; write 64 bits
        movq [edi+8], mm0
        movq [edi+16], mm0
        movq [edi+24], mm0
        movq [edi+32], mm0
        movq [edi+40], mm0
        movq [edi+48], mm0
        movq [edi+56], mm0

        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memset_mmx_ic_1 ; last 64-byte block?
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memset_last_few
        jmp ecx ; jump to array of stosd's

    align 16
    fill_rep:
        mov ecx, ebx
        shr ecx, 2
        and ebx, 11b ; ebx isn't required any more
        rep stosd
        mov ecx, ebx
        rep stosb
        jmp $memset_exit

    align 4
        stosd
        stosd ; perform last 1-15 dword fills
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd ; perform last 1-7 dword fills
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd

    $memset_last_few: ; dword aligned from before stosd's
        mov ecx, ebx ; has valid low 2 bits of the byte count
        and ecx, 11b ; the last few cows must come home
        rep stosb ; the last 1, 2, or 3 bytes
        emms

    $memset_exit:
        pop eax // [dest] ; ret value = destination pointer
    }
}

void _stdcall memzero_optimized(void *dest, size_t n)
{
    __asm
    {
        mov ebx, [n] ; number of bytes to fill
        mov edi, [dest] ; destination
        xor eax, eax

        mov ecx, [memzeroProc]
        jecxz $memzero_detect
        jmp ecx

    $memzero_detect:
        push ebx
        push edi
        call get_cpu_type
        mov ecx, offset fill_sse
        cmp al, 3
        ja addr_done
        mov ecx, offset fill_amd
        je addr_done
        mov ecx, offset fill_mmx
        cmp al, 1
        ja addr_done
        mov ecx, offset fill_rep
    addr_done:
        mov [memzeroProc], ecx
        pop edi
        pop ebx
        xor eax, eax
        jmp ecx

    align 16
    fill_sse:
        cmp ebx, 2048
        jb fill_mmx ; tiny? skip optimized fill

        mov ecx, 16 ; a trick that's faster than rep stosb...
        sub ecx, edi ; align destination to a 16-byte boundary
        and ecx, 1111b ; get the low bits
        sub ebx, ecx ; update fill count
        neg ecx ; set up to jump into the array
        add ecx, offset $memzero_sse_align_done
        jmp ecx ; jump to array of stosb's

    align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb

    $memzero_sse_align_done: ; destination is double quadword aligned
        mov ecx, ebx ; number of bytes left to fill
        shr ecx, 6 ; get 64-byte block count
        xorps xmm0, xmm0

    align 16
    $memzero_sse_ic_1:
        movntps [edi], xmm0 ; write 128 bits
        movntps [edi+16], xmm0
        movntps [edi+32], xmm0
        movntps [edi+48], xmm0
        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memzero_sse_ic_1 ; last 64-byte block?
        sfence ; flush the write buffer
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memzero_last_few
        jmp ecx ; jump to array of stosd's

    align 16
    fill_amd:
        cmp ebx, 128
        jb fill_rep ; tiny? skip optimized fill

        mov ecx, 8 ; a trick that's faster than rep stosb...
        sub ecx, edi ; align destination to qword
        and ecx, 111b ; get the low bits
        sub ebx, ecx ; update fill count
        neg ecx ; set up to jump into the array
        add ecx, offset $memzero_amd_align_done
        jmp ecx ; jump to array of stosb's

    align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb

    $memzero_amd_align_done: ; destination is dword aligned
        mov ecx, ebx ; number of bytes left to fill
        shr ecx, 6 ; get 64-byte block count
        pxor mm0, mm0

    align 16
    $memzero_amd_ic_1:
        movntq [edi], mm0 ; write 64 bits
        movntq [edi+8], mm0
        movntq [edi+16], mm0
        movntq [edi+24], mm0
        movntq [edi+32], mm0
        movntq [edi+40], mm0
        movntq [edi+48], mm0
        movntq [edi+56], mm0
        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memzero_amd_ic_1 ; last 64-byte block?
        sfence ; flush the write buffer
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memzero_last_few
        jmp ecx ; jump to array of stosd's

    align 16
    fill_mmx:
        cmp ebx, 192
        jb fill_rep ; tiny? skip optimized fill

        mov ecx, 8 ; a trick that's faster than rep stosb...
        sub ecx, edi ; align destination to qword
        and ecx, 111b ; get the low bits
        sub ebx, ecx ; update fill count
        neg ecx ; set up to jump into the array
        add ecx, offset $memzero_mmx_align_done
        jmp ecx ; jump to array of stosb's

    align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb

    $memzero_mmx_align_done: ; destination is dword aligned
        mov ecx, ebx ; number of bytes left to fill
        shr ecx, 6 ; get 64-byte block count
        pxor mm0, mm0

    align 16
    $memzero_mmx_ic_1:
        movq [edi], mm0 ; write 64 bits
        movq [edi+8], mm0
        movq [edi+16], mm0
        movq [edi+24], mm0
        movq [edi+32], mm0
        movq [edi+40], mm0
        movq [edi+48], mm0
        movq [edi+56], mm0
        add edi, 64 ; update destination pointer
        dec ecx ; count down
        jnz $memzero_mmx_ic_1 ; last 64-byte block?
        mov ecx, ebx ; has valid low 6 bits of the byte count
        shr ecx, 2 ; dword count
        and ecx, 1111b ; only look at the "remainder" bits
        neg ecx ; set up to jump into the array
        add ecx, offset $memzero_last_few
        jmp ecx ; jump to array of stosd's

    align 16
    fill_rep:
        mov ecx, ebx
        shr ecx, 2
        and ebx, 11b ; ebx isn't required any more
        rep stosd
        mov ecx, ebx
        rep stosb
        jmp $memzero_exit

    align 4
        stosd
        stosd ; perform last 1-15 dword fills
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd ; perform last 1-7 dword fills
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd

    $memzero_last_few: ; dword aligned from before stosd's
        mov ecx, ebx ; has valid low 2 bits of the byte count
        and ecx, 11b ; the last few cows must come home
        rep stosb ; the last 1, 2, or 3 bytes
        emms

    $memzero_exit:
    }
}
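
/*
   Usage note (illustrative only; the buffer name and size are arbitrary):
   memset_optimized() behaves like memset(): only the low byte of c is used
   and the destination pointer is returned. memzero_optimized(p, n) performs
   the same fill with zero bytes but returns void.

       unsigned char row[4096];

       memzero_optimized(row, sizeof(row));         // clear the row to 0x00
       memset_optimized(row, 0xFF, sizeof(row));    // fill the row with 0xFF, returns row
*/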