/******************************************************************************
 Copyright (c) 2001 Advanced Micro Devices, Inc.

 LIMITATION OF LIABILITY:  THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
 EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF
 MERCHANTABILITY, NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR
 FITNESS FOR ANY PARTICULAR PURPOSE.  IN NO EVENT SHALL AMD OR ITS SUPPLIERS
 BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES
 FOR LOSS OF PROFITS, BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT
 OF THE USE OF OR INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED
 OF THE POSSIBILITY OF SUCH DAMAGES.  BECAUSE SOME JURISDICTIONS PROHIBIT THE
 EXCLUSION OR LIMITATION OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES,
 THE ABOVE LIMITATION MAY NOT APPLY TO YOU.

 AMD does not assume any responsibility for any errors which may appear in
 the Materials nor any responsibility to support or update the Materials.
 AMD retains the right to make changes to its test specifications at any
 time, without notice.

 NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
 further information, software, technical information, know-how, or show-how
 available to you.

 So that all may benefit from your experience, please report any problems or
 suggestions about this software to 3dsdk.support@amd.com

 AMD Developer Technologies, M/S 585
 Advanced Micro Devices, Inc.
 5900 E. Ben White Blvd.
 Austin, TX 78741
 3dsdk.support@amd.com
******************************************************************************/

#pragma once

#include "memcpy_amd.h"

/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/

// Very optimized memcpy() routine for all AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define IN_CACHE_COPY 64 * 1024  // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization.  This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.  This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h               // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.  Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch.  The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
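
// The following is a minimal sketch (not part of the original AMD code) of how
// the thresholds above map onto the copy strategies implemented in assembly
// below.  The enum and function names are hypothetical placeholders; the real
// selection happens inline inside memcpy_optimized().  It is kept under #if 0
// so it has no effect on compilation.
#if 0
enum copy_strategy {
    COPY_REP_MOVSD,       // tiny blocks: plain rep movsd / rep movsb
    COPY_MMX_CACHED,      // in-cache blocks: movq loads, movq stores, SW prefetch
    COPY_MMX_STREAMING,   // larger blocks: movq loads, movntq streaming stores
    COPY_BLOCK_PREFETCH   // huge blocks: block prefetch reads, then movntq stores
};

static enum copy_strategy select_copy_strategy(size_t n)
{
    if (n < 128)           return COPY_REP_MOVSD;     // the AMD path below falls back under 128 bytes
    if (n < IN_CACHE_COPY) return COPY_MMX_CACHED;
    if (n < UNCACHED_COPY) return COPY_MMX_STREAMING;
    return COPY_BLOCK_PREFETCH;
}
#endif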
// Inline assembly syntax for use with Visual C++

/////////////////////////////////////////////////////////////////////////////
// katsyonak:     Added MMX & SSE optimized memcpy       - October 8, 2003
// katsyonak:     Added AMD, MMX & SSE optimized memset  - October 12, 2003
// Aw3/katsyonak: Added AMD, MMX & SSE optimized memzero - February 11, 2004
/////////////////////////////////////////////////////////////////////////////

static unsigned long CPU_Type = 0;
// 0 = CPU check not performed yet (Auto detect)
// 1 = No optimization
// 2 = MMX
// 3 = MMX2 for AMD Athlon/Duron and above (might also work on MMX2 (KATMAI) Intel machines)
// 4 = SSE
// 5 = SSE2 (only for Pentium 4 detection, the optimization used is SSE)

unsigned long get_cpu_type()
{
    __asm {
        mov eax, [CPU_Type]
        cmp eax, 5
        ja do_detect
        or eax, eax
        jne ret_eax

    do_detect:
        xor eax, eax
        cpuid
        or eax, eax
        mov eax, 1              ; No optimization
        je cpu_done
        xor esi, esi
        cmp ebx, 68747541h      ; Auth
        jne not_amd
        cmp edx, 69746E65h      ; enti
        jne not_amd
        cmp ecx, 444D4163h      ; cAMD
        jne not_amd
        inc esi

    not_amd:
        ;mov eax, 1
        cpuid
        mov al, 1               ; No optimization
        bt edx, 23              ; MMX Feature Bit
        jnb ret_al
        or esi, esi
        je check_sse
        and ah, 1111b
        cmp ah, 6               ; model 6 (K7) = Athlon, Duron
        jb cpu_mmx
        mov eax, 80000000h
        cpuid
        cmp eax, 80000000h
        jbe cpu_mmx
        mov eax, 80000001h
        cpuid
        bt edx, 31              ; AMD Feature Bit
        jnb cpu_mmx
        mov al, 3               ; AMD
        jmp ret_al

    check_sse:
        bt edx, 25              ; SSE Feature Bit
        jb cpu_sse

    cpu_mmx:
        mov al, 2
        jmp ret_al

    cpu_sse:
        mov al, 4               ; SSE
        bt edx, 26              ; SSE2 Feature Bit
        adc al, 0

    ret_al:
        movzx eax, al

    cpu_done:
        mov [CPU_Type], eax

    ret_eax:
    }
}

static unsigned long memcpyProc = 0;
static unsigned long memsetProc = 0;
static unsigned long memzeroProc = 0;

void * _stdcall memcpy_optimized(void *dest, const void *src, size_t n)
{
    __asm {
        mov ebx, [n]            ; number of bytes to copy
        mov edi, [dest]         ; destination
        mov esi, [src]          ; source
        push edi
        mov ecx, [memcpyProc]
        jecxz $memcpy_detect
        jmp ecx

    $memcpy_detect:
        push ebx
        push esi
        push edi
        call get_cpu_type
        mov ecx, offset copy_sse
        cmp al, 3
        ja addr_done
        mov ecx, offset copy_amd
        je addr_done
        mov ecx, offset copy_mmx
        cmp al, 1
        ja addr_done
        mov ecx, offset copy_rep
    addr_done:
        mov [memcpyProc], ecx
        pop edi
        pop esi
        pop ebx
        jmp ecx

        align 16
    copy_sse:
        cmp ebx, 512
        jb copy_mmx             ; tiny? skip optimized copy

        mov ecx, 16             ; a trick that's faster than rep movsb...
        sub ecx, edi            ; align destination to qword
        and ecx, 1111b          ; get the low bits
        sub ebx, ecx            ; update copy count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memcpy_sse_align_done
        jmp ecx                 ; jump to array of movsb's

        align 16
    $memcpy_sse_ic_1_a:         ; 64-byte block copies, in-cache copy
        prefetchnta [esi + 320] ; start reading ahead
        movaps xmm0, [esi]      ; read 128 bits
        movaps xmm1, [esi+16]
        movaps xmm2, [esi+32]
        movaps xmm3, [esi+48]
        add esi, 64             ; update source pointer
        movntps [edi], xmm0     ; write 128 bits
        movntps [edi+16], xmm1
        movntps [edi+32], xmm2
        movntps [edi+48], xmm3
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memcpy_sse_ic_1_a  ; last 64-byte block?
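
        // Added note (not in the original source) on the computed jumps used
        // for the alignment and remainder copies throughout this file: each
        // plain "movsd"/"movsb" (and "stosd"/"stosb") string instruction
        // assembles to a single byte, so "neg ecx / add ecx, offset label /
        // jmp ecx" lands exactly ecx bytes before the label and therefore
        // executes exactly ecx of those one-byte instructions before falling
        // through past the label.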
        sfence                  ; flush the write buffer
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx                 ; jump to array of movsd's

        align 4
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
    $memcpy_sse_align_done:     ; destination is double quadword aligned
        mov ecx, ebx            ; number of bytes left to copy
        shr ecx, 6              ; get 64-byte block count
        test esi, 1111b         ; Is the source address aligned?
        je $memcpy_sse_ic_1_a

// This is small block copy that uses the SSE registers to copy 16 bytes
// at a time.  It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.

        align 16
    $memcpy_sse_ic_1:           ; 64-byte block copies, in-cache copy
        prefetchnta [esi + 320] ; start reading ahead
        movups xmm0, [esi]      ; read 128 bits
        movups xmm1, [esi+16]
        movups xmm2, [esi+32]
        movups xmm3, [esi+48]
        add esi, 64             ; update source pointer
        movntps [edi], xmm0     ; write 128 bits
        movntps [edi+16], xmm1
        movntps [edi+32], xmm2
        movntps [edi+48], xmm3
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memcpy_sse_ic_1    ; last 64-byte block?

        sfence                  ; flush the write buffer
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx                 ; jump to array of movsd's

        align 16
    copy_amd:
        cmp ebx, 128
        jb copy_rep             ; tiny? skip optimized copy
        cmp ebx, 32*1024        ; don't align between 32k-64k because
        jbe $memcpy_amd_do_align ; it appears to be slower
        cmp ebx, 64*1024
        jbe $memcpy_amd_align_done
    $memcpy_amd_do_align:
        mov ecx, 8              ; a trick that's faster than rep movsb...
        sub ecx, edi            ; align destination to qword
        and ecx, 111b           ; get the low bits
        sub ebx, ecx            ; update copy count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memcpy_amd_align_done
        jmp ecx                 ; jump to array of movsb's

    $memcpy_amd_uc_test:
        cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
        jae $memcpy_amd_bp_1

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.  This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.

        align 16
    $memcpy_amd_uc_1:           ; 64-byte blocks, uncached copy
        prefetchnta [esi + (200*64/34+192)] ; start reading ahead
        movq mm0, [esi]         ; read 64 bits
        add edi, 64             ; update destination pointer
        movq mm1, [esi+8]
        add esi, 64             ; update source pointer
        movq mm2, [esi-48]
        movntq [edi-64], mm0    ; write 64 bits, bypassing the cache
        movq mm0, [esi-40]      ; note: movntq also prevents the CPU
        movntq [edi-56], mm1    ; from READING the destination address
        movq mm1, [esi-32]      ; into the cache, only to be over-written
        movntq [edi-48], mm2    ; so that also helps performance
        movq mm2, [esi-24]
        movntq [edi-40], mm0
        movq mm0, [esi-16]
        movntq [edi-32], mm1
        movq mm1, [esi-8]
        movntq [edi-24], mm2
        movntq [edi-16], mm0
        dec ecx
        movntq [edi-8], mm1
        jnz $memcpy_amd_uc_1    ; last 64-byte block?
        sfence                  ; flush the write buffer
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx                 ; jump to array of movsd's

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.  Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.

    $memcpy_amd_bp_1:           ; large blocks, block prefetch copy
        mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
        add esi, CACHEBLOCK * 64 ; move to the top of the block
        align 16
    $memcpy_amd_bp_2:
        mov edx, [esi-64]       ; grab one address per cache line
        mov edx, [esi-128]      ; grab one address per cache line
        sub esi, 128            ; go reverse order
        dec eax                 ; count down the cache lines
        jnz $memcpy_amd_bp_2    ; keep grabbing more lines into cache

        mov eax, CACHEBLOCK     ; now that it's in cache, do the copy
        align 16
    $memcpy_amd_bp_3:
        movq mm0, [esi]         ; read 64 bits
        movq mm1, [esi+8]
        movq mm2, [esi+16]
        movq mm3, [esi+24]
        movq mm4, [esi+32]
        movq mm5, [esi+40]
        movq mm6, [esi+48]
        movq mm7, [esi+56]
        add esi, 64             ; update source pointer
        movntq [edi], mm0       ; write 64 bits, bypassing cache
        movntq [edi+8], mm1     ; note: movntq also prevents the CPU
        movntq [edi+16], mm2    ; from READING the destination address
        movntq [edi+24], mm3    ; into the cache, only to be over-written,
        movntq [edi+32], mm4    ; so that also helps performance
        movntq [edi+40], mm5
        movntq [edi+48], mm6
        movntq [edi+56], mm7
        add edi, 64             ; update dest pointer
        dec eax                 ; count down
        jnz $memcpy_amd_bp_3    ; keep copying
        sub ecx, CACHEBLOCK     ; update the 64-byte block count
        jbe $memcpy_done        ; no more 64-byte blocks left
        cmp ecx, CACHEBLOCK     ; big enough to run another prefetch loop?
        jae $memcpy_amd_bp_1    ; yes, keep processing chunks
        jmp $memcpy_amd_uc_1    ; 64-byte blocks, uncached copy

        align 4
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
    $memcpy_amd_align_done:     ; destination is dword aligned
        mov ecx, ebx            ; number of bytes left to copy
        shr ecx, 6              ; get 64-byte block count
        cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
        jae $memcpy_amd_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time.  It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.

        align 16
    $memcpy_amd_ic_1:           ; 64-byte block copies, in-cache copy
        prefetchnta [esi + (200*64/34+192)] ; start reading ahead
        movq mm0, [esi]         ; read 64 bits
        movq mm1, [esi+8]
        movq [edi], mm0         ; write 64 bits
        movq [edi+8], mm1       ; note: the normal movq writes the
        movq mm2, [esi+16]      ; data to cache; a cache line will be
        movq mm3, [esi+24]      ; allocated as needed, to store the data
        movq [edi+16], mm2
        movq [edi+24], mm3
        movq mm0, [esi+32]
        movq mm1, [esi+40]
        movq [edi+32], mm0
        movq [edi+40], mm1
        movq mm2, [esi+48]
        movq mm3, [esi+56]
        movq [edi+48], mm2
        movq [edi+56], mm3
        add esi, 64             ; update source pointer
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memcpy_amd_ic_1    ; last 64-byte block?
    $memcpy_done:
        sfence                  ; flush the write buffer
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx                 ; jump to array of movsd's

        align 16
    copy_mmx:
        cmp ebx, 128
        jb copy_rep             ; tiny? skip optimized copy
        mov ecx, 8              ; a trick that's faster than rep movsb...
        sub ecx, edi            ; align destination to qword
        and ecx, 111b           ; get the low bits
        sub ebx, ecx            ; update copy count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memcpy_mmx_align_done
        jmp ecx                 ; jump to array of movsb's

        align 4
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
    $memcpy_mmx_align_done:     ; destination is dword aligned
        mov ecx, ebx            ; number of bytes left to copy
        shr ecx, 6              ; get 64-byte block count

        align 16
    $memcpy_mmx_ic_1:
        movq mm0, [esi]         ; read 64 bits
        movq mm1, [esi+8]
        movq [edi], mm0         ; write 64 bits
        movq [edi+8], mm1
        movq mm2, [esi+16]
        movq mm3, [esi+24]
        movq [edi+16], mm2
        movq [edi+24], mm3
        movq mm0, [esi+32]
        movq mm1, [esi+40]
        movq [edi+32], mm0
        movq [edi+40], mm1
        movq mm2, [esi+48]
        movq mm3, [esi+56]
        movq [edi+48], mm2
        movq [edi+56], mm3
        add esi, 64             ; update source pointer
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memcpy_mmx_ic_1    ; last 64-byte block?

        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memcpy_last_few
        jmp ecx                 ; jump to array of movsd's

        align 16
    copy_rep:
        mov ecx, ebx
        shr ecx, 2
        and ebx, 11b            ; ebx isn't required any more
        rep movsd
        mov ecx, ebx
        rep movsb
        jmp $memcpy_exit

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".  Then it handles the last few bytes.

        align 4
        movsd
        movsd                   ; perform last 1-15 dword copies
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd                   ; perform last 1-7 dword copies
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
    $memcpy_last_few:           ; dword aligned from before movsd's
        mov ecx, ebx            ; has valid low 2 bits of the byte count
        and ecx, 11b            ; the last few cows must come home
        rep movsb               ; the last 1, 2, or 3 bytes

        emms
    $memcpy_exit:
        pop eax                 // [dest] ; ret value = destination pointer
    }
}

void* _stdcall memset_optimized(void *dest, int c, size_t n)
{
    __asm {
        mov ebx, [n]            ; number of bytes to fill
        mov edi, [dest]         ; destination
        movzx eax, byte ptr [c] ; character (low byte of the fill value)
        mov ah, al
        mov ecx, eax
        shl ecx, 16
        push edi
        or eax, ecx
        mov ecx, [memsetProc]
        jecxz $memset_detect
        jmp ecx

    $memset_detect:
        push eax
        push ebx
        push edi
        call get_cpu_type
        mov ecx, offset fill_sse
        cmp al, 3
        ja addr_done
        mov ecx, offset fill_amd
        je addr_done
        mov ecx, offset fill_mmx
        cmp al, 1
        ja addr_done
        mov ecx, offset fill_rep
    addr_done:
        mov [memsetProc], ecx
        pop edi
        pop ebx
        pop eax
        jmp ecx

        align 16
    fill_sse:
        cmp ebx, 2048
        jb fill_mmx             ; tiny? skip optimized fill
        mov ecx, 16             ; a trick that's faster than rep stosb...
        sub ecx, edi            ; align destination to qword
        and ecx, 1111b          ; get the low bits
        sub ebx, ecx            ; update copy count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memset_sse_align_done
        jmp ecx                 ; jump to array of stosb's

        align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
    $memset_sse_align_done:     ; destination is double quadword aligned
        mov ecx, ebx            ; number of bytes left to fill
        shr ecx, 6              ; get 64-byte block count
        push eax
        push eax
        push eax
        push eax
        movups xmm0, [esp]
        add esp, 16

        align 16
    $memset_sse_ic_1:
        movntps [edi], xmm0     ; write 128 bits
        movntps [edi+16], xmm0
        movntps [edi+32], xmm0
        movntps [edi+48], xmm0
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memset_sse_ic_1    ; last 64-byte block?

        sfence                  ; flush the write buffer
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memset_last_few
        jmp ecx                 ; jump to array of stosd's

        align 16
    fill_amd:
        cmp ebx, 128
        jb fill_rep             ; tiny? skip optimized fill
        mov ecx, 8              ; a trick that's faster than rep stosb...
        sub ecx, edi            ; align destination to qword
        and ecx, 111b           ; get the low bits
        sub ebx, ecx            ; update fill count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memset_amd_align_done
        jmp ecx                 ; jump to array of stosb's

        align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
    $memset_amd_align_done:     ; destination is dword aligned
        mov ecx, ebx            ; number of bytes left to fill
        shr ecx, 6              ; get 64-byte block count
        movd mm0, eax
        punpckldq mm0, mm0

        align 16
    $memset_amd_ic_1:
        movntq [edi], mm0       ; write 64 bits
        movntq [edi+8], mm0
        movntq [edi+16], mm0
        movntq [edi+24], mm0
        movntq [edi+32], mm0
        movntq [edi+40], mm0
        movntq [edi+48], mm0
        movntq [edi+56], mm0
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memset_amd_ic_1    ; last 64-byte block?

        sfence                  ; flush the write buffer
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memset_last_few
        jmp ecx                 ; jump to array of stosd's

        align 16
    fill_mmx:
        cmp ebx, 192
        jb fill_rep             ; tiny? skip optimized fill
        mov ecx, 8              ; a trick that's faster than rep stosb...
        sub ecx, edi            ; align destination to qword
        and ecx, 111b           ; get the low bits
        sub ebx, ecx            ; update fill count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memset_mmx_align_done
        jmp ecx                 ; jump to array of stosb's

        align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
    $memset_mmx_align_done:     ; destination is dword aligned
        mov ecx, ebx            ; number of bytes left to fill
        shr ecx, 6              ; get 64-byte block count
        movd mm0, eax
        punpckldq mm0, mm0

        align 16
    $memset_mmx_ic_1:
        movq [edi], mm0         ; write 64 bits
        movq [edi+8], mm0
        movq [edi+16], mm0
        movq [edi+24], mm0
        movq [edi+32], mm0
        movq [edi+40], mm0
        movq [edi+48], mm0
        movq [edi+56], mm0
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memset_mmx_ic_1    ; last 64-byte block?
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memset_last_few
        jmp ecx                 ; jump to array of stosd's

        align 16
    fill_rep:
        mov ecx, ebx
        shr ecx, 2
        and ebx, 11b            ; ebx isn't required any more
        rep stosd
        mov ecx, ebx
        rep stosb
        jmp $memset_exit

        align 4
        stosd
        stosd                   ; perform last 1-15 dword fills
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd                   ; perform last 1-7 dword fills
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
    $memset_last_few:           ; dword aligned from before stosd's
        mov ecx, ebx            ; has valid low 2 bits of the byte count
        and ecx, 11b            ; the last few cows must come home
        rep stosb               ; the last 1, 2, or 3 bytes

        emms
    $memset_exit:
        pop eax                 // [dest] ; ret value = destination pointer
    }
}

void _stdcall memzero_optimized(void *dest, size_t n)
{
    __asm {
        mov ebx, [n]            ; number of bytes to fill
        mov edi, [dest]         ; destination
        xor eax, eax
        mov ecx, [memzeroProc]
        jecxz $memzero_detect
        jmp ecx

    $memzero_detect:
        push ebx
        push edi
        call get_cpu_type
        mov ecx, offset fill_sse
        cmp al, 3
        ja addr_done
        mov ecx, offset fill_amd
        je addr_done
        mov ecx, offset fill_mmx
        cmp al, 1
        ja addr_done
        mov ecx, offset fill_rep
    addr_done:
        mov [memzeroProc], ecx
        pop edi
        pop ebx
        xor eax, eax
        jmp ecx

        align 16
    fill_sse:
        cmp ebx, 2048
        jb fill_mmx             ; tiny? skip optimized fill
        mov ecx, 16             ; a trick that's faster than rep stosb...
        sub ecx, edi            ; align destination to qword
        and ecx, 1111b          ; get the low bits
        sub ebx, ecx            ; update copy count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memzero_sse_align_done
        jmp ecx                 ; jump to array of stosb's

        align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
    $memzero_sse_align_done:    ; destination is double quadword aligned
        mov ecx, ebx            ; number of bytes left to fill
        shr ecx, 6              ; get 64-byte block count
        xorps xmm0, xmm0

        align 16
    $memzero_sse_ic_1:
        movntps [edi], xmm0     ; write 128 bits
        movntps [edi+16], xmm0
        movntps [edi+32], xmm0
        movntps [edi+48], xmm0
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memzero_sse_ic_1   ; last 64-byte block?

        sfence                  ; flush the write buffer
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memzero_last_few
        jmp ecx                 ; jump to array of stosd's

        align 16
    fill_amd:
        cmp ebx, 128
        jb fill_rep             ; tiny? skip optimized fill
        mov ecx, 8              ; a trick that's faster than rep stosb...
        sub ecx, edi            ; align destination to qword
        and ecx, 111b           ; get the low bits
        sub ebx, ecx            ; update fill count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memzero_amd_align_done
        jmp ecx                 ; jump to array of stosb's

        align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
    $memzero_amd_align_done:    ; destination is dword aligned
        mov ecx, ebx            ; number of bytes left to fill
        shr ecx, 6              ; get 64-byte block count
        pxor mm0, mm0

        align 16
    $memzero_amd_ic_1:
        movntq [edi], mm0       ; write 64 bits
        movntq [edi+8], mm0
        movntq [edi+16], mm0
        movntq [edi+24], mm0
        movntq [edi+32], mm0
        movntq [edi+40], mm0
        movntq [edi+48], mm0
        movntq [edi+56], mm0
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memzero_amd_ic_1   ; last 64-byte block?
        sfence                  ; flush the write buffer
        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memzero_last_few
        jmp ecx                 ; jump to array of stosd's

        align 16
    fill_mmx:
        cmp ebx, 192
        jb fill_rep             ; tiny? skip optimized fill
        mov ecx, 8              ; a trick that's faster than rep stosb...
        sub ecx, edi            ; align destination to qword
        and ecx, 111b           ; get the low bits
        sub ebx, ecx            ; update fill count
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memzero_mmx_align_done
        jmp ecx                 ; jump to array of stosb's

        align 4
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
        stosb
    $memzero_mmx_align_done:    ; destination is dword aligned
        mov ecx, ebx            ; number of bytes left to fill
        shr ecx, 6              ; get 64-byte block count
        pxor mm0, mm0

        align 16
    $memzero_mmx_ic_1:
        movq [edi], mm0         ; write 64 bits
        movq [edi+8], mm0
        movq [edi+16], mm0
        movq [edi+24], mm0
        movq [edi+32], mm0
        movq [edi+40], mm0
        movq [edi+48], mm0
        movq [edi+56], mm0
        add edi, 64             ; update destination pointer
        dec ecx                 ; count down
        jnz $memzero_mmx_ic_1   ; last 64-byte block?

        mov ecx, ebx            ; has valid low 6 bits of the byte count
        shr ecx, 2              ; dword count
        and ecx, 1111b          ; only look at the "remainder" bits
        neg ecx                 ; set up to jump into the array
        add ecx, offset $memzero_last_few
        jmp ecx                 ; jump to array of stosd's

        align 16
    fill_rep:
        mov ecx, ebx
        shr ecx, 2
        and ebx, 11b            ; ebx isn't required any more
        rep stosd
        mov ecx, ebx
        rep stosb
        jmp $memzero_exit

        align 4
        stosd
        stosd                   ; perform last 1-15 dword fills
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd                   ; perform last 1-7 dword fills
        stosd
        stosd
        stosd
        stosd
        stosd
        stosd
    $memzero_last_few:          ; dword aligned from before stosd's
        mov ecx, ebx            ; has valid low 2 bits of the byte count
        and ecx, 11b            ; the last few cows must come home
        rep stosb               ; the last 1, 2, or 3 bytes

        emms
    $memzero_exit:
    }
}
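
// A small usage sketch (not part of the original source), kept under #if 0 so
// it does not affect compilation.  The buffer sizes and the main() wrapper are
// illustrative only; as noted at the top of the file, the optimized routines
// assume an MMX/SSE-capable CPU (get_cpu_type() falls back to rep movs/stos
// paths otherwise).
#if 0
#include <stdio.h>
#include <string.h>

int main(void)
{
    static char src[256 * 1024];
    static char dst[256 * 1024];

    memset_optimized(src, 0xAB, sizeof(src));   /* fill the source pattern       */
    memcpy_optimized(dst, src, sizeof(dst));    /* copy it with the fast routine */
    memzero_optimized(src, sizeof(src));        /* then clear the source again   */

    printf("copy looks correct: %d\n",
           memcmp(dst + 1000, "\xAB\xAB\xAB\xAB", 4) == 0);
    return 0;
}
#endif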