Fast memory copy (SSE4)
Visual Studio 2008 supports the “Enable Intrinsic Functions” option (see the project settings -> C/C++ -> Optimization). Note that this option can increase the size of the generated code.
Also, the memcpy function implementation is already written using SSE2 (movdqa).
int CopyMemSSE4(int* piDst, int* piSrc, unsigned long SizeInBytes) { // Initialize pointers to start of the USWC memory _asm { mov esi, piSrc mov edx, piSrc // Initialize pointer to end of the USWC memory add edx, SizeInBytes // Initialize pointer to start of the cacheable WB buffer mov edi, piDst // Start of Bulk Load loop inner_start: // Load data from USWC Memory using Streaming Load MOVNTDQA xmm0, xmmword ptr [esi] MOVNTDQA xmm1, xmmword ptr [esi+16] MOVNTDQA xmm2, xmmword ptr [esi+32] MOVNTDQA xmm3, xmmword ptr [esi+48] // Copy data to buffer MOVDQA xmmword ptr [edi], xmm0 MOVDQA xmmword ptr [edi+16], xmm1 MOVDQA xmmword ptr [edi+32], xmm2 MOVDQA xmmword ptr [edi+48], xmm3 // Increment pointers by cache line size and test for end of loop add esi, 040h add edi, 040h cmp esi, edx jne inner_start } // End of Bulk Load loop return 0; } #define DATA_SIZE 0x01000000 int main(int argc, char* argv[]) { int *piSrc = NULL; int *piDst = NULL; unsigned long dwDataSizeInBytes = sizeof(int) * DATA_SIZE; piSrc = (int *)_aligned_malloc(dwDataSizeInBytes, dwDataSizeInBytes); piDst = (int *)_aligned_malloc(dwDataSizeInBytes, dwDataSizeInBytes); memset(piSrc, 255, dwDataSizeInBytes); memset(piDst, 0, dwDataSizeInBytes); CopyMemSSE4(piDst, piSrc, dwDataSizeInBytes); _aligned_free(piSrc); _aligned_free(piDst); }
Additional links: