Fast memory copy (SSE4)
Visual Studio 2008 has supports “Enable Instruction Functions” options (see a project settings -> C/C++ -> Optimization). Note that this option can enlarge code.
Also memcpy function implementation has written with using sse2 (movdqa).
int CopyMemSSE4(int* piDst, int* piSrc, unsigned long SizeInBytes)
{
// Initialize pointers to start of the USWC memory
_asm
{
mov esi, piSrc
mov edx, piSrc
// Initialize pointer to end of the USWC memory
add edx, SizeInBytes
// Initialize pointer to start of the cacheable WB buffer
mov edi, piDst
// Start of Bulk Load loop
inner_start:
// Load data from USWC Memory using Streaming Load
MOVNTDQA xmm0, xmmword ptr [esi]
MOVNTDQA xmm1, xmmword ptr [esi+16]
MOVNTDQA xmm2, xmmword ptr [esi+32]
MOVNTDQA xmm3, xmmword ptr [esi+48]
// Copy data to buffer
MOVDQA xmmword ptr [edi], xmm0
MOVDQA xmmword ptr [edi+16], xmm1
MOVDQA xmmword ptr [edi+32], xmm2
MOVDQA xmmword ptr [edi+48], xmm3
// Increment pointers by cache line size and test for end of loop
add esi, 040h
add edi, 040h
cmp esi, edx
jne inner_start
}
// End of Bulk Load loop
return 0;
}
#define DATA_SIZE 0x01000000
int main(int argc, char* argv[])
{
int *piSrc = NULL;
int *piDst = NULL;
unsigned long dwDataSizeInBytes = sizeof(int) * DATA_SIZE;
piSrc = (int *)_aligned_malloc(dwDataSizeInBytes, dwDataSizeInBytes);
piDst = (int *)_aligned_malloc(dwDataSizeInBytes, dwDataSizeInBytes);
memset(piSrc, 255, dwDataSizeInBytes);
memset(piDst, 0, dwDataSizeInBytes);
CopyMemSSE4(piDst, piSrc, dwDataSizeInBytes);
_aligned_free(piSrc);
_aligned_free(piDst);
}
Additional links:
