Home > Fresh News > Fast memory copy (SSE4)

Fast memory copy (SSE4)

January 29th, 2011 Leave a comment Go to comments

Visual Studio 2008 has supports “Enable Instruction Functions” options (see a project settings -> C/C++ -> Optimization). Note that this option can enlarge code.

Also memcpy function implementation has written with using sse2 (movdqa).
[CPP]int CopyMemSSE4(int* piDst, int* piSrc, unsigned long SizeInBytes)
{
// Initialize pointers to start of the USWC memory

_asm
{
mov esi, piSrc
mov edx, piSrc

// Initialize pointer to end of the USWC memory
add edx, SizeInBytes

// Initialize pointer to start of the cacheable WB buffer
mov edi, piDst

// Start of Bulk Load loop
inner_start:
// Load data from USWC Memory using Streaming Load
MOVNTDQA xmm0, xmmword ptr [esi]
MOVNTDQA xmm1, xmmword ptr [esi+16]
MOVNTDQA xmm2, xmmword ptr [esi+32]
MOVNTDQA xmm3, xmmword ptr [esi+48]

// Copy data to buffer
MOVDQA xmmword ptr [edi], xmm0
MOVDQA xmmword ptr [edi+16], xmm1
MOVDQA xmmword ptr [edi+32], xmm2
MOVDQA xmmword ptr [edi+48], xmm3

// Increment pointers by cache line size and test for end of loop
add esi, 040h
add edi, 040h
cmp esi, edx
jne inner_start
}
// End of Bulk Load loop

return 0;
}

#define DATA_SIZE 0x01000000

int main(int argc, char* argv[])
{
int *piSrc = NULL;
int *piDst = NULL;
unsigned long dwDataSizeInBytes = sizeof(int) * DATA_SIZE;

piSrc = (int *)_aligned_malloc(dwDataSizeInBytes, dwDataSizeInBytes);
piDst = (int *)_aligned_malloc(dwDataSizeInBytes, dwDataSizeInBytes);

memset(piSrc, 255, dwDataSizeInBytes);
memset(piDst, 0, dwDataSizeInBytes);

CopyMemSSE4(piDst, piSrc, dwDataSizeInBytes);

_aligned_free(piSrc);
_aligned_free(piDst);
}[/CPP]
Additional links:

  1. No comments yet.
  1. No trackbacks yet.
You must be logged in to post a comment.