#include "stdafx.h"
#include <windows.h>
#include "TestObj.h"
#include "Tests.h"

// MFC run-time class support: registers CT_MemCpyInt (derived from
// CTestObject) for dynamic creation, which relies on the default
// constructor defined below.
IMPLEMENT_DYNCREATE(CT_MemCpyInt, CTestObject)

/////////////////////////////////////////////////////////////////////////////
CT_MemCpyInt::CT_MemCpyInt()
{
	// Default configuration: the self-warming copy that moves 8 bytes per
	// loop iteration (RunMMX treats 8 as its fallback width).
	m_bSelfWarming = TRUE;
	m_TestType = 8;
}

CT_MemCpyInt::CT_MemCpyInt(UINT nTestType, BOOL bSelfWarming)
{
	// nTestType    - bytes copied per loop iteration; RunMMX recognises
	//                16, 32, 64 and 256 and treats anything else as 8.
	// bSelfWarming - TRUE selects the "sw" routines that touch the source
	//                buffer before/while copying it.
	m_bSelfWarming = bSelfWarming;
	m_TestType = nTestType;
}

// Destructor: nothing to release explicitly in this class.
CT_MemCpyInt::~CT_MemCpyInt()
{
}

/////////////////////////////////////////////////////////////////////////////
// Baseline copy: the core of a classic x86 memcpy (REP MOVSD), used as the
// reference the MMX variants are timed and compared against.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// The copy runs forward (low to high addresses).
static void CT_MemCpyIntC(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	//memcpy(pDest, pData, dwByteCount);
	// This is the meat of memcpy:
	_asm{
	cld						;2 (Not pairable) copy forward

	mov ecx, dwByteCount	;1
	mov edi, pDest			;0

	shr ecx, 2;				;1 number of whole dwords
	mov esi, pData			;0

	rep movsd				;13+ecx

	// Copy the 0-3 bytes that do not fill a whole dword.  esi/edi already
	// point just past the dwords moved above; REP with ecx == 0 is a no-op.
	mov ecx, dwByteCount
	and ecx, 3
	rep movsb
	}
}

/////////////////////////////////////////////////////////////////////////////
// MMX copy, 8 bytes per loop iteration.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// The loop walks from the END of the buffers towards the start.
static void CT_MemCpyIntMMX8(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 8) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 8) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (8-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 8 bytes/iteration: 
	_asm 
	{
        mov esi, pData		
        mov ecx, dwByteCount
		mov edi, pDest		          
        sub ecx, 8				; offset of the last 8-byte chunk
		jl DoneCopy				; fewer than 8 bytes: nothing for the loop
		
	ALIGN 16
	LoopCopy:
		movq mm0, [esi+ecx]		;1
		movq [edi+ecx], mm0		;1
		sub	ecx, 8				;1
		 jge LoopCopy           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// Self-warming MMX copy, 8 bytes per loop iteration.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// Phase 1 reads one byte out of every 32 of the source, 256 bytes at a time
// (presumably one touch per cache line - confirm for the target CPU).
// Phase 2 copies from the END of the buffers towards the start.
static void CT_MemCpyIntMMX8sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 8) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 8) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (8-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 8 bytes/iteration: 
	_asm 
	{
        mov esi, pData		
        mov ecx, dwByteCount
		mov edi, pDest		          
		sub ecx, 256			; ecx >= 0 <=> a full 256-byte block remains
		jl DonePreWarm
        
	ALIGN 16
	PreWarm:
		// Pre-warm the read buffer
							;clocks
		mov al, [esi]		;1
		mov bl, [esi+32]	;1
		mov al, [esi+64]	;1
		mov bl, [esi+96]	;1
		mov al, [esi+128]	;1
		mov bl, [esi+160]	;1
		mov al, [esi+192]	;1
		mov bl, [esi+224]	;1
		
		add esi, 256		;1	The nop will force the code
		 nop				;0	to pair better.
		
		sub ecx, 256		;1
		 jge PreWarm		;0	jge (not jg) so the last full block is warmed too

	DonePreWarm:
		mov ecx, dwByteCount;
		mov esi, pData;
        sub ecx, 8				; offset of the last 8-byte chunk
		jl DoneCopy
		
	ALIGN 16
	LoopCopy:
		movq mm0, [esi+ecx]		;1
		movq [edi+ecx], mm0		;1
		sub	ecx, 8				;1
		 jge LoopCopy           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// MMX copy, 16 bytes per loop iteration.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// The loop walks from the END of the buffers towards the start.
static void CT_MemCpyIntMMX16(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 16) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 16) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (16-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 16 bytes/iteration: 
	_asm 
	{
        mov esi, pData		
        mov ecx, dwByteCount
		mov edi, pDest
        sub ecx, 16				; offset of the last 16-byte chunk
		jl DoneCopy				; fewer than 16 bytes: nothing for the loop
		
	ALIGN 16
	LoopCopy:
		movq mm0, [esi+ecx]		;1
		movq mm1, [esi+ecx+8]	;1
		movq [edi+ecx], mm0		;1
		movq [edi+ecx+8], mm1	;1
		sub	ecx, 16				;1
		 jge LoopCopy           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// Self-warming MMX copy, 16 bytes per loop iteration.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// Phase 1 reads one byte out of every 32 of the source, 256 bytes at a time
// (presumably one touch per cache line - confirm for the target CPU).
// Phase 2 copies from the END of the buffers towards the start.
static void CT_MemCpyIntMMX16sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 16) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 16) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (16-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 16 bytes/iteration: 
	_asm 
	{
        mov esi, pData		
        mov ecx, dwByteCount
		mov edi, pDest
		sub ecx, 256			; ecx >= 0 <=> a full 256-byte block remains
		jl DonePreWarm		          
        
	ALIGN 16
	PreWarm:
		// Pre-warm every full 256-byte block of the read buffer
		// (a trailing partial block, if any, is not touched)
							;clocks
		mov al, [esi]		;1
		mov bl, [esi+32]	;1
		mov al, [esi+64]	;1
		mov bl, [esi+96]	;1
		mov al, [esi+128]	;1
		mov bl, [esi+160]	;1
		mov al, [esi+192]	;1
		mov bl, [esi+224]	;1
		
		add esi, 256		;1	The nop will force the code
		 nop				;0	to pair better.
		
		sub ecx, 256		;1
		 jge PreWarm		;0	jge (not jg) so the last full block is warmed too

	DonePreWarm:
		mov ecx, dwByteCount;
		mov esi, pData;
        sub ecx, 16				; offset of the last 16-byte chunk
		jl DoneCopy
		
	ALIGN 16
	LoopCopy:
		movq mm0, [esi+ecx]		;1
		movq mm1, [esi+ecx+8]	;1
		movq [edi+ecx], mm0		;1
		movq [edi+ecx+8], mm1	;1
		sub	ecx, 16				;1
		 jge LoopCopy           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// MMX copy, 32 bytes per loop iteration.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// The loop walks from the END of the buffers towards the start.
static void CT_MemCpyIntMMX32(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 32) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 32) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (32-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 32 bytes/iteration: 
	_asm 
	{
        mov esi, pData		
        mov ecx, dwByteCount
		mov edi, pDest
        sub ecx, 32				; offset of the last 32-byte chunk
		jl DoneCopy				; fewer than 32 bytes: nothing for the loop
		
	ALIGN 16
	LoopCopy:
		movq mm0, [esi+ecx]		;1
		movq mm1, [esi+ecx+8]	;1
		movq mm2, [esi+ecx+16]	;1
		movq mm3, [esi+ecx+24]	;1
		movq [edi+ecx], mm0		;1
		movq [edi+ecx+8], mm1	;1
		movq [edi+ecx+16], mm2	;1
		movq [edi+ecx+24], mm3	;1
		sub	ecx, 32				;1
		 jge LoopCopy           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// Self-warming MMX copy, 32 bytes per loop iteration.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// Phase 1 reads one byte out of every 32 of the source, 256 bytes at a time
// (presumably one touch per cache line - confirm for the target CPU).
// Phase 2 copies from the END of the buffers towards the start.
static void CT_MemCpyIntMMX32sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 32) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 32) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (32-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 32 bytes/iteration: 
	_asm 
	{
        mov esi, pData		
        mov ecx, dwByteCount
		mov edi, pDest
		sub ecx, 256			; ecx >= 0 <=> a full 256-byte block remains
		jl DonePreWarm		          
        
	ALIGN 16
	PreWarm:
		// Pre-warm every full 256-byte block of the read buffer
		// (a trailing partial block, if any, is not touched)
							;clocks
		mov al, [esi]		;1
		mov bl, [esi+32]	;1
		mov al, [esi+64]	;1
		mov bl, [esi+96]	;1
		mov al, [esi+128]	;1
		mov bl, [esi+160]	;1
		mov al, [esi+192]	;1
		mov bl, [esi+224]	;1
		
		add esi, 256		;1	The nop will force the code
		 nop				;0	to pair better.
		
		sub ecx, 256		;1
		 jge PreWarm		;0	jge (not jg) so the last full block is warmed too

	DonePreWarm:
		mov ecx, dwByteCount;
		mov esi, pData;
        sub ecx, 32				; offset of the last 32-byte chunk
		jl DoneCopy
		
	ALIGN 16
	LoopCopy:
		movq mm0, [esi+ecx]		;1
		movq mm1, [esi+ecx+8]	;1
		movq mm2, [esi+ecx+16]	;1
		movq mm3, [esi+ecx+24]	;1
		movq [edi+ecx], mm0		;1
		movq [edi+ecx+8], mm1	;1
		movq [edi+ecx+16], mm2	;1
		movq [edi+ecx+24], mm3	;1
		sub	ecx, 32				;1
		 jge LoopCopy           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// MMX copy, 64 bytes per loop iteration (uses all eight MMX registers).
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// The loop walks from the END of the buffers towards the start.
static void CT_MemCpyIntMMX64(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 64) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 64) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (64-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 64 bytes/iteration: 	
	_asm 
	{
        mov esi, pData		
        mov ecx, dwByteCount
		mov edi, pDest
        sub ecx, 64				; offset of the last 64-byte chunk
		jl DoneCopy				; fewer than 64 bytes: nothing for the loop
		
	MainLoop:
		movq mm0, [esi+ecx]		;1
		movq mm1, [esi+ecx+8]	;1
		movq mm2, [esi+ecx+16]	;1
		movq mm3, [esi+ecx+24]	;1

		movq [edi+ecx], mm0		;1
		movq [edi+ecx+8], mm1	;1
		movq [edi+ecx+16], mm2	;1
		movq [edi+ecx+24], mm3	;1

		movq mm4, [esi+ecx+32]	;1
		movq mm5, [esi+ecx+40]	;1
		movq mm6, [esi+ecx+48]	;1
		movq mm7, [esi+ecx+56]	;1

		movq [edi+ecx+32], mm4	;1
		movq [edi+ecx+40], mm5	;1
		movq [edi+ecx+48], mm6	;1
		movq [edi+ecx+56], mm7	;1

		sub	ecx, 64				;1
		 jge MainLoop           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// Self-warming MMX copy, 64 bytes per loop iteration.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// Phase 1 reads one byte out of every 32 of the source, 256 bytes at a time
// (presumably one touch per cache line - confirm for the target CPU).
// Phase 2 copies from the END of the buffers towards the start.
static void CT_MemCpyIntMMX64sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 64) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 64) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (64-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 64 bytes/iteration: 	
	_asm 
	{
        mov esi, pData		
        mov ecx, dwByteCount
		mov edi, pDest
		sub ecx, 256			; ecx >= 0 <=> a full 256-byte block remains
		jl DonePreWarm		          
        
	ALIGN 16
	PreWarm:
		// Pre-warm every full 256-byte block of the read buffer
		// (a trailing partial block, if any, is not touched)
							;clocks
		mov al, [esi]		;1
		mov bl, [esi+32]	;1
		mov al, [esi+64]	;1
		mov bl, [esi+96]	;1
		mov al, [esi+128]	;1
		mov bl, [esi+160]	;1
		mov al, [esi+192]	;1
		mov bl, [esi+224]	;1
		
		add esi, 256		;1	The nop will force the code
		 nop				;0	to pair better.
		
		sub ecx, 256		;1
		 jge PreWarm		;0	jge (not jg) so the last full block is warmed too

	DonePreWarm:
		mov ecx, dwByteCount;
		mov esi, pData;
        sub ecx, 64				; offset of the last 64-byte chunk
		jl DoneCopy                
		
	MainLoop:
		movq mm0, [esi+ecx]		;1
		movq mm1, [esi+ecx+8]	;1
		movq mm2, [esi+ecx+16]	;1
		movq mm3, [esi+ecx+24]	;1

		movq [edi+ecx], mm0		;1
		movq [edi+ecx+8], mm1	;1
		movq [edi+ecx+16], mm2	;1
		movq [edi+ecx+24], mm3	;1

		movq mm4, [esi+ecx+32]	;1
		movq mm5, [esi+ecx+40]	;1
		movq mm6, [esi+ecx+48]	;1
		movq mm7, [esi+ecx+56]	;1

		movq [edi+ecx+32], mm4	;1
		movq [edi+ecx+40], mm5	;1
		movq [edi+ecx+48], mm6	;1
		movq [edi+ecx+56], mm7	;1

		sub	ecx, 64				;1
		 jge MainLoop           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// MMX copy, 256 bytes per loop iteration (four unrolled 64-byte groups).
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// The loop walks from the END of the buffers towards the start.
static void CT_MemCpyIntMMX256(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 256) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 256) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (256-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 256 bytes/iteration: 
	_asm 
	{
        mov esi,pData;          
        mov ecx,dwByteCount;
        mov edi,pDest;          
        sub ecx, 256			; offset of the last 256-byte chunk
        jl DoneCopy;			; fewer than 256 bytes: nothing for the loop

	MainLoop:

		// Bytes 0-63 of the chunk
		movq mm0, [esi+ecx]		;1
		movq mm1, [esi+ecx+8]	;1
		movq mm2, [esi+ecx+16]	;1
		movq mm3, [esi+ecx+24]	;1
		movq [edi+ecx], mm0		;1
		movq [edi+ecx+8], mm1	;1
		movq [edi+ecx+16], mm2	;1
		movq [edi+ecx+24], mm3	;1

		movq mm4, [esi+ecx+32]	;1
		movq mm5, [esi+ecx+40]	;1
		movq mm6, [esi+ecx+48]	;1
		movq mm7, [esi+ecx+56]	;1
		movq [edi+ecx+32], mm4	;1
		movq [edi+ecx+40], mm5	;1
		movq [edi+ecx+48], mm6	;1
		movq [edi+ecx+56], mm7	;1

		// Bytes 64-127 of the chunk
		movq mm0, [esi+ecx+64]	;1
		movq mm1, [esi+ecx+72]	;1
		movq mm2, [esi+ecx+80]	;1
		movq mm3, [esi+ecx+88]	;1
		movq [edi+ecx+64], mm0	;1
		movq [edi+ecx+72], mm1	;1
		movq [edi+ecx+80], mm2	;1
		movq [edi+ecx+88], mm3	;1

		movq mm4, [esi+ecx+96]	;1
		movq mm5, [esi+ecx+104]	;1
		movq mm6, [esi+ecx+112]	;1
		movq mm7, [esi+ecx+120]	;1
		movq [edi+ecx+96], mm4	;1
		movq [edi+ecx+104], mm5	;1
		movq [edi+ecx+112], mm6	;1
		movq [edi+ecx+120], mm7	;1

		// Bytes 128-191 of the chunk
		movq mm0, [esi+ecx+128]	;1
		movq mm1, [esi+ecx+136]	;1
		movq mm2, [esi+ecx+144]	;1
		movq mm3, [esi+ecx+152]	;1
		movq [edi+ecx+128], mm0	;1
		movq [edi+ecx+136], mm1	;1
		movq [edi+ecx+144], mm2	;1
		movq [edi+ecx+152], mm3	;1

		movq mm4, [esi+ecx+160]	;1
		movq mm5, [esi+ecx+168]	;1
		movq mm6, [esi+ecx+176]	;1
		movq mm7, [esi+ecx+184]	;1
		movq [edi+ecx+160], mm4	;1
		movq [edi+ecx+168], mm5	;1
		movq [edi+ecx+176], mm6	;1
		movq [edi+ecx+184], mm7	;1

		// Bytes 192-255 of the chunk
		movq mm0, [esi+ecx+192]	;1
		movq mm1, [esi+ecx+200]	;1
		movq mm2, [esi+ecx+208]	;1
		movq mm3, [esi+ecx+216]	;1
		movq [edi+ecx+192], mm0	;1
		movq [edi+ecx+200], mm1	;1
		movq [edi+ecx+208], mm2	;1
		movq [edi+ecx+216], mm3	;1

		movq mm4, [esi+ecx+224]	;1
		movq mm5, [esi+ecx+232]	;1
		movq mm6, [esi+ecx+240]	;1
		movq mm7, [esi+ecx+248]	;1
		movq [edi+ecx+224], mm4	;1
		movq [edi+ecx+232], mm5	;1
		movq [edi+ecx+240], mm6	;1
		movq [edi+ecx+248], mm7	;1

		sub	ecx, 256			;1
		 jge MainLoop           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// Self-warming MMX copy, 256 bytes per loop iteration.
//   pDest       - destination buffer, at least dwByteCount bytes
//   pData       - source buffer, at least dwByteCount bytes
//   dwByteCount - number of bytes to copy
// Unlike the other "sw" routines, the warming reads are done inside the main
// loop: each iteration first reads one byte out of every 32 of the 256-byte
// chunk it is about to copy (presumably one touch per cache line - confirm
// for the target CPU).  The loop walks from the END of the buffers towards
// the start.
static void CT_MemCpyIntMMX256sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Because the MMX loop starts at (dwByteCount - 256) and steps downwards
	// until the offset goes negative, the bytes it cannot reach are the
	// FIRST (dwByteCount % 256) bytes of the buffer.  Copy those here.
	DWORD extra = dwByteCount & (256-1);
	if(extra){
		memcpy(pDest, pData, extra);
	}

	// 256 bytes/iteration: 
	_asm 
	{
        mov esi,pData;          
        mov ecx,dwByteCount;
        mov edi,pDest;          
        sub ecx, 256			; offset of the last 256-byte chunk
        jl DoneCopy;			; fewer than 256 bytes: nothing for the loop

	MainLoop:

		// Pre-warm the 256 source bytes copied by this iteration
		mov al, [esi+ecx]		;1
		mov al, [esi+ecx+32]	;1
		mov al, [esi+ecx+64]	;1
		mov al, [esi+ecx+96]	;1
		mov al, [esi+ecx+128]	;1
		mov al, [esi+ecx+160]	;1
		mov al, [esi+ecx+192]	;1
		mov al, [esi+ecx+224]	;1

		// Bytes 0-63 of the chunk
		movq mm0, [esi+ecx]		;1
		movq mm1, [esi+ecx+8]	;1
		movq mm2, [esi+ecx+16]	;1
		movq mm3, [esi+ecx+24]	;1
		movq [edi+ecx], mm0		;1
		movq [edi+ecx+8], mm1	;1
		movq [edi+ecx+16], mm2	;1
		movq [edi+ecx+24], mm3	;1

		movq mm4, [esi+ecx+32]	;1
		movq mm5, [esi+ecx+40]	;1
		movq mm6, [esi+ecx+48]	;1
		movq mm7, [esi+ecx+56]	;1
		movq [edi+ecx+32], mm4	;1
		movq [edi+ecx+40], mm5	;1
		movq [edi+ecx+48], mm6	;1
		movq [edi+ecx+56], mm7	;1

		// Bytes 64-127 of the chunk
		movq mm0, [esi+ecx+64]	;1
		movq mm1, [esi+ecx+72]	;1
		movq mm2, [esi+ecx+80]	;1
		movq mm3, [esi+ecx+88]	;1
		movq [edi+ecx+64], mm0	;1
		movq [edi+ecx+72], mm1	;1
		movq [edi+ecx+80], mm2	;1
		movq [edi+ecx+88], mm3	;1

		movq mm4, [esi+ecx+96]	;1
		movq mm5, [esi+ecx+104]	;1
		movq mm6, [esi+ecx+112]	;1
		movq mm7, [esi+ecx+120]	;1
		movq [edi+ecx+96], mm4	;1
		movq [edi+ecx+104], mm5	;1
		movq [edi+ecx+112], mm6	;1
		movq [edi+ecx+120], mm7	;1

		// Bytes 128-191 of the chunk
		movq mm0, [esi+ecx+128]	;1
		movq mm1, [esi+ecx+136]	;1
		movq mm2, [esi+ecx+144]	;1
		movq mm3, [esi+ecx+152]	;1
		movq [edi+ecx+128], mm0	;1
		movq [edi+ecx+136], mm1	;1
		movq [edi+ecx+144], mm2	;1
		movq [edi+ecx+152], mm3	;1

		movq mm4, [esi+ecx+160]	;1
		movq mm5, [esi+ecx+168]	;1
		movq mm6, [esi+ecx+176]	;1
		movq mm7, [esi+ecx+184]	;1
		movq [edi+ecx+160], mm4	;1
		movq [edi+ecx+168], mm5	;1
		movq [edi+ecx+176], mm6	;1
		movq [edi+ecx+184], mm7	;1

		// Bytes 192-255 of the chunk
		movq mm0, [esi+ecx+192]	;1
		movq mm1, [esi+ecx+200]	;1
		movq mm2, [esi+ecx+208]	;1
		movq mm3, [esi+ecx+216]	;1
		movq [edi+ecx+192], mm0	;1
		movq [edi+ecx+200], mm1	;1
		movq [edi+ecx+208], mm2	;1
		movq [edi+ecx+216], mm3	;1

		movq mm4, [esi+ecx+224]	;1
		movq mm5, [esi+ecx+232]	;1
		movq mm6, [esi+ecx+240]	;1
		movq mm7, [esi+ecx+248]	;1
		movq [edi+ecx+224], mm4	;1
		movq [edi+ecx+232], mm5	;1
		movq [edi+ecx+240], mm6	;1
		movq [edi+ecx+248], mm7	;1

		sub	ecx, 256			;1
		 jge MainLoop           ;0
	
	DoneCopy:
		emms					; leave MMX state so the FPU is usable again
	}
}

/////////////////////////////////////////////////////////////////////////////
// Baseline run: copies m_dwCount bytes from m_pucC1 to m_pucC2 with the
// REP MOVSD routine.
void CT_MemCpyInt::RunC(){
	CT_MemCpyIntC(m_pucC2, m_pucC1, m_dwCount);
}

// No Pentium-specific variant is implemented for this test; the body is
// empty (RunTest does not report any m_tPent result either).
void CT_MemCpyInt::RunPent(){
}

// MMX run: copies m_dwCount bytes from m_pucM1 to m_pucM2 using the routine
// selected by m_TestType (bytes per loop iteration) and m_bSelfWarming.
// Any width other than 16, 32, 64 or 256 uses the 8-byte routine.
void CT_MemCpyInt::RunMMX(){
	switch(m_TestType){
		case 16:
			if(m_bSelfWarming)
				CT_MemCpyIntMMX16sw(m_pucM2, m_pucM1, m_dwCount);
			else
				CT_MemCpyIntMMX16(m_pucM2, m_pucM1, m_dwCount);
			break;

		case 32:
			if(m_bSelfWarming)
				CT_MemCpyIntMMX32sw(m_pucM2, m_pucM1, m_dwCount);
			else
				CT_MemCpyIntMMX32(m_pucM2, m_pucM1, m_dwCount);
			break;

		case 64:
			if(m_bSelfWarming)
				CT_MemCpyIntMMX64sw(m_pucM2, m_pucM1, m_dwCount);
			else
				CT_MemCpyIntMMX64(m_pucM2, m_pucM1, m_dwCount);
			break;

		case 256:
			if(m_bSelfWarming)
				CT_MemCpyIntMMX256sw(m_pucM2, m_pucM1, m_dwCount);
			else
				CT_MemCpyIntMMX256(m_pucM2, m_pucM1, m_dwCount);
			break;

		default:
			if(m_bSelfWarming)
				CT_MemCpyIntMMX8sw(m_pucM2, m_pucM1, m_dwCount);
			else
				CT_MemCpyIntMMX8(m_pucM2, m_pucM1, m_dwCount);
			break;
	}
}

/////////////////////////////////////////////////////////////////////////////
void CT_MemCpyInt::RunTest()
{
	// Allocate the buffers
	m_MBC1.Allocate(&m_pucC1, m_dwCount);
	m_MBC2.Allocate(&m_pucC2, m_dwCount);
	m_MBP1.Allocate(&m_pucP1, m_dwCount);
	m_MBP2.Allocate(&m_pucP2, m_dwCount);
	m_MBM1.Allocate(&m_pucM1, m_dwCount);
	m_MBM2.Allocate(&m_pucM2, m_dwCount);

	// Initialize the buffers
	ZeroAllBuffers();
	for(DWORD i=0; i < m_dwCount;i++)
	{
		m_pucC1[i] = m_pucM1[i] = 255;
	}

	//-------------------------------------------------------------------------
	// Run Tests
	CTestObject::RunTest();

	//-------------------------------------------------------------------------
	// Verify Outputs
	m_tMMX.diff = CompareBuffers(m_pucC2, m_pucM2, m_dwCount);

	//-------------------------------------------------------------------------
	// Output the results
	m_tC.dwFlags = RESULT_TIME;
	//m_tPent.dwFlags = RESULT_TIME | RESULT_DIFF;
	m_tMMX.dwFlags = RESULT_TIME | RESULT_DIFF;

	if(m_bSelfWarming){
		switch(m_TestType){
			case 16:	ShowResults("MemCpyInt16sw()");	break;
			case 32:	ShowResults("MemCpyInt32sw()");	break;
			case 64:	ShowResults("MemCpyInt64sw()");	break;
			case 256:	ShowResults("MemCpyInt256sw()");break;
			default:	ShowResults("MemCpyInt8sw()");	
		}
	} else {
		switch(m_TestType){
			case 16:	ShowResults("MemCpyInt16()");	break;
			case 32:	ShowResults("MemCpyInt32()");	break;
			case 64:	ShowResults("MemCpyInt64()");	break;
			case 256:	ShowResults("MemCpyInt256()");	break;
			default:	ShowResults("MemCpyInt8()");
		}
	}
}