SSE指令集的介绍网上已有很多, 这里贴一个在VS2008环境下编写的SSE测试程序, 分别用C++代码、C++内联汇编、C++的SSE Intrinsics三种方式对数组做逐元素转换 (conversion, 即对每个元素计算 sqrt(x * 2.8), 并不是卷积), 并比较三者的耗时。这是一个win32控制台程序。
程序下载地址 : http://download.csdn.net/detail/hemmingway/4598506
主文件的代码一览:
// Test_SSE.cpp : 定义控制台应用程序的入口点。 // calc conversion // #include "stdafx.h" #include <xmmintrin.h> // __m128 data type and SSE functions #include <float.h> #include <math.h> #include <Windows.h> // Support odprintf #include <stdarg.h> #include <ctype.h> #include "MMX_SSESupport.h" #include "TimeCounter.h" #define ARRAY_SIZE 100000 #pragma warning(disable : 4324) // Arrays processed by SSE should have 16 bytes alignment: __declspec(align(16)) float m_fInitialArray[ARRAY_SIZE]; __declspec(align(16)) float m_fResultArray[ARRAY_SIZE]; // minimum and maximum values in the result array float m_fMin; float m_fMax; #define TIME_START CTimeCounter* pT = new CTimeCounter() #define TIME_END ShowTime(pT->GetExecutionTime()) ////////////////////////////////////////////////////////////////////////// //odprintf -- debug function void __cdecl odprintf(const char* fmt, ...) { char buf[4096], *p = buf; va_list args; va_start(args, fmt); p += vsnprintf_s(p, sizeof(buf), _TRUNCATE, fmt, args); va_end(args); while ( p > buf && isspace(p[-1]) ) *--p = '\0'; *p++ = '\r'; *p++ = '\n'; *p = '\0'; OutputDebugStringA(buf); //output as ANSI string //OutputDebugString } ////////////////////////////////////////////////////////////////////////// // Show execution time (ms) void ShowTime(__int64 nTime) { printf("usage time: %I64d\n\n",nTime); //在g++中对应的是<stdint.h> int64_t, 应该用%lld输出 } ////////////////////////////////////////////////////////////////////////// // ShowArray, display array's data void ShowArray(float* pArray) { if ( !(*pArray)) return; float* p = pArray; for ( int i = 0; i < ARRAY_SIZE; i += 500 ) //没有显示所有的数据出来 { printf("%f ", p[i]); if (i == 5) printf("\n"); } printf("\n\n"); } ////////////////////////////////////////////////////////////////////////// // InitArray, Fill initial array void InitArray() { m_fMin = FLT_MAX; m_fMax = FLT_MIN; float f; int i; for ( i = 0; i < ARRAY_SIZE; i++ ) { // Fill array with one sin cycle and ensure that all values are positive // (to use sqrt in 
conversion) f = (float) sin(((double)i * 6.29 / ARRAY_SIZE)) + 2.0f; if ( f < m_fMin ) m_fMin = f; if ( f > m_fMax ) m_fMax = f; m_fInitialArray[i] = f; } ShowArray(m_fInitialArray); } ////////////////////////////////////////////////////////////////////////// // Make conversion using C++ code // // Each initial array member is converted to result array member // using some formula (just to demonstrate SSE features). // Minimum and maximum result values are calculated and shown. // // Function also calculates and shows conversion time (ms). // void OnCplusplus() { TIME_START; m_fMin = FLT_MAX; m_fMax = FLT_MIN; int i; for ( i = 0; i < ARRAY_SIZE; i++ ) { m_fResultArray[i] = sqrt(m_fInitialArray[i] * 2.8f); if ( m_fResultArray[i] < m_fMin ) m_fMin = m_fResultArray[i]; if ( m_fResultArray[i] > m_fMax ) m_fMax = m_fResultArray[i]; } TIME_END; ShowArray(m_fResultArray); } ////////////////////////////////////////////////////////////////////////// //OnSseAssembly, Make conversion using C++ code with inline Assembly void OnSseAssembly() { TIME_START; float* pIn = m_fInitialArray; float* pOut = m_fResultArray; float f = 2.8f; float flt_min = FLT_MIN; float flt_max = FLT_MAX; __m128 min128; __m128 max128; // using additional registers: // xmm2 - multiplication coefficient // xmm3 - minimum // xmm4 - maximum _asm { movss xmm2, f // xmm2[0] = 2.8 shufps xmm2, xmm2, 0 // xmm2[1, 2, 3] = xmm2[0] movss xmm3, flt_max // xmm3 = FLT_MAX shufps xmm3, xmm3, 0 // xmm3[1, 2, 3] = xmm3[0] movss xmm4, flt_min // xmm4 = FLT_MIN shufps xmm4, xmm4, 0 // xmm3[1, 2, 3] = xmm3[0] mov esi, pIn // input pointer mov edi, pOut // output pointer mov ecx, ARRAY_SIZE/4 // loop counter start_loop: movaps xmm1, [esi] // xmm1 = [esi] mulps xmm1, xmm2 // xmm1 = xmm1 * xmm2 sqrtps xmm1, xmm1 // xmm1 = sqrt(xmm1) movaps [edi], xmm1 // [edi] = xmm1 minps xmm3, xmm1 maxps xmm4, xmm1 add esi, 16 add edi, 16 dec ecx jnz start_loop movaps min128, xmm3 movaps max128, xmm4 } // extract minimum and maximum values 
from min128 and max128 union u { __m128 m; float f[4]; } x; x.m = min128; m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3]))); x.m = max128; m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3]))); TIME_END; ShowArray(m_fResultArray); } ////////////////////////////////////////////////////////////////////////// // OnSseCpp, Make conversion using C++ code with SSE Intrinsics void OnSseCpp() { TIME_START; __m128 coeff = _mm_set_ps1(2.8f); // coeff[0, 1, 2, 3] = 2.8 __m128 tmp; __m128 min128 = _mm_set_ps1(FLT_MAX); // min128[0, 1, 2, 3] = FLT_MAX __m128 max128 = _mm_set_ps1(FLT_MIN); // max128[0, 1, 2, 3] = FLT_MIN __m128* pSource = (__m128*) m_fInitialArray; __m128* pDest = (__m128*) m_fResultArray; for ( int i = 0; i < ARRAY_SIZE/4; i++ ) { tmp = _mm_mul_ps(*pSource, coeff); // tmp = *pSource * coeff *pDest = _mm_sqrt_ps(tmp); // *pDest = sqrt(tmp) min128 = _mm_min_ps(*pDest, min128); max128 = _mm_max_ps(*pDest, max128); pSource++; pDest++; } // extract minimum and maximum values from min128 and max128 union u { __m128 m; float f[4]; } x; x.m = min128; m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3]))); x.m = max128; m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3]))); TIME_END; ShowArray(m_fResultArray); } int _tmain(int argc, _TCHAR* argv[]) { // Test SSE support ? 
bool bMMX, bSSE; TestFeatures(&bMMX, &bSSE); if ( !bSSE ) { // Do not support SSE odprintf("Do not support SSE.\n"); return 0; } odprintf("everything is ok..."); //first, prepare data printf("program generate %d floating point(Not all data are displayed)...\n\n", ARRAY_SIZE); InitArray(); //second, Make conversion using C++ code getchar(); printf("Make conversion using C++ code\n\n"); OnCplusplus(); //third,Make conversion using C++ code with inline Assembly getchar(); printf("Make conversion using C++ code with inline Assembly\n\n"); OnSseAssembly(); //finally, Make conversion using C++ code with SSE Intrinsics getchar(); printf("Make conversion using C++ code with SSE Intrinsics\n\n"); OnSseCpp(); getchar(); return 0; }