SSE/AVX 其实就是SIMD指令集的主要应用(MMX太过远古,现在基本不用了;AVX512目前桌面级CPU还不支持),使用时只需要#include <immintrin.h>
即可,至于具体的函数可以查阅Intel的官网,官网对每条指令都有非常详细的demo.
想到这个是因为,做计组作业的时候被问到一个问题,感觉挺怪的,我只能拿奔腾4和龙芯做对比了..
回到正题,初步用SIMD做了一下memset的操作,确实有比较大的提速。AVX和SSE的差距貌似并不是特别大,可能主要的时间浪费在IO上了。不过gcc里面的memset确实非常快,似乎是CPU内部对这个做了专门的提速,再加上我自己的实现没有考虑字节对齐的因素。
一句题外话,for(Loop)循环会比用指针(Pointer)做迭代慢,因为在汇编上,多了一个赋值给i的操作,但是影响貌似并不大。
最后,代码里用WIN32的API实现了一个Timer,以便更精确地计时.
#include <iostream> #include <windows.h> #include <immintrin.h> #include <ctime> //======================Precise Timer================================ class Timer { private: unsigned long long _start_count; unsigned long long _stop_count; double _millisec_percount; bool _timer_state; public: Timer(); void StartTimer(); void StopTimer(); double GetTimerMilliSec() const; double GetTimerSec() const; }; Timer::Timer() { _start_count = _stop_count = 0; _timer_state = false; unsigned long long frequency; QueryPerformanceFrequency((LARGE_INTEGER*)&frequency); _millisec_percount = (double)1.0 / ((double)frequency / 1000.0); } void Timer::StartTimer() { _timer_state = true; QueryPerformanceCounter((LARGE_INTEGER*)&_start_count); } void Timer::StopTimer() { QueryPerformanceCounter((LARGE_INTEGER*)&_stop_count); _timer_state = false; } double Timer::GetTimerSec() const { return GetTimerMilliSec() / (double)1000.0; } double Timer::GetTimerMilliSec() const { return (float)(_stop_count - _start_count) * _millisec_percount; } //=================================================================== void Memset_Loop(int* a, int size) { for (int i = 0; i < size; i++) a[i] = 0; } void Memset_Pointer(int* a, int size) { for (int* i = &a[0], *end = &a[size]; i < end; i++) *i = 0; } void Memset_SSE(int* a, int size) { __m128i zero = _mm_setzero_si128(); for (__m128i* i = reinterpret_cast<__m128i*>(a), *end = reinterpret_cast<__m128i*>(&a[size]); i < end; i++) _mm_store_si128(i, zero); } void Memset_AVX(int* a, int size) { __m256i zero = _mm256_setzero_si256(); for (__m256i* i = reinterpret_cast<__m256i*>(a), *end = reinterpret_cast<__m256i*>(&a[size]); i < end; i++) _mm256_store_si256(i, zero); } void Memset_SYS(int* a, int size) { memset(a, 0, sizeof(a)); } /* unsupported AVX512 void Memset_AVX512(int* a, int size) { __m512i zero = _mm512_setzero_si512(); for (__m512i* i = reinterpret_cast<__m512i*>(a), *end = reinterpret_cast<__m512i*>(&a[size]); i < end; i++) _mm512_store_si512(i, 
zero); } */ const int MYSIZE = 100000000; int main() { int* a = new int[MYSIZE]; for (int i = 0; i < MYSIZE; i++) a[i] = rand(); std::cout << "Start!!!" << std::endl; Timer CurTimer; CurTimer.StartTimer(); Memset_Pointer(a, MYSIZE); CurTimer.StopTimer(); std::cout << "Loop : " << CurTimer.GetTimerMilliSec() << std::endl; CurTimer.StartTimer(); Memset_Pointer(a, MYSIZE); CurTimer.StopTimer(); std::cout << "Pointer: " << CurTimer.GetTimerMilliSec() << std::endl; CurTimer.StartTimer(); Memset_SSE(a, MYSIZE); CurTimer.StopTimer(); std::cout << "SSE : " << CurTimer.GetTimerMilliSec() << std::endl; CurTimer.StartTimer(); Memset_AVX(a, MYSIZE); CurTimer.StopTimer(); std::cout << "AVX : " << CurTimer.GetTimerMilliSec() << std::endl; CurTimer.StartTimer(); Memset_SYS(a, MYSIZE); CurTimer.StopTimer(); std::cout << "SYS : " << CurTimer.GetTimerMilliSec() << std::endl; return 0; }