AVX指令集矩陣乘向量演算法
阿新 • • 發佈:2019-02-12
#include <stdio.h> #include <time.h> #include <x86intrin.h> void matmul_avx(const float *x, const float **w,float *y,const int col,const int row){ const int col_reduced_8 = col - col % 8; float scratchpad[8]; __m256 op0, op1, tgt, tmp_vec; for (int i = 0; i < row; i++) { float res = 0; tgt = _mm256_setzero_ps(); for (int j = 0; j < col_reduced_8; j += 8) { op0 = __builtin_ia32_loadups256(&x[j]); op1 = __builtin_ia32_loadups256(&w[i][j]); tmp_vec = __builtin_ia32_mulps256(op0, op1); tgt = __builtin_ia32_addps256(tmp_vec, tgt); } __builtin_ia32_storeups256(scratchpad, tgt); for (int k = 0; k < 8; k++) res += scratchpad[k]; for (int l = col_reduced_8; l < col; l++) { res += w[i][l] * x[l]; } y[i] = res; } } int main() { const int col = 2048, row = 512, num_mul = 10; float **w; float x[col]; float y[row]; float scratchpad[8]; w = (float **)malloc(sizeof(float*)*row); for (int i = 0; i < row; i ++) { w[i] = (float *)malloc(sizeof(float) * col); } for (int i = 0; i < row; i++) { for (int j = 0; j < col; j++) { w[i][j] = (float) (rand() % 1000) / 800.0f; } } for (int j = 0; j < col; j++) { x[j] = (float) (rand() % 1000) / 800.0f; } clock_t t1, t2; // The original matrix multiplication version t1 = clock(); for (int r = 0; r < num_mul; r++) for (int j = 0; j < row; j++) { float sum = 0; float *wj = w[j]; for (int i = 0; i < col; i++) sum += wj[i] * x[i]; y[j] = sum; } t2 = clock(); float diff = ((float) t2 - (float) t1) / (num_mul*CLOCKS_PER_SEC); printf("\nTime taken: %f second.\n", diff); for (int i = 0; i < row; i++) { printf("%.4f, ", y[i]); y[i]=0; } printf("\n"); // The avx matrix multiplication version. t1 = clock(); for (int r = 0; r < num_mul; r++) matmul_avx(x,w,y,col,row); t2 = clock(); diff = ((float) t2 - (float) t1) / (num_mul*CLOCKS_PER_SEC); printf("\nTime taken: %f second.\n",diff); for (int i = 0; i < row; i++) { printf("%.4f, ", y[i]); } printf("\n"); }
執行方式:
gcc -o test test.c -mavx
./test