Optimizing Double-Precision Matrix Multiplication (DGEMM)
1. Introduction
This article walks through the optimization of N*N matrix multiplication as an example; the code runs on x86 Linux/Mac platforms. The optimization proceeds in the following steps:
- Vectorization (using AVX on the x86 architecture)
- Loop unrolling
- Cache blocking (also known as tiling)
- Multithreading (OpenMP)
NOTE:
- The code in this article comes from << Computer Organization and Design RISC-V edition >>.
- The complete code is available at https://gitee.com/optimization/dgemm.git
- The code targets the x86 platform; other platforms can be supported by swapping in the appropriate SIMD intrinsics.
2. C code
void dgemm_c(size_t n, double *A, double *B, double *C)
{
    /* Matrices are stored in column-major order: element (i, j) is at index i + j*n. */
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < n; ++j) {
            double cij = C[i+j*n];          /* cij = C[i][j] */
            for (size_t k = 0; k < n; k++) {
                cij += A[i+k*n] * B[k+j*n]; /* cij += A[i][k] * B[k][j] */
            }
            C[i+j*n] = cij;                 /* C[i][j] = cij */
        }
    }
}
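A minimal usage sketch follows; it is my own illustration rather than code from the article. It allocates the buffers with 32-byte alignment so that the same matrices can later be passed to the AVX versions, which use aligned loads, and fills them in the column-major layout assumed above.

#include <stdlib.h>
#include <string.h>

void dgemm_c(size_t n, double *A, double *B, double *C);

int main(void)
{
    size_t n = 1024;                      /* the later kernels assume n is a multiple of 16 (and of 32) */
    size_t bytes = n * n * sizeof(double);
    double *A = aligned_alloc(32, bytes); /* element (i, j) sits at index i + j*n */
    double *B = aligned_alloc(32, bytes);
    double *C = aligned_alloc(32, bytes);
    for (size_t i = 0; i < n * n; ++i) {
        A[i] = 1.0;
        B[i] = 2.0;
    }
    memset(C, 0, bytes);                  /* C holds the initial accumulator values */
    dgemm_c(n, A, B, C);                  /* computes C += A * B */
    free(A);
    free(B);
    free(C);
    return 0;
}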
3. Vectorized code (AVX)
#include <immintrin.h>

void dgemm_avx(size_t n, double *A, double *B, double *C)
{
    for (size_t i = 0; i < n; i += 4) {
        for (size_t j = 0; j < n; j++) {
            __m256d c0 = _mm256_load_pd(C+i+j*n); /* c0 = C[i][j] */
            for (size_t k = 0; k < n; k++) {
                /* c0 += A[i..i+3][k] * B[k][j] */
                c0 = _mm256_add_pd(c0,
                        _mm256_mul_pd(_mm256_load_pd(A+i+k*n),
                                      _mm256_broadcast_sd(B+k+j*n)));
            }
            _mm256_store_pd(C+i+j*n, c0); /* C[i][j] = c0 */
        }
    }
}
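Each __m256d register holds four doubles, so one iteration of the j loop now updates four consecutive elements of a column of C. Note that _mm256_load_pd and _mm256_store_pd require 32-byte aligned addresses (the unaligned counterparts are _mm256_loadu_pd and _mm256_storeu_pd), and this version assumes n is a multiple of 4. On CPUs that also support FMA (AVX2-generation and later), the separate multiply and add can be fused into a single instruction; the variant below is my own sketch of that idea, not part of the original article, and it needs -mfma in addition to -mavx when compiled in the same file as the code above.

void dgemm_avx_fma(size_t n, double *A, double *B, double *C)
{
    for (size_t i = 0; i < n; i += 4) {
        for (size_t j = 0; j < n; j++) {
            __m256d c0 = _mm256_load_pd(C+i+j*n);
            for (size_t k = 0; k < n; k++) {
                /* c0 += A[i..i+3][k] * B[k][j] in one fused multiply-add */
                c0 = _mm256_fmadd_pd(_mm256_load_pd(A+i+k*n),
                                     _mm256_broadcast_sd(B+k+j*n), c0);
            }
            _mm256_store_pd(C+i+j*n, c0);
        }
    }
}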
4. Loop-unrolled code
#define UNROLL 4

void dgemm_avx_unroll(size_t n, double *A, double *B, double *C)
{
    for (size_t i = 0; i < n; i += 4*UNROLL) {
        for (size_t j = 0; j < n; j++) {
            __m256d c[UNROLL];
            for (int x = 0; x < UNROLL; x++) {
                c[x] = _mm256_load_pd(C+i+x*4+j*n);
            }
            for (size_t k = 0; k < n; k++) {
                __m256d b = _mm256_broadcast_sd(B+k+j*n);
                for (int x = 0; x < UNROLL; x++) {
                    c[x] = _mm256_add_pd(c[x],
                            _mm256_mul_pd(_mm256_load_pd(A+i+k*n+x*4), b));
                }
            }
            for (int x = 0; x < UNROLL; x++) {
                _mm256_store_pd(C+i+x*4+j*n, c[x]);
            }
        }
    }
}
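Unrolling by UNROLL = 4 keeps four independent __m256d accumulators live, which overlaps the latency of the vector adds and amortizes the loop overhead; the trade-off is that n must now be a multiple of 4*UNROLL = 16. A quick way to gain confidence in any of these kernels is to compare them against the plain C version; the checker below is my own sketch under that assumption, not code from the article.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void dgemm_c(size_t n, double *A, double *B, double *C);
void dgemm_avx_unroll(size_t n, double *A, double *B, double *C);

/* Returns 1 if the unrolled kernel matches the reference kernel on one
 * random n*n problem (n must be a multiple of 16). */
int check_unroll(size_t n)
{
    size_t bytes = n * n * sizeof(double);
    double *A  = aligned_alloc(32, bytes), *B  = aligned_alloc(32, bytes);
    double *C1 = aligned_alloc(32, bytes), *C2 = aligned_alloc(32, bytes);
    for (size_t i = 0; i < n * n; ++i) {
        A[i] = (double)rand() / RAND_MAX;
        B[i] = (double)rand() / RAND_MAX;
    }
    memset(C1, 0, bytes);
    memset(C2, 0, bytes);
    dgemm_c(n, A, B, C1);           /* reference result */
    dgemm_avx_unroll(n, A, B, C2);  /* optimized result */
    double max_err = 0.0;
    for (size_t i = 0; i < n * n; ++i)
        max_err = fmax(max_err, fabs(C1[i] - C2[i]));
    printf("max abs difference: %g\n", max_err);
    free(A); free(B); free(C1); free(C2);
    return max_err < 1e-6;          /* allow for the different summation order */
}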
5. Cache-blocking code
#define UNROLL 4
#define BLOCKSIZE 32
static inline void do_block(int n, int si, int sj, int sk,
                            double *A, double *B, double *C)
{
    for (int i = si; i < si + BLOCKSIZE; i += UNROLL*4) {
        for (int j = sj; j < sj + BLOCKSIZE; j++) {
            __m256d c[UNROLL];
            for (int x = 0; x < UNROLL; x++) {
                c[x] = _mm256_load_pd(C+i+x*4+j*n);
            }
            for (int k = sk; k < sk + BLOCKSIZE; k++) {
                __m256d b = _mm256_broadcast_sd(B+k+j*n);
                for (int x = 0; x < UNROLL; x++) {
                    c[x] = _mm256_add_pd(c[x],
                            _mm256_mul_pd(_mm256_load_pd(A+n*k+x*4+i), b));
                }
            }
            for (int x = 0; x < UNROLL; x++) {
                _mm256_store_pd(C+i+x*4+j*n, c[x]);
            }
        }
    }
}
void dgemm_avx_unroll_blk(size_t n, double *A, double *B, double *C)
{
    for (int sj = 0; sj < n; sj += BLOCKSIZE) {
        for (int si = 0; si < n; si += BLOCKSIZE) {
            for (int sk = 0; sk < n; sk += BLOCKSIZE) {
                do_block(n, si, sj, sk, A, B, C);
            }
        }
    }
}
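With BLOCKSIZE = 32, each call to do_block touches one 32×32 sub-block of A, B, and C: 3 × 32 × 32 × 8 B = 24 KB, which fits comfortably in a typical 32 KB L1 data cache, so the inner k loop reuses data that is already resident instead of streaming it from memory. The blocked kernel assumes n is a multiple of BLOCKSIZE (which, at 32, also satisfies the 4*UNROLL requirement). For sizes that do not divide evenly, one option is to zero-pad the matrices up to the next multiple of BLOCKSIZE; the wrapper below is my own sketch of that approach, not code from the original article.

#include <stdlib.h>
#include <string.h>

void dgemm_avx_unroll_blk(size_t n, double *A, double *B, double *C);

void dgemm_padded(size_t n, double *A, double *B, double *C)
{
    size_t np = (n + BLOCKSIZE - 1) / BLOCKSIZE * BLOCKSIZE; /* round up to a multiple of 32 */
    size_t bytes = np * np * sizeof(double);
    double *Ap = aligned_alloc(32, bytes);
    double *Bp = aligned_alloc(32, bytes);
    double *Cp = aligned_alloc(32, bytes);
    memset(Ap, 0, bytes);
    memset(Bp, 0, bytes);
    memset(Cp, 0, bytes);
    for (size_t j = 0; j < n; ++j) {       /* copy column by column into the padded layout */
        memcpy(Ap + j*np, A + j*n, n * sizeof(double));
        memcpy(Bp + j*np, B + j*n, n * sizeof(double));
        memcpy(Cp + j*np, C + j*n, n * sizeof(double));
    }
    dgemm_avx_unroll_blk(np, Ap, Bp, Cp);  /* zero padding does not change the top-left n*n result */
    for (size_t j = 0; j < n; ++j)
        memcpy(C + j*n, Cp + j*np, n * sizeof(double));
    free(Ap);
    free(Bp);
    free(Cp);
}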
6. Multithreaded code (OpenMP)
void dgemm_avx_unroll_blk_omp(size_t n, double *A, double *B, double *C)
{
#pragma omp parallel for
    for (int sj = 0; sj < n; sj += BLOCKSIZE) {
        for (int si = 0; si < n; si += BLOCKSIZE) {
            for (int sk = 0; sk < n; sk += BLOCKSIZE) {
                do_block(n, si, sj, sk, A, B, C);
            }
        }
    }
}
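Parallelizing the outer sj loop is safe because each sj value writes a disjoint range of columns of C, so threads never update the same element, while A and B are only read. The code must be compiled with -fopenmp; the thread count can be set with the OMP_NUM_THREADS environment variable or from code, as in the sketch below (my own illustration, not from the article).

#include <omp.h>
#include <stddef.h>
#include <stdio.h>

void dgemm_avx_unroll_blk_omp(size_t n, double *A, double *B, double *C);

void run_parallel(size_t n, double *A, double *B, double *C)
{
    omp_set_num_threads(8);   /* or: export OMP_NUM_THREADS=8 before running */
    printf("using up to %d threads\n", omp_get_max_threads());
    dgemm_avx_unroll_blk_omp(n, A, B, C);
}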
7. Benchmark
- Execution time measured with a 2048×2048 matrix (in milliseconds)
- Speedup relative to the C version
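A simple wall-clock timer around each kernel is enough to collect such measurements; the helper below is my own sketch, not the article's benchmark driver, and the build line in the comment is an assumption about reasonable flags rather than the author's exact command.

#include <stddef.h>
#include <time.h>

/* Assumed build line (not from the article): gcc -O3 -mavx -fopenmp dgemm.c -lm */

static double elapsed_ms(struct timespec t0, struct timespec t1)
{
    return (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6;
}

/* Times one kernel invocation and returns the wall-clock duration in milliseconds. */
double time_dgemm(void (*kernel)(size_t, double *, double *, double *),
                  size_t n, double *A, double *B, double *C)
{
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    kernel(n, A, B, C);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    return elapsed_ms(t0, t1);
}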