Optimizing Matrix Multiplication (DGEMM)

1. Introduction

This article uses an N×N matrix multiplication as an example to walk through the optimization process; the code runs on x86 Linux/Mac platforms. The optimization consists of the following steps:

  • Vectorization (using AVX on x86)
  • Loop unrolling
  • Cache blocking (also known as tiling)
  • Multithreading (OpenMP)

NOTE:

  • The code in this article comes from "Computer Organization and Design, RISC-V Edition".
  • The complete code is available at https://gitee.com/optimization/dgemm.git
  • The code targets the x86 platform; it can be adapted to other platforms with minor changes to the SIMD instructions.

2. C Code

#include <stddef.h>  /* size_t */

void dgemm_c(size_t n, double *A, double *B, double *C)
{
	for (size_t i = 0; i < n; ++i) {
		for (size_t j = 0; j < n; ++j) {
			double cij = C[i+j*n];
			for (size_t k = 0; k < n; k++) {
				cij += A[i+k*n] * B[k+j*n];
			}
			C[i+j*n] = cij;
		}
	}
}
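
For completeness, a minimal driver for this reference kernel might look like the sketch below. It is not part of the original article: the test size, the initialization values, and the alloc_matrix helper are assumptions added for illustration. Note that the kernels address element (i, j) as M[i + j*n] (column-major), accumulate into C, and that the AVX versions later in the article use aligned loads, hence the 32-byte aligned allocation.

#include <stddef.h>
#include <stdlib.h>

/* Hypothetical helper: allocate an n*n matrix of doubles, 32-byte aligned
 * so the AVX kernels below can use aligned loads/stores (C11 aligned_alloc). */
static double *alloc_matrix(size_t n)
{
	return aligned_alloc(32, n * n * sizeof(double));
}

int main(void)
{
	size_t n = 512;                 /* assumed test size */
	double *A = alloc_matrix(n);
	double *B = alloc_matrix(n);
	double *C = alloc_matrix(n);

	for (size_t i = 0; i < n * n; i++) {
		A[i] = (double)(i % 7);
		B[i] = (double)(i % 5);
		C[i] = 0.0;             /* the kernels accumulate into C */
	}

	dgemm_c(n, A, B, C);

	free(A); free(B); free(C);
	return 0;
}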

3. Vectorized Code

#include <x86intrin.h>  /* AVX intrinsics (_mm256_*) */

void dgemm_avx(size_t n, double *A, double *B, double *C)
{
	for (size_t i = 0; i < n; i += 4) {
		for (size_t j = 0; j < n; j++) {
			__m256d c0 = _mm256_load_pd(C+i+j*n); /* c0 = C[i][j] */
			for (size_t k = 0; k < n; k++) {
				c0 = _mm256_add_pd(c0,
					_mm256_mul_pd(_mm256_load_pd(A+i+k*n),
						_mm256_broadcast_sd(B+k+j*n)));
			}
			_mm256_store_pd(C+i+j*n, c0); /* C[i][j] = c0 */
		}
	}
}
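
On CPUs that also support FMA (which the original code does not require; it only needs AVX), the separate multiply and add in the inner loop can be fused with _mm256_fmadd_pd. The variant below is a sketch of that substitution, not part of the original article, and must be compiled with FMA enabled (e.g. -mfma). Like dgemm_avx, it assumes n is a multiple of 4 and the matrices are 32-byte aligned.

/* Sketch: same kernel as dgemm_avx, with multiply+add fused into one FMA. */
void dgemm_avx_fma(size_t n, double *A, double *B, double *C)
{
	for (size_t i = 0; i < n; i += 4) {
		for (size_t j = 0; j < n; j++) {
			__m256d c0 = _mm256_load_pd(C+i+j*n); /* c0 = C[i..i+3][j] */
			for (size_t k = 0; k < n; k++) {
				/* c0 += A[i..i+3][k] * B[k][j] */
				c0 = _mm256_fmadd_pd(_mm256_load_pd(A+i+k*n),
					_mm256_broadcast_sd(B+k+j*n), c0);
			}
			_mm256_store_pd(C+i+j*n, c0);
		}
	}
}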

4. Loop-Unrolled Code

#define UNROLL 4
void dgemm_avx_unroll(size_t n, double *A, double *B, double *C)
{
	for (size_t i = 0; i < n; i += 4*UNROLL) {
		for (size_t j = 0; j < n; j++) {
			__m256d c[UNROLL];
			for (int x = 0; x < UNROLL; x++) {
				c[x] = _mm256_load_pd(C+i+x*4+j*n);
			}
			for (size_t k = 0; k < n; k++) {
				__m256d b = _mm256_broadcast_sd(B+k+j*n);
				for (int x = 0; x < UNROLL; x++) {
					c[x] = _mm256_add_pd(c[x],
						_mm256_mul_pd(_mm256_load_pd(A+i+k*n+x*4),
							b));
				}
			}
			for (int x = 0; x < UNROLL; x++) {
				_mm256_store_pd(C+i+x*4+j*n, c[x]);
			}
		}
	}
}

5. Cache Blocking Code

#define UNROLL 4
#define BLOCKSIZE 32

/* Multiply one BLOCKSIZE x BLOCKSIZE block:
 * C[si..][sj..] += A[si..][sk..] * B[sk..][sj..] */
static inline void do_block(int n, int si, int sj, int sk,
			double *A, double *B, double *C)
{
	for (int i = si; i < si + BLOCKSIZE; i += UNROLL*4) {
		for (int j = sj; j < sj + BLOCKSIZE; j++) {
			__m256d c[UNROLL];
			for (int x = 0; x < UNROLL; x++) {
				c[x] = _mm256_load_pd(C+i+x*4+j*n);
			}
			for (int k = sk; k < sk + BLOCKSIZE; k++) {
				__m256d b = _mm256_broadcast_sd(B+k+j*n);
				for (int x = 0; x < UNROLL; x++) {
					c[x] = _mm256_add_pd(c[x],
						_mm256_mul_pd(
							_mm256_load_pd(A+n*k+x*4+i), b));
				}
			}

			for (int x = 0; x < UNROLL; x++) {
				_mm256_store_pd(C+i+x*4+j*n, c[x]);
			}
		}
	}
}

void dgemm_avx_unroll_blk(size_t n, double *A, double *B, double *C)
{
	for (int sj = 0; sj < n; sj += BLOCKSIZE) {
		for (int si = 0; si < n; si += BLOCKSIZE) {
			for (int sk = 0; sk < n; sk += BLOCKSIZE) {
				do_block(n, si, sj, sk, A, B, C);
			}
		}
	}
}
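
A simple way to sanity-check the blocked kernel is to compare its output against the plain C version. The harness below is a sketch added for illustration (the check_blocked helper and the use of the maximum absolute difference are assumptions, not part of the original article). It relies on the constraints already implied by do_block: n must be a multiple of BLOCKSIZE, BLOCKSIZE a multiple of 4*UNROLL, and C must be zeroed before each call since the kernels accumulate into it.

#include <math.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical check: run both kernels on the same inputs and report the
 * largest element-wise difference between the two result matrices. */
static void check_blocked(size_t n, double *A, double *B,
			double *C_ref, double *C_blk)
{
	memset(C_ref, 0, n * n * sizeof(double));
	memset(C_blk, 0, n * n * sizeof(double));

	dgemm_c(n, A, B, C_ref);
	dgemm_avx_unroll_blk(n, A, B, C_blk);

	double max_diff = 0.0;
	for (size_t i = 0; i < n * n; i++) {
		double d = fabs(C_ref[i] - C_blk[i]);
		if (d > max_diff)
			max_diff = d;
	}
	printf("max abs diff: %g\n", max_diff);
}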

6. Multithreaded Code

void dgemm_avx_unroll_blk_omp(size_t n, double *A, double *B, double *C)
{
/* Distribute blocks of columns (sj) across threads. */
#pragma omp parallel for
	for (int sj = 0; sj < n; sj += BLOCKSIZE) {
		for (int si = 0; si < n; si += BLOCKSIZE) {
			for (int sk = 0; sk < n; sk += BLOCKSIZE) {
				do_block(n, si, sj, sk, A, B, C);
			}
		}
	}
}
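
To reproduce the measurements in the next section, each variant can be timed with omp_get_wtime(). The helper below is a sketch, not the benchmark harness used by the original article; the GFLOPS formula simply counts the 2*n^3 floating-point operations (one multiply and one add per innermost iteration) of a dense matrix multiplication. A plausible build line, also an assumption, is gcc -O3 -mavx -fopenmp dgemm.c.

#include <omp.h>
#include <stdio.h>

/* Hypothetical timing helper: run one kernel and report wall time and GFLOPS.
 * All kernels in this article share the same signature, so a function
 * pointer works for any of them. */
static void time_dgemm(const char *name,
		void (*kernel)(size_t, double *, double *, double *),
		size_t n, double *A, double *B, double *C)
{
	double t0 = omp_get_wtime();
	kernel(n, A, B, C);
	double t1 = omp_get_wtime();
	double gflops = 2.0 * n * n * n / (t1 - t0) / 1e9;
	printf("%-24s %8.3f ms  %6.2f GFLOPS\n", name, (t1 - t0) * 1e3, gflops);
}

/* Usage (assumed): time_dgemm("avx_unroll_blk_omp", dgemm_avx_unroll_blk_omp, n, A, B, C); */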

7. Benchmark

  • Execution time for a 2048×2048 matrix, in milliseconds

    [figure: execution-time chart]
  • Speedup relative to the C version

    [figure: speedup chart]