【CUDA並行程式設計之四】矩陣相乘
阿新 • • 發佈:2019-02-03
前面介紹了基本的Cuda程式設計的相關知識,那麼這一篇在此基礎之上來看看GPU在處理資料計算上的高效能,我們拿矩陣相乘來作為例子。
1.CPU上執行矩陣相乘以及效能。
在CPU上進行矩陣相乘運算的程式碼:
mat_mul.cc:
wtime.h://a[i]*b[i] + c[i] = d[i] #include<iostream> #include<vector> #include<map> #include<fstream> #include"wtime.h" using namespace std; const int N = 320; //矩陣有兩種表達的方法用二維矩陣或者用一維矩陣表示 int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1]; int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)]; void init() { for(int i=0;i<N;i++) for(int j=0;j<N;j++) { a[i][j] = 1; b[i][j] = 2; c[i][j] = 3; } } void init1() { for(int i=0;i<N;i++) for(int j=0;j<N;j++) { aa[i*N+j] = 1; bb[i*N+j] = 2; cc[i*N+j] = 3; } } void mul() { for(int i=0;i<N;i++) for(int j=0;j<N;j++) { for(int k=0;k<N;k++) { d[i][j] += a[i][k] * b[k][j]; } d[i][j] += c[i][j]; } } void mul1() { for(int i=0;i<N;i++) for(int j=0;j<N;j++) { for(int k=0;k<N;k++) { dd[i*N+j] += aa[i*N+k] * bb[k*N+j]; } dd[N*i+j] += cc[N*i+j]; } } void print() { ofstream fout; fout.open("result.txt"); if(!fout) { perror("can not open the file"); } for(int i=0;i<N;i++) { for(int j=0;j<N;j++) { fout<<d[i][j]<<" "; } fout<<endl; } fout.close(); } int main() { init1(); double t = wtime(); mul1(); t = wtime()-t; printf("computation timing = %10.10f sec\n",t); //print(); return 0; }
// wtime.h -- declaration of the wall-clock timer used by the benchmarks.
// The guard avoids the reserved form "_" + uppercase letter.
#ifndef WTIME_H_
#define WTIME_H_

// Returns the current time of day in seconds as a double
// (whole seconds plus the microsecond part as a fraction).
double wtime();

#endif  // WTIME_H_
wtime.cc:
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

// Returns the current time of day in seconds as a double
// (tv_sec plus tv_usec scaled to a fraction of a second).
// If gettimeofday() fails, the error is reported via perror() and 0.0 is
// returned instead of reading an uninitialised timeval.
double wtime(void)
{
    struct timeval etstart;
    // The timezone argument is obsolete; a null pointer is the portable choice.
    if (gettimeofday(&etstart, NULL) == -1)
    {
        perror("Error:calling gettimeofday() not successfully.\n");
        return 0.0;
    }
    return ((double)etstart.tv_sec) + ((double)etstart.tv_usec) / 1000000.0;
}

#if 0
// Disabled smoke test: prints the current timestamp once.
int main()
{
    double time;
    time = wtime();
    printf("time of day = %10.4f\n", time);
    return 0;
}
#endif
makefile:
# Builds the CPU benchmark and runs it. "target" names a command, not a file,
# so it is declared phony. Recipe lines must start with a TAB character.
.PHONY: target
target:
	g++ mat_mul.cc wtime.cc
	./a.out
結果:
2.GPU上執行矩陣相乘以及效能。
程式碼:
cuda_mat_mul_v1.cu:
// cuda_mat_mul_v1.cu -- matrix multiplication with global memory
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include "wtime.h"
using namespace std;

const int BLOCK_SIZE = 16;
const int GRID_SIZE = 20;

// Aborts with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// D = A * B + C for row-major N x N integer matrices.
//
// Launch layout: 2-D grid of 2-D blocks; the thread with global coordinates
// (x, y) computes element (row = y, col = x). Threads that fall outside the
// matrix return without touching memory, so the grid may overshoot N.
//
//   da, db, dc : input matrices A, B, C (read-only)
//   dd         : output matrix D
//   N          : matrix dimension
__global__ void mat_mul(const int* __restrict__ da,
                        const int* __restrict__ db,
                        const int* __restrict__ dc,
                        int* __restrict__ dd,
                        int N)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= N || col >= N)
    {
        return;
    }

    int sum = 0;
    for (int i = 0; i < N; i++)
    {
        // Row `row` of A times column `col` of B: B is indexed as (i, col).
        sum += da[row * N + i] * db[i * N + col];
    }
    dd[row * N + col] = sum + dc[row * N + col];
}

int main()
{
    const int N = BLOCK_SIZE * GRID_SIZE;
    const size_t count = static_cast<size_t>(N) * N;
    const size_t bytes = count * sizeof(int);

    int *ha = new int[count];
    int *hb = new int[count];
    int *hc = new int[count];
    int *hd = new int[count];
    int *da, *db, *dc, *dd;

    // initialize: A = 1, B = 2, C = 3 everywhere
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
        {
            ha[i * N + j] = 1;
            hb[i * N + j] = 2;
            hc[i * N + j] = 3;
        }

    // malloc
    CUDA_CHECK(cudaMalloc(&da, bytes));
    CUDA_CHECK(cudaMalloc(&db, bytes));
    CUDA_CHECK(cudaMalloc(&dc, bytes));
    cudaError_t err = cudaMalloc(&dd, bytes);
    printf("Cuda Malloc C : %s\n", cudaGetErrorString(err));
    CUDA_CHECK(err);

    // host to device (D is output only, so it is not uploaded)
    CUDA_CHECK(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dc, hc, bytes, cudaMemcpyHostToDevice));

    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(GRID_SIZE, GRID_SIZE);

    // kernel
    // A launch returns immediately, so wait for the kernel to finish before
    // reading the timer; otherwise only the launch overhead is measured.
    double start = wtime();
    mat_mul<<<grid, threadBlock>>>(da, db, dc, dd, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    printf("Computation time is %10.10f\n", wtime() - start);

    // device to host
    CUDA_CHECK(cudaMemcpy(hd, dd, bytes, cudaMemcpyDeviceToHost));

    // print result to file
    ofstream fout;
    fout.open("result_v1.txt");
    if (!fout)
    {
        cerr << "open the file error" << endl;
        exit(-1);
    }
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            fout << hd[i * N + j] << " ";
        }
        fout << endl;
    }

    delete[] ha;
    delete[] hb;
    delete[] hc;
    delete[] hd;
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    CUDA_CHECK(cudaFree(dc));
    CUDA_CHECK(cudaFree(dd));
    return 0;
}
cuda_wtime.cu:
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>
// Returns the current time of day in seconds as a double
// (tv_sec plus tv_usec scaled to a fraction of a second).
// If gettimeofday() fails, the error is reported via perror() and 0.0 is
// returned instead of reading an uninitialised timeval.
double wtime(void)
{
    struct timeval etstart;
    // The timezone argument is obsolete; a null pointer is the portable choice.
    if (gettimeofday(&etstart, NULL) == -1)
    {
        perror("Error:calling gettimeofday() not successfully.\n");
        return 0.0;
    }
    return ((double)etstart.tv_sec) + ((double)etstart.tv_usec) / 1000000.0;
}
#if 0
// Disabled smoke test: samples the timer once and prints the value.
int main()
{
    const double now = wtime();
    printf("time of day = %10.4f\n", now);
    return 0;
}
#endif
wtime.h:
// Declaration of the wall-clock timer used by the benchmarks.
// The guard avoids the reserved form "_" + uppercase letter.
#ifndef WTIME_H_
#define WTIME_H_

// Returns the current time of day in seconds as a double
// (whole seconds plus the microsecond part as a fraction).
double wtime();

#endif  // WTIME_H_
cuda_wtime.cu:
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>
// Returns the current time of day in seconds as a double
// (tv_sec plus tv_usec scaled to a fraction of a second).
// If gettimeofday() fails, the error is reported via perror() and 0.0 is
// returned instead of reading an uninitialised timeval.
double wtime(void)
{
    struct timeval etstart;
    // The timezone argument is obsolete; a null pointer is the portable choice.
    if (gettimeofday(&etstart, NULL) == -1)
    {
        perror("Error:calling gettimeofday() not successfully.\n");
        return 0.0;
    }
    return ((double)etstart.tv_sec) + ((double)etstart.tv_usec) / 1000000.0;
}
#if 0
// Disabled smoke test: samples the timer once and prints the value.
int main()
{
    const double now = wtime();
    printf("time of day = %10.4f\n", now);
    return 0;
}
#endif
makefile:
# Builds the CUDA benchmark with nvcc and runs it. "cu" names a command, not a
# file, so it is declared phony. Recipe lines must start with a TAB character.
.PHONY: cu
cu:
	nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
	./a.out
結果:
3.計算效能對比:
矩陣大小 | 1600*1600 | 1200*1200 | 800*800 | 320*320 |
序列時間/s | 30.9 | 11.49865 | 2.597987 | 0.162311 |
並行時間 | grid=100/block=16 | grid=75/block=16 | grid=50/block=16 | grid=20/block=16 |
kernel執行時間/s | 0.0000319 | 0.0000309944 | 0.0000309944 | 0.0000231266 |
平行計算總時間(分配記憶體+資料拷貝+計算)/s | 0.70796 | 0.439213 | 0.310214 | 0.237676 |
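可見，在矩陣規模大的時候，GPU 非常明顯地體現出了強大的計算能力。需要注意的是，kernel 的啟動是非同步的：表中的「kernel執行時間」幾乎不隨矩陣大小變化，量到的主要是啟動開銷，而不是實際的計算時間（要量到真正的 kernel 時間，必須在停止計時前先呼叫 cudaDeviceSynchronize()）；因此比較效能時，應以包含記憶體分配與資料拷貝的總時間為準。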