A CUDA Shared Memory Example
阿新 • Published: 2017-12-08
The following CUDA shared-memory example is adapted from the textbook 《GPU高性能編程CUDA實戰》 (the Chinese edition of CUDA by Example), pp. 54-65. It computes the dot product of two N-element vectors: each thread block accumulates partial sums in shared memory, reduces them to a single per-block value, and the CPU adds the per-block results.
Textbook download link: http://download.csdn.net/download/yizhaoyanbo/10150300. If you do not have enough download points, leave your email address in the comments and I will send it to you.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cstdio>
#include <cstdlib>

#define imin(a,b) (a<b ? a:b)

const int N = 33 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid = imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);

__global__ void dot(float *a, float *b, float *c)
{
    // Per-block shared-memory buffer; there is one copy per thread block
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // Each thread multiplies and accumulates its share of the elements
    float temp = 0;
    while (tid < N)
    {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // Store each thread's partial sum in the shared cache
    cache[cacheIndex] = temp;
    // Synchronize the threads in this block before the reduction
    __syncthreads();

    // Tree reduction: fold the cache in half repeatedly until the block's
    // sum ends up in cache[0] (requires threadsPerBlock to be a power of 2)
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        __syncthreads();
        i /= 2;
    }

    // Thread 0 writes the block's partial sum to the output array
    if (cacheIndex == 0)
    {
        c[blockIdx.x] = cache[0];
    }
}

int main()
{
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // Allocate CPU memory
    a = (float*)malloc(N * sizeof(float));
    b = (float*)malloc(N * sizeof(float));
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));

    // Allocate GPU memory
    cudaMalloc(&dev_a, N * sizeof(float));
    cudaMalloc(&dev_b, N * sizeof(float));
    cudaMalloc(&dev_partial_c, blocksPerGrid * sizeof(float));

    // Fill the input arrays
    for (int i = 0; i < N; i++)
    {
        a[i] = i;
        b[i] = i * 2;
    }

    // Copy the input arrays to the GPU
    cudaMemcpy(dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice);

    dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_partial_c);

    // Copy the per-block partial sums back to the CPU
    cudaMemcpy(partial_c, dev_partial_c, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost);

    // The CPU finishes the final summation
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++)
    {
        c += partial_c[i];
    }

#define sum_squares(x) (x*(x+1)*(2*x+1)/6)
    printf("does GPU value %.6g = %.6g?\n", c, 2 * sum_squares((float)(N - 1)));

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_partial_c);

    free(a);
    free(b);
    free(partial_c);

    return 0;
}
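A quick sanity check on the printed comparison: since a[i] = i and b[i] = 2i, the exact dot product is sum_{i=0}^{N-1} 2*i^2 = 2 * (N-1)*N*(2N-1)/6, which is exactly what 2 * sum_squares(N - 1) evaluates. Both sides are computed in single precision, so the two printed values should agree to the displayed precision apart from small rounding differences. Assuming the listing is saved as dot.cu (my file name, not the book's), it can be built with nvcc, e.g. nvcc dot.cu -o dot.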
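The book's listing omits error checking for brevity. Below is a minimal sketch of how the runtime calls could be wrapped; the CUDA_CHECK macro name is my own choice, not part of the book's code, but all the CUDA functions used are standard runtime API.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err = (call);                                           \
        if (err != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                    \
                    cudaGetErrorString(err), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Usage in the example above:
//   CUDA_CHECK(cudaMalloc(&dev_a, N * sizeof(float)));
//   CUDA_CHECK(cudaMemcpy(dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice));
//   dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_partial_c);
//   CUDA_CHECK(cudaGetLastError());        // catches launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised while the kernel runs

Kernel launches do not return an error code themselves, which is why the launch is followed by cudaGetLastError and cudaDeviceSynchronize.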