CUDA之單thread單block&多thread單block&多thread多block
阿新 • • 發佈:2018-11-15
用簡單的立方和歸約來舉例:
// Program 1: cube-sum reduction with a single thread in a single block.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define DATA_SIZE 1048576

int data[DATA_SIZE]; // host-side input, random digits 0-9

// Fill `number[0..size-1]` with pseudo-random integers in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}

// CUDA init: select the first device with compute capability >= 1.0.
// Returns false when no usable CUDA device is present.
bool InitCUDA()
{
    int count;
    // Number of CUDA-capable devices.
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Kernel: one thread sums num[i]^3 over the whole array into *result.
__global__ static void sumOfcubes(int *num, int *result)
{
    int sum = 0; // fixed: original read "intsum = 0;" (missing space)
    int i;       // fixed: original read "inti;"
    for (i = 0; i < DATA_SIZE; i++) {
        sum += num[i] * num[i] * num[i];
    }
    *result = sum;
}

int main()
{
    // CUDA init
    if (!InitCUDA()) {
        return 0;
    }
    // Generate the random input.
    GenerateNumbers(data, DATA_SIZE);

    int *gpudata, *result;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int));
    // Copy the random numbers into device memory.
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);

    sumOfcubes<<<1, 1, 0>>>(gpudata, result);

    // fixed: `sum` must be declared before use, and cudaMemcpy needs
    // its address (the original passed the value of an undeclared name).
    int sum = 0;
    cudaMemcpy(&sum, result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    printf("GPUsum: %d \n", sum);

    // CPU reference computation for verification.
    sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", sum);
    getchar();
    return 0;
}
// Program 2: cube-sum reduction with one block of THREAD_NUM threads.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// CUDA Runtime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

#define DATA_SIZE 1048576
#define THREAD_NUM 1024 // raised from 256 to 1024

int data[DATA_SIZE]; // host-side input, random digits 0-9

// Fill `number[0..size-1]` with pseudo-random integers in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}

// Kernel: thread `tid` accumulates the strided slice
// i = tid, tid + THREAD_NUM, ... into result[tid].
// *time receives the elapsed device clock() ticks measured by thread 0.
__global__ static void sumOfSquares(int *num, int *result, clock_t *time)
{
    const int tid = threadIdx.x;
    int sum = 0;
    int i;
    clock_t start;
    // Only thread 0 (threadIdx.x == 0) records the start time.
    if (tid == 0) start = clock();
    // Interleaved (strided) partitioning: consecutive threads read
    // consecutive elements, unlike contiguous per-thread chunks.
    for (i = tid; i < DATA_SIZE; i += THREAD_NUM)
    {
        sum += num[i] * num[i] * num[i];
    }
    result[tid] = sum;
    // Only thread 0 records the elapsed time.
    if (tid == 0) *time = clock() - start;
}

int main()
{
    // Generate the random input.
    GenerateNumbers(data, DATA_SIZE);

    /* Device buffers: input, one partial sum per thread, one timestamp. */
    int *gpudata, *result;
    clock_t *time;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t));
    // Copy the random numbers into device memory.
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);

    // Launch the kernel: 1 block, THREAD_NUM threads, 0 bytes shared memory.
    sumOfSquares<<<1, THREAD_NUM, 0>>>(gpudata, result, time);

    int sum[THREAD_NUM];
    clock_t time_use;
    // Copy results back from device memory (blocking copies).
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    // fixed: original passed `time_use` by value instead of `&time_use`.
    cudaMemcpy(&time_use, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    // Free device memory.
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    /* Host-side reduction of the per-thread partial sums. */
    int final_sum = 0;
    for (int i = 0; i < THREAD_NUM; i++)
    {
        final_sum += sum[i];
    }
    // fixed: clock_t is not guaranteed to be int — cast for printf.
    printf("GPUsum: %d\n time:%ld\n", final_sum, (long)time_use);

    // CPU reference computation for verification.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++)
    {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    getchar();
    return 0;
}
// Program 3: cube-sum reduction with BLOCK_NUM blocks of THREAD_NUM threads.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// CUDA Runtime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

#define DATA_SIZE 1048576
#define THREAD_NUM 256
#define BLOCK_NUM 32

int data[DATA_SIZE]; // host-side input, random digits 0-9

// Fill `number[0..size-1]` with pseudo-random integers in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}

// Kernel: each of the BLOCK_NUM * THREAD_NUM threads accumulates a strided
// slice into result[bid * THREAD_NUM + tid].
// time[bid] / time[bid + BLOCK_NUM] hold each block's start / end clock().
__global__ static void sumOfSquares(int *num, int *result, clock_t *time)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int sum = 0;
    int i;
    // Thread 0 of every block records the block's start timestamp.
    if (tid == 0) time[bid] = clock();
    // The global index combines bid and tid; the stride is the total
    // thread count, so consecutive threads touch consecutive elements
    // (contiguous, coalesced global reads).
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
    {
        sum += num[i] * num[i] * num[i];
    }
    // One result slot per thread across the whole grid.
    result[bid * THREAD_NUM + tid] = sum;
    // Thread 0 of every block records the block's end timestamp.
    if (tid == 0) time[bid + BLOCK_NUM] = clock();
}

int main()
{
    GenerateNumbers(data, DATA_SIZE);

    int *gpudata, *result;
    clock_t *time;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM * BLOCK_NUM);
    cudaMalloc((void**)&time, sizeof(clock_t) * BLOCK_NUM * 2);
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);

    // Launch syntax: kernel<<<blocks, threads, shared-memory bytes>>>(args...);
    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, 0>>>(gpudata, result, time);

    int sum[THREAD_NUM * BLOCK_NUM];
    clock_t time_use[BLOCK_NUM * 2];
    // Copy results back from device memory (blocking copies).
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(time_use, time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    // Host-side reduction of every thread's partial sum.
    int final_sum = 0;
    for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++)
    {
        final_sum += sum[i];
    }

    // Timing strategy: total GPU time = latest end timestamp minus
    // earliest start timestamp across all blocks.
    clock_t min_start, max_end;
    min_start = time_use[0];
    max_end = time_use[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++)
    {
        if (min_start > time_use[i]) min_start = time_use[i];
        if (max_end < time_use[i + BLOCK_NUM]) max_end = time_use[i + BLOCK_NUM];
    }
    // fixed: clock_t difference printed with %d — cast to long for printf.
    printf("GPUsum: %d gputime: %ld\n", final_sum, (long)(max_end - min_start));

    // CPU reference computation for verification.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++)
    {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    getchar();
    return 0;
}
Shared Memory
是一個block 中所有thread 都能使用的共享記憶體,存取的速度相當快,存取shared memory 的速度和存取暫存器相同,不需要擔心latency 的問題。
可以直接利用__shared__宣告一個shared memory變數
__shared__ float temp[THREAD_NUM * 3];
Shared memory 有時會出現儲存體衝突(bank conflict)的問題:
例如:早期的GPU每個SM有16KB 的shared memory,分成16 個bank(較新的架構通常是32 個bank)
•如果同時每個thread 是存取不同的bank,就不會有問題
•如果同時有兩個(或更多)threads 存取同一個bank 的資料,就會發生bank conflict,這些threads 就必須照順序去存取,而無法同時存取shared memory 了。
//多block多thread 使用shared memory
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//CUDA RunTime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"
#define DATA_SIZE 1048576 // number of input elements (2^20 ints)
#define THREAD_NUM 256    // threads per block
#define BLOCK_NUM 32      // blocks in the grid
int data[DATA_SIZE];      // host-side input, filled with random digits 0-9
// Fill `number[0..size-1]` with pseudo-random decimal digits (0-9).
void GenerateNumbers(int *number, int size)
{
    int idx = 0;
    while (idx < size) {
        number[idx] = rand() % 10;
        ++idx;
    }
}
// Kernel: per-block cube-sum using dynamic shared memory.
// Expected launch: <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>> —
// the third launch argument sizes the dynamic `shared[]` array.
// Each thread accumulates a strided partial sum of num[i]^3 into
// shared[threadIdx.x]; thread 0 then folds the block's partials into
// result[blockIdx.x]. time[bid] / time[bid + BLOCK_NUM] receive the
// block's start / end clock() ticks.
__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
// One int of dynamic shared memory per thread (partial sums).
extern __shared__ int shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
shared[tid] = 0; // shared memory is uninitialized — clear before accumulating
int i;
// Thread 0 of each block records the block's start timestamp.
// (The original also declared an unused `clock_t start;` — removed.)
if (tid == 0)
time[bid] = clock();
// Strided loop: the global index combines bid and tid, and the stride is
// the total thread count, so consecutive threads touch consecutive
// elements (coalesced global reads).
for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
{
shared[tid] += num[i] * num[i] * num[i];
}
// Barrier: every thread must have written shared[tid] before thread 0
// reads the partial sums.
__syncthreads();
// Thread 0 serially folds the THREAD_NUM partial sums into shared[0].
if (tid == 0)
{
for (i = 1; i < THREAD_NUM; i++) shared[0] += shared[i];
result[bid] = shared[0];
}
// Thread 0 of each block records the block's end timestamp.
if (tid == 0)
time[bid + BLOCK_NUM] = clock();
}
// Host driver: fills the input, runs the shared-memory kernel, reduces the
// per-block partial sums on the CPU, and reports GPU time and a CPU check.
int main()
{
GenerateNumbers(data, DATA_SIZE);
int* gpudata, *result;
clock_t* time;
// Device buffers: input data, one partial sum per block, and
// 2 * BLOCK_NUM timestamps (start + end per block).
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)*BLOCK_NUM);
cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
// Launch syntax: kernel<<<blocks, threads, shared-memory bytes>>>(args...);
// one int of shared memory per thread for the in-block reduction.
sumOfSquares <<< BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int) >>>(gpudata, result, time);
int sum[BLOCK_NUM];
clock_t time_use[BLOCK_NUM * 2];
// Blocking copies back to host. Arrays decay to pointers, so no `&`
// (the original `&sum` happened to yield the same address).
cudaMemcpy(sum, result, sizeof(int)*BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(time_use, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
// Host-side reduction of the per-block partial sums.
int final_sum = 0;
for (int i = 0; i < BLOCK_NUM; i++)
{
final_sum += sum[i];
}
// Timing strategy: total GPU time = latest end timestamp minus the
// earliest start timestamp across all blocks.
clock_t min_start, max_end;
min_start = time_use[0];
max_end = time_use[BLOCK_NUM];
for (int i = 1; i < BLOCK_NUM; i++)
{
if (min_start > time_use[i]) min_start = time_use[i];
if (max_end < time_use[i + BLOCK_NUM])
max_end = time_use[i + BLOCK_NUM];
}
// fixed: clock_t difference printed with %d — cast to long for printf.
printf("GPUsum: %d gputime: %ld\n", final_sum, (long)(max_end - min_start));
// CPU reference computation for verification.
final_sum = 0;
for (int i = 0; i < DATA_SIZE; i++)
{
final_sum += data[i] * data[i] * data[i];
}
printf("CPUsum: %d \n", final_sum);
getchar();
return 0;
}
Block內完成部分加和工作,所以gputime增加了
//多block多thread
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//CUDA RunTime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"
#define DATA_SIZE 1048576 // number of input elements (2^20 ints)
#define THREAD_NUM 256    // threads per block (must be a power of two for the tree reduction)
#define BLOCK_NUM 32      // blocks in the grid
int data[DATA_SIZE];      // host-side input, filled with random digits 0-9
// Populate `number[0..size-1]` with random decimal digits (0-9).
void GenerateNumbers(int *number, int size)
{
    for (int *p = number; p != number + size; ++p) {
        *p = rand() % 10;
    }
}
// Kernel: per-block cube-sum with an in-block shared-memory tree reduction.
// Expected launch: <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>> —
// the third launch argument sizes the dynamic `shared[]` array below.
// result[bid] receives each block's total; time[bid] / time[bid + BLOCK_NUM]
// receive the block's start / end clock() ticks.
__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
// One int of dynamic shared memory per thread (partial sums).
extern __shared__ int shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
shared[tid] = 0; // shared memory is uninitialized — clear before accumulating
int i;
// Thread 0 of each block records the block's start timestamp.
if (tid == 0)
time[bid] = clock();
// Strided loop over the input: the global index combines bid and tid, and
// the stride is the total thread count, so consecutive threads touch
// consecutive elements (coalesced global reads).
for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
{
shared[tid] += num[i] * num[i] * num[i];
}
// Barrier: every thread must have written shared[tid] before any thread
// reads a neighbour's partial sum.
__syncthreads();
// Tree (pairwise) reduction. On pass k (0-based): offset = 2^k and
// mask = 2^(k+1) - 1, so exactly the threads whose low k+1 bits of tid
// are zero add the partial sum `offset` slots away. After log2(THREAD_NUM)
// passes shared[0] holds the block total (assumes THREAD_NUM is a power
// of two).
int offset = 1, mask = 1;
while (offset < THREAD_NUM)
{
if ((tid & mask) == 0)
{
shared[tid] += shared[tid + offset];
}
offset += offset;
mask = offset + mask;
// Safe barrier placement: the while-condition is uniform across the
// block and this __syncthreads() sits outside the divergent `if`, so
// every thread reaches it on every pass.
__syncthreads();
}
// Thread 0 publishes the block result and the end timestamp.
if (tid == 0)
{
result[bid] = shared[0];
time[bid + BLOCK_NUM] = clock();
}
}
// Host driver: fills the input, runs the tree-reduction kernel, reduces the
// per-block partial sums on the CPU, and reports GPU time and a CPU check.
int main()
{
GenerateNumbers(data, DATA_SIZE);
int* gpudata, *result;
clock_t* time;
// Device buffers: input data, one partial sum per block, and
// 2 * BLOCK_NUM timestamps (start + end per block).
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int)*BLOCK_NUM);
cudaMalloc((void**)&time, sizeof(clock_t)* BLOCK_NUM * 2);
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
// Launch syntax: kernel<<<blocks, threads, shared-memory bytes>>>(args...);
// one int of shared memory per thread for the tree reduction.
sumOfSquares <<< BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int) >>>(gpudata, result, time);
int sum[BLOCK_NUM];
clock_t time_use[BLOCK_NUM * 2];
// Blocking copies back to host. Arrays decay to pointers, so no `&`
// (the original `&sum` happened to yield the same address).
cudaMemcpy(sum, result, sizeof(int)*BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(time_use, time, sizeof(clock_t)* BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
// Host-side reduction of the per-block partial sums.
int final_sum = 0;
for (int i = 0; i < BLOCK_NUM; i++)
{
final_sum += sum[i];
}
// Timing strategy: total GPU time = latest end timestamp minus the
// earliest start timestamp across all blocks.
clock_t min_start, max_end;
min_start = time_use[0];
max_end = time_use[BLOCK_NUM];
for (int i = 1; i < BLOCK_NUM; i++)
{
if (min_start > time_use[i]) min_start = time_use[i];
if (max_end < time_use[i + BLOCK_NUM])
max_end = time_use[i + BLOCK_NUM];
}
// fixed: clock_t difference printed with %d — cast to long for printf.
printf("GPUsum: %d gputime: %ld\n", final_sum, (long)(max_end - min_start));
// CPU reference computation for verification.
final_sum = 0;
for (int i = 0; i < DATA_SIZE; i++)
{
final_sum += data[i] * data[i] * data[i];
}
printf("CPUsum: %d \n", final_sum);
getchar();
return 0;
}