1. 程式人生 > >CUDA之單thread單block&多thread單block&多thread多block

CUDA之單thread單block&多thread單block&多thread多block

用簡單的立方和歸約來舉例:

//單thread單block
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define DATA_SIZE 1048576
int data[DATA_SIZE];
// Fill the first `size` slots of `number` with pseudo-random digits in [0, 9].
// Values are drawn from rand(), one call per element, in index order.
void GenerateNumbers(int *number, int size)
{
	int idx = 0;
	while (idx < size) {
		number[idx] = rand() % 10;
		++idx;
	}
}
// CUDA initialisation.
// Picks the first device whose properties can be queried and whose compute
// capability major version is at least 1, then makes it the current device.
// Returns true when a device was selected; otherwise prints a message to
// stderr and returns false.
bool InitCUDA()
{
	// Start at 0 so that a failed query is handled like "no devices" instead
	// of reading an uninitialised value.
	int count = 0;
	if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
		fprintf(stderr, "There is no device.\n");
		return false;
	}
	int i;
	for (i = 0; i < count; i++) {
		cudaDeviceProp prop;
		if (cudaGetDeviceProperties(&prop, i) == cudaSuccess && prop.major >= 1) {
			break;
		}
	}
	// The loop ran to completion: no device satisfied the check above.
	if (i == count) {
		fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
		return false;
	}
	cudaError_t err = cudaSetDevice(i);
	if (err != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
		return false;
	}
	return true;
}
// __global__ function (runs on the GPU): sum of cubes.
// Adds num[i]^3 for all DATA_SIZE elements of `num` and stores the total in
// *result. The loop is fully serial: each thread that executes the kernel
// walks the whole array and writes the same location, so the accompanying
// host code launches it with a single thread.
__global__ static void sumOfcubes(int *num, int* result)
{
	int sum = 0;
	for (int i = 0; i < DATA_SIZE; i++) {
		sum += num[i] * num[i] * num[i];
	}
	*result = sum;
}
// Single-thread / single-block demo: computes the sum of cubes of `data` on
// the GPU and on the CPU and prints both.
int main()
{
	// CUDA initialisation; as before, exit with status 0 when no device is usable.
	if (!InitCUDA()) {
		return 0;
	}
	// Generate the random input on the host.
	GenerateNumbers(data, DATA_SIZE);

	int *d_data = NULL, *d_result = NULL;
	int gpu_sum = 0;

	// Each step runs only if everything before it succeeded.
	cudaError_t err = cudaMalloc((void**)&d_data, sizeof(int) * DATA_SIZE);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_result, sizeof(int));
	// Copy the generated numbers into device memory.
	if (err == cudaSuccess)
		err = cudaMemcpy(d_data, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
	if (err == cudaSuccess) {
		sumOfcubes<<<1, 1, 0>>>(d_data, d_result);
		// A kernel launch returns nothing; launch errors are read back here.
		err = cudaGetLastError();
	}
	// Copy the single result back; the destination must be a host pointer.
	if (err == cudaSuccess)
		err = cudaMemcpy(&gpu_sum, d_result, sizeof(int), cudaMemcpyDeviceToHost);

	// cudaFree on a null pointer performs no operation, so this is safe on every path.
	cudaFree(d_data);
	cudaFree(d_result);

	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
		return 1;
	}
	printf("GPUsum: %d \n", gpu_sum);

	// CPU reference for comparison.
	int cpu_sum = 0;
	for (int i = 0; i < DATA_SIZE; i++) {
		cpu_sum += data[i] * data[i] * data[i];
	}
	printf("CPUsum: %d \n", cpu_sum);
	getchar();
	return 0;
}
//單block多thread

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//CUDA RunTime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

#define DATA_SIZE 1048576
#define THREAD_NUM 1024 //256--->1024
int data[DATA_SIZE];

// Writes `size` pseudo-random digits (each rand() % 10, i.e. 0..9) into
// `number`, filling indices 0 .. size-1 in ascending order.
void GenerateNumbers(int *number, int size)
{
	int idx = 0;
	while (idx < size) {
		number[idx] = rand() % 10;
		++idx;
	}
}
// __global__ function (runs on the GPU): sum of cubes, one partial per thread.
// Despite its name, the kernel accumulates cubes (num[i]^3), not squares.
//   num    - input array of DATA_SIZE ints
//   result - output array; thread t writes its partial sum to result[t]
//   time   - output; thread 0 stores the clock() ticks it spent in the kernel
// The index stride is the THREAD_NUM macro, so the kernel expects to be
// launched as one block of THREAD_NUM threads (as the host code here does).
__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;

	// Only thread 0 records the start time; other threads never read `start`.
	clock_t start = 0;
	if (tid == 0) start = clock();

	// Thread t handles elements t, t + THREAD_NUM, t + 2*THREAD_NUM, ...
	// so neighbouring threads read neighbouring elements on each iteration.
	int sum = 0;
	for (int i = tid; i < DATA_SIZE; i += THREAD_NUM)
	{
		sum += num[i] * num[i] * num[i];
	}
	result[tid] = sum;

	// Elapsed ticks, measured by thread 0 only.
	if (tid == 0)
		*time = clock() - start;
}
// Single-block / multi-thread demo: the GPU produces THREAD_NUM partial sums,
// the host adds them up, then the same sum is recomputed on the CPU.
int main()
{
	// Generate the random input on the host.
	GenerateNumbers(data, DATA_SIZE);

	int *d_data = NULL, *d_result = NULL;
	clock_t *d_time = NULL;
	int sum[THREAD_NUM];
	clock_t time_use = 0;

	// Device buffers: input data, per-thread partial sums, elapsed ticks.
	// Each step runs only if everything before it succeeded.
	cudaError_t err = cudaMalloc((void**)&d_data, sizeof(int) * DATA_SIZE);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_result, sizeof(int) * THREAD_NUM);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_time, sizeof(clock_t));
	// Copy the generated numbers into device memory.
	if (err == cudaSuccess)
		err = cudaMemcpy(d_data, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
	if (err == cudaSuccess) {
		// Launch the kernel: one block of THREAD_NUM threads.
		sumOfSquares<<<1, THREAD_NUM, 0>>>(d_data, d_result, d_time);
		// A kernel launch returns nothing; launch errors are read back here.
		err = cudaGetLastError();
	}
	// Copy the results from device memory back to host memory.
	if (err == cudaSuccess)
		err = cudaMemcpy(sum, d_result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
	// The destination must be the address of time_use, not its value.
	if (err == cudaSuccess)
		err = cudaMemcpy(&time_use, d_time, sizeof(clock_t), cudaMemcpyDeviceToHost);

	// cudaFree on a null pointer performs no operation, so this is safe on every path.
	cudaFree(d_data);
	cudaFree(d_result);
	cudaFree(d_time);

	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Reduce the per-thread partial sums on the host.
	int final_sum = 0;
	for (int i = 0; i < THREAD_NUM; i++)
	{
		final_sum += sum[i];
	}
	// clock_t is not int; convert explicitly so the format specifier matches.
	printf("GPUsum: %d\n time:%ld\n", final_sum, (long)time_use);

	// CPU reference for comparison.
	final_sum = 0;
	for (int i = 0; i < DATA_SIZE; i++) {
		final_sum += data[i] * data[i] * data[i];
	}
	printf("CPUsum: %d \n", final_sum);
	getchar();
	return 0;
}
//多block多thread

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//CUDA RunTime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

#define DATA_SIZE 1048576
#define THREAD_NUM 256
#define BLOCK_NUM 32
int data[DATA_SIZE];

// Populate number[0 .. size-1] with random digits; every element is
// rand() % 10, generated in increasing index order.
void GenerateNumbers(int *number, int size)
{
	int idx = 0;
	while (idx < size) {
		number[idx] = rand() % 10;
		++idx;
	}
}
// __global__ function (runs on the GPU): sum of cubes, one partial per thread
// across several blocks. Despite its name, it accumulates cubes (num[i]^3).
//   num    - input array of DATA_SIZE ints
//   result - output array of BLOCK_NUM * THREAD_NUM partial sums, indexed by
//            bid * THREAD_NUM + tid
//   time   - output array of 2 * BLOCK_NUM entries: time[bid] is the block's
//            start clock(), time[bid + BLOCK_NUM] its end clock()
// Indexing uses the THREAD_NUM and BLOCK_NUM macros, so the kernel expects to
// be launched as <<<BLOCK_NUM, THREAD_NUM>>> (as the host code here does).
__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;

	// Thread 0 of every block records that block's start time.
	if (tid == 0)
		time[bid] = clock();

	// The starting index combines block and thread ids and the stride is the
	// total thread count, so neighbouring threads read neighbouring elements.
	int sum = 0;
	for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
		sum += num[i] * num[i] * num[i];
	}
	// One result slot per thread in the whole grid.
	result[bid * THREAD_NUM + tid] = sum;

	// Thread 0 of every block records that block's end time.
	if (tid == 0)
		time[bid + BLOCK_NUM] = clock();
}
// Multi-block / multi-thread demo: the GPU produces one partial sum per
// thread, the host adds them up and reports the tick span covered by all
// blocks, then the same sum is recomputed on the CPU.
int main()
{
	GenerateNumbers(data, DATA_SIZE);

	int *d_data = NULL, *d_result = NULL;
	clock_t *d_time = NULL;
	int sum[THREAD_NUM * BLOCK_NUM];
	clock_t time_use[BLOCK_NUM * 2];

	// Each step runs only if everything before it succeeded.
	cudaError_t err = cudaMalloc((void**)&d_data, sizeof(int) * DATA_SIZE);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_result, sizeof(int) * THREAD_NUM * BLOCK_NUM);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_time, sizeof(clock_t) * BLOCK_NUM * 2);
	if (err == cudaSuccess)
		err = cudaMemcpy(d_data, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
	if (err == cudaSuccess) {
		// Launch syntax: kernel<<<blocks, threads, shared memory bytes>>>(args...);
		sumOfSquares<<<BLOCK_NUM, THREAD_NUM, 0>>>(d_data, d_result, d_time);
		// A kernel launch returns nothing; launch errors are read back here.
		err = cudaGetLastError();
	}
	// Copy the results from device memory back to host memory.
	if (err == cudaSuccess)
		err = cudaMemcpy(sum, d_result, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
	if (err == cudaSuccess)
		err = cudaMemcpy(time_use, d_time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);

	// cudaFree on a null pointer performs no operation, so this is safe on every path.
	cudaFree(d_data);
	cudaFree(d_result);
	cudaFree(d_time);

	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Reduce the per-thread partial sums on the host.
	int final_sum = 0;
	for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++)
	{
		final_sum += sum[i];
	}

	// Timing: earliest block start subtracted from latest block end.
	// Entries [0, BLOCK_NUM) hold starts, [BLOCK_NUM, 2*BLOCK_NUM) hold ends.
	clock_t min_start = time_use[0];
	clock_t max_end = time_use[BLOCK_NUM];
	for (int i = 1; i < BLOCK_NUM; i++)
	{
		if (min_start > time_use[i]) min_start = time_use[i];
		if (max_end < time_use[i + BLOCK_NUM])
			max_end = time_use[i + BLOCK_NUM];
	}
	// clock_t is not int; convert explicitly so the format specifier matches.
	printf("GPUsum: %d gputime: %ld\n", final_sum, (long)(max_end - min_start));

	// CPU reference for comparison.
	final_sum = 0;
	for (int i = 0; i < DATA_SIZE; i++)
	{
		final_sum += data[i] * data[i] * data[i];
	}
	printf("CPUsum: %d \n", final_sum);
	getchar();
	return 0;
}

ShareMemory
是一個block 中所有thread 都能使用的共享記憶體,存取的速度相當快,存取shared memory 的速度和存取暫存器相同,不需要擔心latency 的問題。
可以直接利用__shared__宣告一個shared memory變數
__shared__ float temp[THREAD_NUM * 3];
Shared memory 有時會出現儲存體衝突(bank conflict)的問題:
例如:每個SM有16KB 的shared memory,分成16 個bank
•如果同時每個thread 是存取不同的bank,就不會有問題
•如果同時有兩個(或更多)threads 存取同一個bank 的資料,就會發生bank conflict,這些threads 就必須照順序去存取,而無法同時存取shared memory 了。

//多block多thread 使用sharememory

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//CUDA RunTime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

#define DATA_SIZE 1048576
#define THREAD_NUM 256
#define BLOCK_NUM 32
int data[DATA_SIZE];
 
// Stores one random digit (rand() % 10) in each of the first `size`
// elements of `number`, visiting the elements from index 0 upward.
void GenerateNumbers(int *number, int size)
{
	int idx = 0;
	while (idx < size) {
		number[idx] = rand() % 10;
		++idx;
	}
}
// __global__ function (runs on the GPU): sum of cubes with a per-block
// reduction in shared memory. Despite its name, it accumulates cubes.
//   num    - input array of DATA_SIZE ints
//   result - output array of BLOCK_NUM ints; result[bid] is block bid's total
//   time   - output array of 2 * BLOCK_NUM entries: time[bid] is the block's
//            start clock(), time[bid + BLOCK_NUM] its end clock()
// Launch requirements: <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>;
// the dynamic shared array must hold one int per thread of the block.
__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;

	// Thread 0 of every block records that block's start time.
	if (tid == 0)
		time[bid] = clock();

	// Accumulate in a local variable and store to shared memory once,
	// instead of a read-modify-write of shared[tid] on every iteration.
	int partial = 0;
	for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
		partial += num[i] * num[i] * num[i];
	}
	shared[tid] = partial;

	// Barrier: every thread's partial must be in shared[] before thread 0 reads them.
	__syncthreads();

	// Thread 0 adds up the block's partial sums serially, publishes the
	// block total and records the block's end time.
	if (tid == 0)
	{
		int total = shared[0];
		for (int i = 1; i < THREAD_NUM; i++) total += shared[i];
		result[bid] = total;
		time[bid + BLOCK_NUM] = clock();
	}
}
// Shared-memory demo: each block reduces its own partial sums on the GPU, the
// host adds the BLOCK_NUM block totals and reports the tick span covered by
// all blocks, then the same sum is recomputed on the CPU.
int main()
{
	GenerateNumbers(data, DATA_SIZE);

	int *d_data = NULL, *d_result = NULL;
	clock_t *d_time = NULL;
	int sum[BLOCK_NUM];
	clock_t time_use[BLOCK_NUM * 2];

	// Each step runs only if everything before it succeeded.
	cudaError_t err = cudaMalloc((void**)&d_data, sizeof(int) * DATA_SIZE);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_result, sizeof(int) * BLOCK_NUM);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_time, sizeof(clock_t) * BLOCK_NUM * 2);
	if (err == cudaSuccess)
		err = cudaMemcpy(d_data, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
	if (err == cudaSuccess) {
		// Launch syntax: kernel<<<blocks, threads, shared memory bytes>>>(args...);
		// the third argument gives the kernel one int of shared memory per thread.
		sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(d_data, d_result, d_time);
		// A kernel launch returns nothing; launch errors are read back here.
		err = cudaGetLastError();
	}
	// Copy the results from device memory back to host memory.
	if (err == cudaSuccess)
		err = cudaMemcpy(sum, d_result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
	if (err == cudaSuccess)
		err = cudaMemcpy(time_use, d_time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);

	// cudaFree on a null pointer performs no operation, so this is safe on every path.
	cudaFree(d_data);
	cudaFree(d_result);
	cudaFree(d_time);

	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Add up the per-block totals on the host.
	int final_sum = 0;
	for (int i = 0; i < BLOCK_NUM; i++)
	{
		final_sum += sum[i];
	}

	// Timing: earliest block start subtracted from latest block end.
	// Entries [0, BLOCK_NUM) hold starts, [BLOCK_NUM, 2*BLOCK_NUM) hold ends.
	clock_t min_start = time_use[0];
	clock_t max_end = time_use[BLOCK_NUM];
	for (int i = 1; i < BLOCK_NUM; i++)
	{
		if (min_start > time_use[i]) min_start = time_use[i];
		if (max_end < time_use[i + BLOCK_NUM])
			max_end = time_use[i + BLOCK_NUM];
	}
	// clock_t is not int; convert explicitly so the format specifier matches.
	printf("GPUsum: %d gputime: %ld\n", final_sum, (long)(max_end - min_start));

	// CPU reference for comparison.
	final_sum = 0;
	for (int i = 0; i < DATA_SIZE; i++)
	{
		final_sum += data[i] * data[i] * data[i];
	}
	printf("CPUsum: %d \n", final_sum);
	getchar();
	return 0;
}

由於每個block內改由thread 0循序完成部分加和工作,kernel內的計算量增加,所以gputime增加了

//多block多thread 使用sharememory + 樹狀加法

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//CUDA RunTime API
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

#define DATA_SIZE 1048576
#define THREAD_NUM 256
#define BLOCK_NUM 32
int data[DATA_SIZE];
 
// Random-digit generator: sets number[k] = rand() % 10 for k = 0 .. size-1,
// in that order, so the output depends on the current rand() state.
void GenerateNumbers(int *number, int size)
{
	int idx = 0;
	while (idx < size) {
		number[idx] = rand() % 10;
		++idx;
	}
}
// __global__ function (runs on the GPU): sum of cubes with a per-block tree
// reduction in shared memory. Despite its name, it accumulates cubes.
//   num    - input array of DATA_SIZE ints
//   result - output array of BLOCK_NUM ints; result[bid] is block bid's total
//   time   - output array of 2 * BLOCK_NUM entries: time[bid] is the block's
//            start clock(), time[bid + BLOCK_NUM] its end clock()
// Launch requirements: <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>;
// the dynamic shared array must hold one int per thread of the block.
__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
	extern __shared__ int shared[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;

	// Thread 0 of every block records that block's start time.
	if (tid == 0)
		time[bid] = clock();

	// Accumulate in a local variable and store to shared memory once,
	// instead of a read-modify-write of shared[tid] on every iteration.
	int partial = 0;
	for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
	{
		partial += num[i] * num[i] * num[i];
	}
	shared[tid] = partial;

	// Barrier: every thread's partial must be in shared[] before the reduction.
	__syncthreads();

	// Tree reduction. In the round with a given `offset`, the threads whose
	// index is a multiple of 2 * offset add in the element `offset` slots to
	// their right; `mask` is 2 * offset - 1 and selects those threads.
	int offset = 1, mask = 1;
	while (offset < THREAD_NUM)
	{
		// The bounds test keeps the read inside shared[] even when THREAD_NUM
		// is not a power of two.
		if ((tid & mask) == 0 && tid + offset < THREAD_NUM)
		{
			shared[tid] += shared[tid + offset];
		}
		offset += offset;
		mask = offset + mask;
		// Outside the branch so that every thread of the block reaches it.
		__syncthreads();
	}

	// shared[0] now holds the block total; thread 0 publishes it and records
	// the block's end time.
	if (tid == 0)
	{
		result[bid] = shared[0];
		time[bid + BLOCK_NUM] = clock();
	}
}

// Tree-reduction demo: each block reduces its own partial sums on the GPU,
// the host adds the BLOCK_NUM block totals and reports the tick span covered
// by all blocks, then the same sum is recomputed on the CPU.
int main()
{
	GenerateNumbers(data, DATA_SIZE);

	int *d_data = NULL, *d_result = NULL;
	clock_t *d_time = NULL;
	int sum[BLOCK_NUM];
	clock_t time_use[BLOCK_NUM * 2];

	// Each step runs only if everything before it succeeded.
	cudaError_t err = cudaMalloc((void**)&d_data, sizeof(int) * DATA_SIZE);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_result, sizeof(int) * BLOCK_NUM);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&d_time, sizeof(clock_t) * BLOCK_NUM * 2);
	if (err == cudaSuccess)
		err = cudaMemcpy(d_data, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
	if (err == cudaSuccess) {
		// Launch syntax: kernel<<<blocks, threads, shared memory bytes>>>(args...);
		// the third argument gives the kernel one int of shared memory per thread.
		sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(d_data, d_result, d_time);
		// A kernel launch returns nothing; launch errors are read back here.
		err = cudaGetLastError();
	}
	// Copy the results from device memory back to host memory.
	if (err == cudaSuccess)
		err = cudaMemcpy(sum, d_result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
	if (err == cudaSuccess)
		err = cudaMemcpy(time_use, d_time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);

	// cudaFree on a null pointer performs no operation, so this is safe on every path.
	cudaFree(d_data);
	cudaFree(d_result);
	cudaFree(d_time);

	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Add up the per-block totals on the host.
	int final_sum = 0;
	for (int i = 0; i < BLOCK_NUM; i++)
	{
		final_sum += sum[i];
	}

	// Timing: earliest block start subtracted from latest block end.
	// Entries [0, BLOCK_NUM) hold starts, [BLOCK_NUM, 2*BLOCK_NUM) hold ends.
	clock_t min_start = time_use[0];
	clock_t max_end = time_use[BLOCK_NUM];
	for (int i = 1; i < BLOCK_NUM; i++)
	{
		if (min_start > time_use[i]) min_start = time_use[i];
		if (max_end < time_use[i + BLOCK_NUM])
			max_end = time_use[i + BLOCK_NUM];
	}
	// clock_t is not int; convert explicitly so the format specifier matches.
	printf("GPUsum: %d gputime: %ld\n", final_sum, (long)(max_end - min_start));

	// CPU reference for comparison.
	final_sum = 0;
	for (int i = 0; i < DATA_SIZE; i++)
	{
		final_sum += data[i] * data[i] * data[i];
	}
	printf("CPUsum: %d \n", final_sum);
	getchar();
	return 0;
}