cuda 單block多執行緒

cuda 單block多執行緒

cuda單block多thread的實現

#include <iostream>
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>

// Element-wise vector addition: d_c[i] = d_a[i] + d_b[i].
// Expected launch: a single block with >= n threads (<<<1, n>>>); each thread
// handles exactly one element. d_a, d_b, d_c are device pointers to at least
// n floats. The bounds guard makes the kernel safe if the launch rounds the
// thread count up past n.
__global__ void gpu_add(float* d_a, float* d_b, float* d_c, int n)
{
	int idx = threadIdx.x;  // single-block launch, so threadIdx.x is the global index
	if (idx < n)
	{
		d_c[idx] = d_a[idx] + d_b[idx];
	}
	// Removed dead code: a local `IDX` was computed and incremented but never
	// read, so it had no effect on the kernel's output.
}

// Abort with a message if a CUDA runtime call failed. Kernel launches return
// no status directly, so main() pairs each launch with cudaGetLastError().
#define CUDA_CHECK(call)                                                   \
	do {                                                                   \
		cudaError_t err_ = (call);                                         \
		if (err_ != cudaSuccess) {                                         \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
			        cudaGetErrorString(err_));                             \
			exit(EXIT_FAILURE);                                            \
		}                                                                  \
	} while (0)

// Host driver: fill two n-element float arrays with random values, add them
// on the GPU with a single block of n threads, copy the result back and
// print it. Returns 0 on success; aborts via CUDA_CHECK on any CUDA error.
int main()
{
	float *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
	int n = 1024;                       // n <= 1024 so one block can cover it
	size_t nBytes = n * sizeof(float);
	time_t t;

	h_a = (float*)malloc(nBytes);
	h_b = (float*)malloc(nBytes);
	h_c = (float*)malloc(nBytes);
	if (h_a == NULL || h_b == NULL || h_c == NULL)
	{
		fprintf(stderr, "host allocation failed\n");
		exit(EXIT_FAILURE);
	}

	// Seed with the current time, then fill inputs with values in [0, 25.5].
	srand((unsigned int)time(&t));
	for(int i = 0;i < n;++i)
	{
		h_a[i] = (float)(rand()&0xff)/10.0f;
		h_b[i] = (float)(rand()&0xff)/10.0f;
		std::cout << "h_a[" << i << "]=" << h_a[i] << "\t";
		std::cout << "h_b[" << i << "]=" << h_b[i] << "\n";
	}

	CUDA_CHECK(cudaMalloc((void**)&d_a,nBytes));
	CUDA_CHECK(cudaMalloc((void**)&d_b,nBytes));
	CUDA_CHECK(cudaMalloc((void**)&d_c,nBytes));

	CUDA_CHECK(cudaMemcpy(d_a,h_a,nBytes,cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_b,h_b,nBytes,cudaMemcpyHostToDevice));

	// One block, n threads: each thread adds one element.
	gpu_add<<<1,n>>>(d_a,d_b,d_c,n);
	CUDA_CHECK(cudaGetLastError());     // catch launch-configuration errors

	// This blocking copy also synchronizes with the kernel above.
	CUDA_CHECK(cudaMemcpy(h_c,d_c,nBytes,cudaMemcpyDeviceToHost));

	CUDA_CHECK(cudaFree(d_a));
	CUDA_CHECK(cudaFree(d_b));
	CUDA_CHECK(cudaFree(d_c));

	for(int i = 0;i < n;++i)
	{
		std::cout << "c[" << i<<"]=" <<h_c[i]<<"\n";
	}
	free(h_a);
	free(h_b);
	free(h_c);
	return 0;
}