cuda 單block多執行緒
阿新 • 發佈:2018-11-13
cuda單block多thread的實現
// Single-block, multi-thread CUDA demo: element-wise addition of two float
// arrays (c = a + b) computed on the GPU with one thread per element.
#include <iostream>
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>

// Abort with a readable message whenever a CUDA runtime call fails.
// Kernel launches report errors via cudaGetLastError(), checked after launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Element-wise d_c[i] = d_a[i] + d_b[i] for i in [0, n).
// Launch layout: a single block of n threads (<<<1, n>>>), so threadIdx.x
// alone is the global element index. The bounds guard keeps the kernel safe
// if the launch ever over-provisions threads relative to n.
__global__ void gpu_add(const float* __restrict__ d_a,
                        const float* __restrict__ d_b,
                        float* __restrict__ d_c,
                        int n)
{
    int idx = threadIdx.x;  // single-block launch: thread index == element index
    if (idx < n) {
        d_c[idx] = d_a[idx] + d_b[idx];
    }
}

int main()
{
    // n must not exceed the device's max threads per block (1024 on all
    // modern GPUs), since the whole vector is handled by a single block.
    const int n = 1024;
    const size_t nBytes = n * sizeof(float);
    time_t t;

    // Host-side input/output buffers.
    float* h_a = (float*)malloc(nBytes);
    float* h_b = (float*)malloc(nBytes);
    float* h_c = (float*)malloc(nBytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", nBytes);
        return EXIT_FAILURE;
    }

    // Fill inputs with small pseudo-random values in [0.0, 25.5].
    srand((unsigned int)time(&t));
    for (int i = 0; i < n; ++i) {
        h_a[i] = (float)(rand() & 0xff) / 10.0f;
        h_b[i] = (float)(rand() & 0xff) / 10.0f;
        std::cout << "h_a[" << i << "]=" << h_a[i] << "\t";
        std::cout << "h_b[" << i << "]=" << h_b[i] << "\n";
    }

    // Device buffers.
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc((void**)&d_a, nBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b, nBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c, nBytes));

    // Copy inputs host -> device.
    CUDA_CHECK(cudaMemcpy(d_a, h_a, nBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, nBytes, cudaMemcpyHostToDevice));

    // One block, n threads: each thread adds one element pair.
    gpu_add<<<1, n>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaGetLastError());  // catches bad launch configuration

    // Blocking device -> host copy; also synchronizes with the kernel and
    // surfaces any asynchronous execution error.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    for (int i = 0; i < n; ++i) {
        std::cout << "c[" << i << "]=" << h_c[i] << "\n";
    }

    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}