1. 程式人生 > cuda 在GPU和CPU之間複製陣列

cuda 在GPU和CPU之間複製陣列

// Evaluates a CUDA runtime call exactly once. If the result is anything other
// than cudaSuccess, prints the source file, line and the runtime's error string,
// then terminates the process with exit code 1.
// Wrapped in do { ... } while(0) so that `CUDACHECK(x);` is a single statement.
#define CUDACHECK(cmd) do {                                   \
  cudaError_t e = (cmd);                                      \
  if( e != cudaSuccess ) {                                    \
    printf("Failed: Cuda error %s:%d '%s'\n",                 \
        __FILE__,__LINE__,cudaGetErrorString(e));             \
    exit(1);                                                  \
  }                                                           \
} while(0)

// Host-side fragment: allocate one send and one receive buffer on each device,
// upload a small host array to every send buffer, then read it back and print it.
//
// NOTE(review): `localRank` and `myRank` are not defined in this excerpt; they are
// presumably per-process rank values provided by the surrounding program — confirm.
// NOTE(review): the device buffers and streams are intentionally NOT released here,
// because the "Begin Reduce" message suggests later code still uses them — confirm
// that the cleanup (cudaFree / cudaStreamDestroy / free) happens further down.

int nDev = 2;   // number of devices driven by this process
int size = 4;   // number of floats per buffer

float** sendbuff = (float**)malloc(nDev * sizeof(float*));
float** recvbuff = (float**)malloc(nDev * sizeof(float*));
cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nDev);
if (sendbuff == NULL || recvbuff == NULL || s == NULL) {
  printf("Failed: host allocation %s:%d\n", __FILE__, __LINE__);
  exit(1);
}

for (int i = 0; i < nDev; ++i) {
  CUDACHECK(cudaSetDevice(localRank*nDev + i));
  CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));
  CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
  // The former cudaMemset(sendbuff[i], 1, ...) was dropped: cudaMemset fills
  // bytes (it would not produce 1.0f), and the full-size copy below overwrites
  // the whole buffer anyway.
  CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));

  // Staging array on the host; filled with 0, 1, 2, ... (or other values).
  float* h_arr = (float*)malloc(size * sizeof(float));
  if (h_arr == NULL) {
    printf("Failed: host allocation %s:%d\n", __FILE__, __LINE__);
    exit(1);
  }
  // Separate index name so the device index `i` is not shadowed.
  for (int j = 0; j < size; ++j) h_arr[j] = (float)j;

  // Transfer the data from the CPU to the GPU (blocking copy).
  CUDACHECK(cudaMemcpy(sendbuff[i], h_arr, size*sizeof(float), cudaMemcpyHostToDevice));
  // cudaMemcpy has returned, so the staging array is no longer needed.
  free(h_arr);

  CUDACHECK(cudaStreamCreate(s+i));
}

for (int i = 0; i < nDev; ++i) {
  CUDACHECK(cudaSetDevice(localRank*nDev + i));

  float* recvCPU = (float*)malloc(size * sizeof(float));
  if (recvCPU == NULL) {
    printf("Failed: host allocation %s:%d\n", __FILE__, __LINE__);
    exit(1);
  }

  // Copy the data from the GPU back to the CPU (blocking copy).
  CUDACHECK(cudaMemcpy(recvCPU, sendbuff[i], sizeof(float) * size, cudaMemcpyDeviceToHost));

  // Prints exactly four elements, which matches size == 4 above.
  printf("Begin Reduce Dev is %d of process myRank is %d, RecvBUf is %f,%f,%f,%f\n",i,myRank ,recvCPU[0],recvCPU[1],recvCPU[2],recvCPU[3]);

  free(recvCPU);
}