CUDA:在 GPU 和 CPU 之間複製陣列
阿新 • • 發佈:2019-01-05
/*
 * CUDACHECK(cmd)
 *
 * Evaluates `cmd` (an expression yielding a cudaError_t) exactly once. If the
 * result is not cudaSuccess, prints the source file, line and the runtime's
 * error string, then terminates the process with exit status 1.
 *
 * Notes:
 *  - The argument is parenthesised and the local uses a reserved-looking name
 *    so that a call such as CUDACHECK(cudaEventRecord(e)) does not accidentally
 *    bind to the macro's own variable.
 *  - The diagnostic is written to stderr so it is not lost in buffered or
 *    redirected stdout.
 *  - Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define CUDACHECK(cmd) do { \
    cudaError_t cudacheck_err_ = (cmd); \
    if (cudacheck_err_ != cudaSuccess) { \
        fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", \
                __FILE__, __LINE__, cudaGetErrorString(cudacheck_err_)); \
        exit(1); \
    } \
} while (0)
int nDev=2;
float** sendbuff = (float**)malloc(nDev * sizeof(float*));
float** recvbuff = (float**)malloc(nDev * sizeof(float*));
cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
int size=4;
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(localRank*nDev + i));
CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof (float)));
CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
float *h_arr;
h_arr = (float *)malloc(size*sizeof(float));
for (int i=0; i<size; ++i)
h_arr[i] = i; // Or other values
CUDACHECK(cudaMemcpy(sendbuff[i], h_arr, size*sizeof(float), cudaMemcpyHostToDevice)); //將資料從CPU傳遞到GPU
CUDACHECK(cudaStreamCreate(s+i));
}
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(localRank*nDev + i));
float* recvCPU=(float*)malloc(size*sizeof(float)); //將資料從cuda 拷貝到cpu
CUDACHECK(cudaMemcpy(recvCPU, sendbuff[i], sizeof(float) * size, cudaMemcpyDeviceToHost));
printf("Begin Reduce Dev is %d of process myRank is %d, RecvBUf is %f,%f,%f,%f\n",i,myRank
,recvCPU[0],recvCPU[1],recvCPU[2],recvCPU[3]);
}