1. 程式人生 > >Cuda learn record two

Cuda learn record two

dsp help != 裏的 launch function eset show ads

這是一個cuda 自帶的算例,包含cuda 計算的一般流程。

這個地址有比較清楚的cuda的介紹。感謝作者分享(http://blog.csdn.net/hjimce/article/details/51506207)

一般來說,cuda 計算的流程是:

1. 設置顯卡編號:cudaSetDevice; 這個主要是在有多個GPU的機器上使用,其編號是從0號開始。

2. 為顯卡開辟內存變量: cudaMalloc;使用方法:cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));

這裏的指針是指向設備端的內存地址,無法在主機端直接解引用使用。

3.把主機端的數據拷貝到設備端:cudaMemcpy; 使用方法:

cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);

這裏註意需要指明數據傳輸的方向(如 cudaMemcpyHostToDevice、cudaMemcpyDeviceToHost),以及源地址和目的地址。

4. 調用內核函數__global__ 類型函數;

addKernel<<<blocksPerGrid, threadsPerBlock>>> ( )

這裏 blocksPerGrid、threadsPerBlock 都是 dim3 型的數據(也可以直接傳入整數,表示一維配置)。

5. 把計算結果拷貝到主機端。

6. 釋放顯存空間。

  1 #include "cuda_runtime.h"
  2
#include "device_launch_parameters.h" 3 4 #include <stdio.h> 5 6 static void HandleError(cudaError_t err, 7 const char *file, 8 int line) { 9 if (err != cudaSuccess) { 10 printf("%s in %s at line %d\n", cudaGetErrorString(err), 11 file, line);
12 exit(EXIT_FAILURE); 13 } 14 } 15 #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) 16 17 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size); 18 void printCudaInformation(); 19 20 __global__ void addKernel(int *c, const int *a, const int *b) 21 { 22 int i = threadIdx.x; 23 c[i] = a[i] + b[i]; 24 } 25 26 int main() 27 { 28 const int arraySize = 5; 29 const int a[arraySize] = { 1, 2, 3, 4, 5 }; 30 const int b[arraySize] = { 10, 20, 30, 40, 50 }; 31 int c[arraySize] = { 0 }; 32 33 // Add vectors in parallel. 34 HANDLE_ERROR( addWithCuda(c, a, b, arraySize) ); 35 36 printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n", 37 c[0], c[1], c[2], c[3], c[4]); 38 39 // cudaDeviceReset must be called before exiting in order for profiling and 40 // tracing tools such as Nsight and Visual Profiler to show complete traces. 41 HANDLE_ERROR( cudaDeviceReset() ); 42 43 system("pause"); 44 printCudaInformation(); 45 system("pause"); 46 return 0; 47 } 48 49 // Helper function for using CUDA to add vectors in parallel. 50 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size) 51 { 52 int *dev_a = 0; 53 int *dev_b = 0; 54 int *dev_c = 0; 55 cudaError_t cudaStatus=cudaSuccess; 56 57 // Choose which GPU to run on, change this on a multi-GPU system. 58 HANDLE_ERROR(cudaSetDevice(0)); 59 60 // Allocate GPU buffers for three vectors (two input, one output) 61 HANDLE_ERROR(cudaMalloc((void**)&dev_c, size * sizeof(int))); 62 HANDLE_ERROR(cudaMalloc((void**)&dev_a, size * sizeof(int))); 63 HANDLE_ERROR(cudaMalloc((void**)&dev_b, size * sizeof(int))); 64 65 // Copy input vectors from host memory to GPU buffers. 66 HANDLE_ERROR(cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice)); 67 HANDLE_ERROR(cudaMemcpy(dev_b, a, size * sizeof(int), cudaMemcpyHostToDevice)); 68 69 70 // Launch a kernel on the GPU with one thread for each element. 
71 addKernel<<<1, size>>>(dev_c, dev_a, dev_b); 72 73 // Check for any errors launching the kernel 74 HANDLE_ERROR(cudaGetLastError()); 75 76 // cudaDeviceSynchronize waits for the kernel to finish, and returns 77 // any errors encountered during the launch. 78 HANDLE_ERROR(cudaDeviceSynchronize()); 79 80 // Copy output vector from GPU buffer to host memory. 81 HANDLE_ERROR(cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost)); 82 83 return cudaStatus; 84 } 85 86 void printCudaInformation() 87 { 88 int count; 89 cudaGetDeviceCount(&count); 90 printf("count=%d \n", count); 91 cudaDeviceProp myProp; 92 cudaGetDeviceProperties(&myProp, 0); 93 printf(" --- General Information of My Cuda Device ---\n"); 94 printf(" Device name: %s\n", myProp.name); 95 printf(" Computer capatibility : %d.%d\n", myProp.major, myProp.minor); 96 printf(" Clock rate: %d\n", myProp.clockRate); 97 98 printf(" --- Memory Information of My Cuda Device ---\n"); 99 printf(" Total global memory: %ld =%d double \n", myProp.totalGlobalMem, myProp.totalGlobalMem / sizeof(double)); 100 printf(" Total const memory: %ld =%d int \n", myProp.totalConstMem, myProp.totalConstMem / sizeof(int)); 101 printf(" max memoory pitch: %ld \n", myProp.memPitch); 102 103 printf(" --- Multiprocessor Information of My Cuda Device ---\n"); 104 printf(" multprocessor count= %d\n", myProp.multiProcessorCount); 105 printf(" Shared mem per mp=%d\n", myProp.sharedMemPerBlock); 106 printf(" Registers per mp=%d\n", myProp.regsPerBlock); 107 printf(" Thread in wrap=%d\n", myProp.warpSize); 108 printf(" Max thread per block=%d\n", myProp.maxThreadsPerBlock); 109 printf(" Max threads dimensions= (%d, %d, %d) \n", 110 myProp.maxThreadsDim[0], myProp.maxThreadsDim[1], myProp.maxThreadsDim[2]); 111 printf(" Max Grid dimensions= (%d, %d, %d) \n", 112 myProp.maxGridSize[0], myProp.maxGridSize[1], myProp.maxGridSize[2]); 113 printf("\n"); 114 }

Cuda learn record two