[CUDA C] Managing Memory
阿新 • • 發佈:2018-11-24
Host and Device Memory Functions
STANDARD C FUNCTIONS | CUDA C FUNCTIONS |
---|---|
malloc | cudaMalloc |
memcpy | cudaMemcpy |
memset | cudaMemset |
free | cudaFree |
Function Signature
cudaError_t cudaMalloc(void ** devPtr, size_t size);
// may return cudaSuccess, cudaErrorMemoryAllocation, etc.

cudaError_t cudaMemcpy(void * dst, const void * src, size_t size, cudaMemcpyKind kind);
// kind could be one of the following:
//   cudaMemcpyHostToHost
//   cudaMemcpyHostToDevice
//   cudaMemcpyDeviceToHost
//   cudaMemcpyDeviceToDevice

// get a human-readable description of an error code
const char * cudaGetErrorString(cudaError_t error);
Examples
- C Example
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/*
 * Element-wise sum on the host: c[i] = a[i] + b[i] for i in [0, size).
 * Does nothing when size <= 0.
 */
void sumArrayOnHost(float * a, float * b, float * c, const int size) {
    for (int i = 0; i < size; ++i) {
        c[i] = a[i] + b[i];
    }
}

/*
 * Fill ip[0..size) with pseudo-random values in [0.0, 25.5].
 *
 * The generator is seeded from the wall clock on the first call only.
 * Re-seeding on every call would restart the same sequence whenever two
 * calls land in the same second, giving both arrays identical contents.
 */
void initData(float * ip, const int size) {
    static int seeded = 0;
    if (!seeded) {
        srand((unsigned int) time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; ++i) {
        /* keep the low 8 bits (0..255), then scale down by 10 */
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

/* Print ip[0..size) on one line, space separated, followed by a newline. */
void showData(float * ip, const int size) {
    for (int i = 0; i < size; ++i) {
        printf("%.6f ", ip[i]);
    }
    printf("\n");
}

/*
 * Host-only reference: allocate three arrays, fill two with random data,
 * add them element-wise into the third and print all three.
 */
int main() {
    const int n = 16;
    size_t nBytes = n * sizeof(float);

    /* host buffers: two inputs and one output */
    float * h_a = (float *)malloc(nBytes);
    float * h_b = (float *)malloc(nBytes);
    float * h_c = (float *)malloc(nBytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)nBytes);
        /* free(NULL) is a no-op, so releasing all three is safe */
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    /* init data */
    initData(h_a, n);
    initData(h_b, n);

    /* a + b = c */
    sumArrayOnHost(h_a, h_b, h_c, n);

    /* check result */
    showData(h_a, n);
    showData(h_b, n);
    showData(h_c, n);

    /* free memory */
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}
- CUDA Example
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
/*
 * Fill ip[0..size) on the host with pseudo-random values in [0.0, 25.5].
 *
 * The generator is seeded from the wall clock on the first call only.
 * Re-seeding on every call would restart the same sequence whenever two
 * calls land in the same second, so consecutive arrays would be identical.
 */
void initData(float * ip, const int size) {
    static bool seeded = false;
    if (!seeded) {
        srand((unsigned int)time(nullptr));
        seeded = true;
    }
    for (int i = 0; i < size; ++i) {
        // keep the low 8 bits (0..255), then scale down by 10
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
/*
 * Device counterpart of sumArrayOnHost: c[i] = a[i] + b[i].
 *
 * Each thread handles exactly one element, selected by its global 1D index
 * blockIdx.x * blockDim.x + threadIdx.x. For a single-block launch that
 * index equals threadIdx.x.
 *
 * Launch layout: 1D grid of 1D blocks. The kernel receives no element count
 * and therefore performs no bounds check: the total number of launched
 * threads (gridDim.x * blockDim.x) must not exceed the number of elements
 * in a, b and c. All three pointers are dereferenced on the device.
 */
__global__ void sumArrayOnDevice(float * a, float * b, float * c) {
    // widen before multiplying so the index cannot overflow 32 bits
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    c[i] = a[i] + b[i];
}
/*
 * Write the first `size` floats of ip to stdout on a single line, each with
 * six decimals and a trailing space, then end the line.
 * Prints only the newline when size <= 0.
 */
void showData(float * ip, const int size) {
    const float * cursor = ip;
    for (int remaining = size; remaining > 0; --remaining) {
        printf("%.6f ", *cursor);
        ++cursor;
    }
    printf("\n");
}
/* Print a readable message and terminate if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char * what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * CUDA version of the host example: allocate host and device buffers, fill
 * the two inputs on the host, copy them to the device, add them with
 * sumArrayOnDevice, copy the result back and print all three arrays.
 * Every CUDA runtime call is checked; any failure aborts the program.
 */
int main() {
    const int n = 4;
    const size_t nBytes = sizeof(float) * n;

    // host buffers
    float * h_a = (float *)malloc(nBytes);
    float * h_b = (float *)malloc(nBytes);
    float * h_c = (float *)malloc(nBytes);
    if (h_a == nullptr || h_b == nullptr || h_c == nullptr) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)nBytes);
        // free(nullptr) is a no-op, so releasing all three is safe
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // device buffers
    float * d_a = nullptr;
    float * d_b = nullptr;
    float * d_c = nullptr;
    checkCuda(cudaMalloc((void **)&d_a, nBytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void **)&d_b, nBytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void **)&d_c, nBytes), "cudaMalloc(d_c)");

    initData(h_a, n);
    initData(h_b, n);

    // copy the inputs from host to device; d_c is output-only, so it is
    // not copied in this direction
    checkCuda(cudaMemcpy(d_a, h_a, nBytes, cudaMemcpyHostToDevice), "copy h_a -> d_a");
    checkCuda(cudaMemcpy(d_b, h_b, nBytes, cudaMemcpyHostToDevice), "copy h_b -> d_b");

    // summation: one block of n threads, one thread per element
    sumArrayOnDevice<<<1, n>>>(d_a, d_b, d_c);
    // a launch does not return a status; a bad configuration is reported here
    checkCuda(cudaGetLastError(), "sumArrayOnDevice launch");

    // copy the result back; this blocking copy also reports any error
    // raised while the kernel was executing
    checkCuda(cudaMemcpy(h_c, d_c, nBytes, cudaMemcpyDeviceToHost), "copy d_c -> h_c");

    // show data
    showData(h_a, n);
    showData(h_b, n);
    showData(h_c, n);

    free(h_a);
    free(h_b);
    free(h_c);
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    return 0;
}