1. 程式人生 > >[CUDA C] Managing Memory

[CUDA C] Managing Memory

Host and Device Memory Functions

CUDA Host and Device Memory Functions
STANDARD C FUNCTIONS CUDA C FUNCTIONS
malloc cudaMalloc
memcpy cudaMemcpy
memset cudaMemset
free cudaFree

 

 

 

 

 

 

Function Signature

// allocate `size` bytes of device memory; the device address is written to *devPtr
cudaError_t cudaMalloc(void ** devPtr, size_t size);
    // may return cudaSuccess, cudaErrorMemoryAllocation, etc.
// copy `count` bytes from src to dst; src is only read, hence const
cudaError_t cudaMemcpy(void * dst, const void * src, size_t count, cudaMemcpyKind kind);
    // kind can be one of the following:
    // cudaMemcpyHostToHost
    // cudaMemcpyHostToDevice
    // cudaMemcpyDeviceToHost
    // cudaMemcpyDeviceToDevice
// get a human-readable description of an error code
// (the returned string is owned by the runtime, so it is const)
const char* cudaGetErrorString(cudaError_t error);
    

 Examples

 

  • C Example
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

// Element-wise addition on the host: c[k] = a[k] + b[k] for every k in [0, size).
// Nothing is written when size is zero or negative.
void sumArrayOnHost(float * a, float * b, float * c, const int size) {
	int k = 0;
	while (k < size) {
		const float lhs = a[k];
		const float rhs = b[k];
		c[k] = lhs + rhs;
		++k;
	}
}

// Fills ip[0..size) with pseudo-random values: (rand() & 0xFF) / 10, i.e. a
// number between 0.0 and 25.5 in steps of 0.1.
void initData(float * ip, const int size) {
	// Seed the generator only once per process. Re-seeding with time() on
	// every call yields the same seed whenever two calls land in the same
	// second, which made consecutively initialised arrays identical.
	static int seeded = 0;
	if (!seeded) {
		srand((unsigned int) time(NULL));
		seeded = 1;
	}
	for (int i = 0; i < size; ++i) {
		ip[i] = (float)(rand() & 0xFF) / 10.0f;
	}
}

// Prints the first `size` values of ip on a single line, each formatted with
// six decimals and followed by a space, then terminates the line.
void showData(float * ip, const int size) {
	int idx = 0;
	while (idx < size) {
		const float value = ip[idx];
		printf("%.6f ", value);
		++idx;
	}
	printf("\n");
}

// Host-only demo: fills two 16-element arrays with random data, adds them
// element-wise into a third array and prints the two operands and the result.
// Returns EXIT_FAILURE if a host allocation fails, 0 otherwise.
int main() {
	const int n = 16;
	const size_t nBytes = n * sizeof(float);
	// malloc memory (initialised directly, so no C++-only nullptr is needed)
	float * h_a = (float *)malloc(nBytes); // host pointer of a
	float * h_b = (float *)malloc(nBytes);
	float * h_c = (float *)malloc(nBytes);
	if (h_a == NULL || h_b == NULL || h_c == NULL) {
		fprintf(stderr, "failed to allocate %zu bytes of host memory\n", nBytes);
		// free(NULL) is a no-op, so releasing all three is safe
		free(h_a);
		free(h_b);
		free(h_c);
		return EXIT_FAILURE;
	}
	// init data
	initData(h_a, n);
	initData(h_b, n);
	// a + b = c
	sumArrayOnHost(h_a, h_b, h_c, n);
	// check result
	showData(h_a, n);
	showData(h_b, n);
	showData(h_c, n);
	// free memory
	free(h_a);
	free(h_b);
	free(h_c);
	return 0;
}
  • CUDA Example
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  

// Fills ip[0..size) on the host with pseudo-random values: (rand() & 0xFF) / 10,
// i.e. a number between 0.0 and 25.5 in steps of 0.1.
void initData(float * ip, const int size) {
	// Seed exactly once. Calling srand(time()) on every invocation reuses the
	// same seed for calls made within one second, so two arrays initialised
	// back to back ended up with identical contents.
	static int seeded = 0;
	if (!seeded) {
		srand((unsigned int)time(NULL));
		seeded = 1;
	}
	for (int i = 0; i < size; ++i) {
		ip[i] = (float)(rand() & 0xFF) / 10.0f;
	}
}

// modify sumArrayOnHost to sumArrayOnDevice
// Kernel: every thread adds one pair of elements, c[t] = a[t] + b[t], where t
// is the thread's x-index inside its block. Only threadIdx.x is used and there
// is no bounds check, so each array must hold at least blockDim.x elements and
// launching more than one block would just repeat the same indices.
__global__ void sumArrayOnDevice(float * a, float * b, float * c) {
	const unsigned int t = threadIdx.x;
	const float lhs = a[t];
	const float rhs = b[t];
	c[t] = lhs + rhs;
}

// Writes the first `size` floats of the host array ip to stdout on one line:
// each value with six decimals and a trailing space, then a newline.
void showData(float * ip, const int size) {
	for (int pos = 0, left = size; left > 0; ++pos, --left) {
		printf("%.6f ", ip[pos]);
	}
	printf("\n");
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess; `what` names the failing step.
static void checkCuda(cudaError_t status, const char * what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// CUDA demo: fills two 4-element host arrays with random data, copies them to
// the device, adds them with sumArrayOnDevice, copies the result back and
// prints the operands and the sum. Every allocation, copy and the kernel
// launch is checked; the process exits with EXIT_FAILURE on the first error.
int main() {
	const int n = 4;
	const size_t nBytes = sizeof(float) * n;
	// ptr on host
	float * h_a = (float *)malloc(nBytes);
	float * h_b = (float *)malloc(nBytes);
	float * h_c = (float *)malloc(nBytes);
	if (h_a == NULL || h_b == NULL || h_c == NULL) {
		fprintf(stderr, "failed to allocate %zu bytes of host memory\n", nBytes);
		// free(NULL) is a no-op, so releasing all three is safe
		free(h_a);
		free(h_b);
		free(h_c);
		return EXIT_FAILURE;
	}
	// ptr on device
	float * d_a = nullptr;
	float * d_b = nullptr;
	float * d_c = nullptr;
	checkCuda(cudaMalloc((void **)&d_a, nBytes), "cudaMalloc(d_a)");
	checkCuda(cudaMalloc((void **)&d_b, nBytes), "cudaMalloc(d_b)");
	checkCuda(cudaMalloc((void **)&d_c, nBytes), "cudaMalloc(d_c)");
	initData(h_a, n);
	initData(h_b, n);
	// copy data from host to device (d_c is output only, so nothing is uploaded to it)
	checkCuda(cudaMemcpy(d_a, h_a, nBytes, cudaMemcpyHostToDevice), "copy h_a -> d_a");
	checkCuda(cudaMemcpy(d_b, h_b, nBytes, cudaMemcpyHostToDevice), "copy h_b -> d_b");
	// summation: one block of n threads, because the kernel indexes by threadIdx.x only
	sumArrayOnDevice<<<1, n>>>(d_a, d_b, d_c);
	// a launch does not return a status; an invalid configuration shows up here
	checkCuda(cudaGetLastError(), "sumArrayOnDevice launch");
	// copy data back; this blocking copy also reports errors raised while the kernel ran
	checkCuda(cudaMemcpy(h_c, d_c, nBytes, cudaMemcpyDeviceToHost), "copy d_c -> h_c");
	// show data
	showData(h_a, n);
	showData(h_b, n);
	showData(h_c, n);
	free(h_a);
	free(h_b);
	free(h_c);
	checkCuda(cudaFree(d_a), "cudaFree(d_a)");
	checkCuda(cudaFree(d_b), "cudaFree(d_b)");
	checkCuda(cudaFree(d_c), "cudaFree(d_c)");
	return 0;
}