1. 程式人生 > CUDA之向量點積運算

CUDA之向量點積運算

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#define N 10

// Dot-product kernel: computes sum(a[i] * b[i]) for i in [0, N) and stores it in *c.
//
// Launch layout: meant to be launched as Dot<<<1, N>>> (one block, one thread per
// element). Each thread stores one product in a static shared array of N ints;
// thread 0 then adds the products up serially and writes the result.
//
// Preconditions:
//   - blockDim.x must be at least N, otherwise some slots of the shared array are
//     never written before thread 0 reads them.
//   - a and b must be device-accessible and hold at least N ints each.
//   - c must be a device-accessible pointer to a single int.
__global__ void Dot(int *a, int *b, int *c)
{
	__shared__ int temp[N];
	// Only the first N threads own a slot; extra threads (if the block was
	// launched larger than N) must not write past the end of temp[].
	if (threadIdx.x < N)
	{
		temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
	}
	// Barrier is kept outside the branch so every thread of the block reaches it;
	// it makes all products visible before thread 0 reads them.
	__syncthreads();
	if (0 == threadIdx.x)
	{
		int sum = 0;
		// The index must start at 0: an uninitialized loop variable is
		// undefined behaviour and can skip the loop or read out of bounds.
		for (int i = 0; i < N; i++)
		{
			sum += temp[i];
		}
		*c = sum;
		// Device-side printf: debugging aid only.
		printf("sum Calculated on Device: %d\n", *c);
	}
}

// Fill a[0..n-1] with pseudo-random integers produced by rand() % 10.
// Does nothing when n <= 0.
void random_ints(int *a, int n)
{
	int idx = 0;
	while (idx < n)
	{
		a[idx] = rand() % 10;
		++idx;
	}
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Demo driver: builds two random N-element vectors, computes their dot product
// on the device with Dot and on the host with a plain loop, and prints both.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any CUDA
// call fails.
int main()
{
	int *a, *b, *c;  // host buffers
	int *d_a, *d_b, *d_c;  // device buffers
	int size = N * sizeof(int);
	// Allocate device memory for the two inputs and the scalar result.
	checkCuda(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)");
	checkCuda(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)");
	checkCuda(cudaMalloc((void **)&d_c, sizeof(int)), "cudaMalloc(d_c)");
	// Allocate the host buffers and verify them before any of them is written.
	a = (int *)malloc(size);
	b = (int *)malloc(size);
	c = (int *)malloc(sizeof(int));
	if (a == NULL || b == NULL || c == NULL)
	{
		fprintf(stderr, "host allocation failed\n");
		free(a);
		free(b);
		free(c);
		cudaFree(d_a);
		cudaFree(d_b);
		cudaFree(d_c);
		return EXIT_FAILURE;
	}
	// Initialize the inputs (a first, then b, so the rand() sequence is unchanged).
	random_ints(a, N);
	random_ints(b, N);
	// Print the input arrays.
	printf("Array a[N]:\n");
	for (int i = 0; i < N; i++) printf("%d ", a[i]);
	printf("\n");
	printf("Array b[n]:\n");
	for (int i = 0; i < N; i++) printf("%d ", b[i]);
	printf("\n");

	// Copy the host inputs to the device.
	checkCuda(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
	checkCuda(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");
	// Run Dot on the device: one block of N threads.
	Dot<<<1, N>>>(d_a, d_b, d_c);
	// A launch returns no status itself: configuration errors show up in
	// cudaGetLastError(), execution errors at the next synchronizing call.
	checkCuda(cudaGetLastError(), "Dot launch");
	checkCuda(cudaDeviceSynchronize(), "Dot execution");
	// Copy the device result back to the host.
	checkCuda(cudaMemcpy(c, d_c, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");

	// Compute the reference dot product on the host.
	int sumHost = 0;
	for (int i = 0; i < N; i++)
	{
		sumHost += a[i] * b[i];
	}
	// Print both results.
	printf("sum Calculated on Host=%d\n", sumHost);
	printf("Device to Host: a*b=%d\n", *c);
	// Release host buffers.
	free(a);
	free(b);
	free(c);
	// Release device buffers.
	checkCuda(cudaFree(d_a), "cudaFree(d_a)");
	checkCuda(cudaFree(d_b), "cudaFree(d_b)");
	checkCuda(cudaFree(d_c), "cudaFree(d_c)");
	return 0;
}