CUDA之向量點積運算
阿新 • • 發佈:2018-12-14
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

// Number of elements in each vector; also the block size used for the launch.
#define N 10

// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string if it did not return cudaSuccess. Without this, a failed
// allocation or copy would go unnoticed and the program would print garbage.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Vector dot product of two N-element integer vectors.
//
// Expected launch: one block with N threads (Dot<<<1, N>>>). Each thread
// with threadIdx.x < N writes one product into the block's shared array;
// thread 0 then sums the N products serially.
//
// Parameters:
//   a, b - device pointers to at least N ints each (inputs).
//   c    - device pointer to a single int that receives the dot product.
//
// Shared memory: N ints, statically allocated.
// Precondition: blockDim.x >= N, otherwise some entries of temp are never
// written before thread 0 reads them.
__global__ void Dot(int *a, int *b, int *c)
{
    __shared__ int temp[N];

    // Guard so a launch with more than N threads cannot write past temp/a/b.
    if (threadIdx.x < N) {
        temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    }

    // Barrier sits outside the branch so every thread in the block reaches it;
    // it makes all products visible before thread 0 reads them.
    __syncthreads();

    if (0 == threadIdx.x) {
        int sum = 0;
        // The loop index must start at 0; the original left it uninitialized,
        // which is undefined behavior.
        for (int i = 0; i < N; i++) {
            sum += temp[i];
        }
        *c = sum;
        printf("sum Calculated on Device: %d\n", *c);
    }
}

// Fills a[0..n-1] with pseudo-random integers in the range [0, 9] using rand().
void random_ints(int *a, int n)
{
    for (int i = 0; i < n; i++) {
        *(a + i) = rand() % 10;
    }
}

// Allocates `bytes` bytes on the host, exiting with a message on failure.
static int *alloc_host_ints(size_t bytes)
{
    int *p = (int *)malloc(bytes);
    if (p == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n",
                (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return p;
}

// Builds two random N-element vectors, computes their dot product on the
// device and on the host, and prints both results for comparison.
int main()
{
    int *a, *b, *c;         // Host buffers
    int *d_a, *d_b, *d_c;   // Device buffers
    int size = N * sizeof(int);

    // Allocate device memory for the two input vectors and the scalar result.
    CUDA_CHECK(cudaMalloc((void **)&d_a, size));
    CUDA_CHECK(cudaMalloc((void **)&d_b, size));
    CUDA_CHECK(cudaMalloc((void **)&d_c, sizeof(int)));

    // Allocate and initialize the host arrays.
    a = alloc_host_ints(size);
    random_ints(a, N);
    b = alloc_host_ints(size);
    random_ints(b, N);
    c = alloc_host_ints(sizeof(int));

    // Print the input arrays.
    printf("Array a[N]:\n");
    for (int i = 0; i < N; i++)
        printf("%d ", a[i]);
    printf("\n");
    printf("Array b[n]:\n");
    for (int i = 0; i < N; i++)
        printf("%d ", b[i]);
    printf("\n");

    // Copy the inputs from host to device.
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // Run the kernel: one block of N threads.
    Dot<<<1, N>>>(d_a, d_b, d_c);
    // A kernel launch returns no status itself; launch-configuration errors
    // are reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Copy the result back. This blocking copy also waits for the kernel to
    // finish, so execution errors surface here.
    CUDA_CHECK(cudaMemcpy(c, d_c, sizeof(int), cudaMemcpyDeviceToHost));

    // Compute the reference dot product on the host.
    int sumHost = 0;
    for (int i = 0; i < N; i++) {
        sumHost += a[i] * b[i];
    }

    // Print both results.
    printf("sum Calculated on Host=%d\n", sumHost);
    printf("Device to Host: a*b=%d\n", *c);

    // Release host memory.
    free(a);
    free(b);
    free(c);

    // Release device memory.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}