CUDA之矩陣乘法——globalmemory
阿新 • • 發佈:2019-02-13
CUDA 矩陣乘法
使用global memory
報錯
錯誤 17 error : no instance of overloaded function “cudaMalloc” matches the argument list E:\Niki\MVDR_BTR\MVDR_BTR\MatrixMulOnDevice.cu 16
修正:把cudaMalloc(&Nd, size);改成cudaMalloc((void**)&Nd, size);
錯誤 17 error : argument of type “float” is incompatible with parameter of type “void *” E:\Niki\MVDR_BTR\MVDR_BTR\MatrixMulOnDevice.cu 17
修正:把float Md, *Nd, *Pd; 改成float *Md, *Nd, *Pd;(Md 前面少了 *,因此被宣告成 float 而不是 float 指標)
編譯通過的程式碼
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width);
// Matrix multiplication kernel: computes P = M * N for square, row-major
// Width x Width matrices stored in device memory.
//
// Launch layout: a 2D grid of 2D blocks that together cover at least
// Width x Width threads. Each thread computes one element of Pd; threads
// that fall outside the matrix (when Width is not a multiple of the block
// edge) exit without touching memory. No shared memory is used.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
// Global 2D coordinates of the output element owned by this thread
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y * blockDim.y + threadIdx.y;
// The grid is rounded up to whole blocks, so guard the ragged edge
if (row >= Width || col >= Width)
{
return;
}
// Use size_t offsets so row * Width cannot overflow int for large matrices
const size_t width = (size_t)Width;
const size_t rowBase = (size_t)row * width;
// Pvalue accumulates the dot product of row `row` of M and column `col` of N
float Pvalue = 0.0f;
for (size_t k = 0; k < width; ++k)
{
Pvalue += Md[rowBase + k] * Nd[k * width + (size_t)col];
}
// Each thread writes exactly one element of the result
Pd[rowBase + (size_t)col] = Pvalue;
}

// Edge length (in threads) of the square thread block used for the launch.
// 16 x 16 = 256 threads, well inside the per-block thread limit.
static const unsigned int kTileWidth = 16;

// Prints a diagnostic on stderr when `err` is not cudaSuccess.
// Returns true if the call failed, false otherwise.
static bool reportCudaError(cudaError_t err, const char* what)
{
if (err == cudaSuccess)
{
return false;
}
fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
return true;
}

// Computes P = M * N on the GPU.
//
// M, N : host pointers to row-major Width x Width input matrices.
// P    : host pointer to a Width x Width output buffer; written only when
//        every CUDA call succeeds.
// Width: matrix edge length; must be positive.
//
// On any failure a message is printed on stderr, all device buffers that
// were allocated are released, and the function returns without modifying P.
void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
{
if (M == NULL || N == NULL || P == NULL || Width <= 0)
{
fprintf(stderr, "MatrixMulOnDevice: invalid argument\n");
return;
}
// Byte count in size_t: Width * Width * 4 overflows int once Width > 23170
const size_t size = (size_t)Width * (size_t)Width * sizeof(float);
float *Md = NULL, *Nd = NULL, *Pd = NULL;
// Allocate device buffers and upload M and N; stop at the first failure
bool ok =
!reportCudaError(cudaMalloc((void**)&Md, size), "cudaMalloc(Md)") &&
!reportCudaError(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice), "cudaMemcpy(Md <- M)") &&
!reportCudaError(cudaMalloc((void**)&Nd, size), "cudaMalloc(Nd)") &&
!reportCudaError(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice), "cudaMemcpy(Nd <- N)") &&
!reportCudaError(cudaMalloc((void**)&Pd, size), "cudaMalloc(Pd)");
if (ok)
{
// Tile the output with fixed-size blocks; ceil-div so partial tiles at
// the right/bottom edges are still covered (the kernel guards them).
const unsigned int tiles = ((unsigned int)Width + kTileWidth - 1) / kTileWidth;
dim3 dimBlock(kTileWidth, kTileWidth);
dim3 dimGrid(tiles, tiles);
MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);
// A launch does not return a status; configuration errors show up here,
// execution errors at the blocking copy below.
ok = !reportCudaError(cudaGetLastError(), "MatrixMulKernel launch") &&
!reportCudaError(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost), "cudaMemcpy(P <- Pd)");
}
// Free device matrices; cudaFree on a null pointer performs no operation,
// so this is safe even when an allocation above failed.
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
}
測試
新增主函式程式碼:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
using namespace std;
void MatrixMulOnDevice(float* M, float* N, float* P, int Width);
// Test driver: fills two 8 x 8 matrices, multiplies them with
// MatrixMulOnDevice, and prints the resulting matrix row by row.
int main()
{
// Single source of truth for the matrix edge length; every buffer is sized
// from it so changing Width cannot overrun the arrays.
const int Width = 8;
float M[Width * Width];
float N[Width * Width];
// Zero-initialised so the printout is well-defined even if the device
// computation does not write a result.
float P[Width * Width] = {0.0f};
int i,j;
for(i=0;i<Width;i++)//row
{
for(j=0;j<Width;j++)//column
{
M[i*Width+j]= (float)(i+j+1);
N[i*Width+j]= (float)(i+1);
}
}
MatrixMulOnDevice(&M[0],&N[0], &P[0], Width);
printf("矩陣相乘結果為:\n");
for(i=0;i<Width;i++)//row
{
for(j=0;j<Width;j++)//column
{
printf(" %f \t", P[i*Width+j]);
}
printf("\n");
}
return 0;
}