1. 程式人生 > >CUDA之矩陣乘法——globalmemory

CUDA之矩陣乘法——global memory

CUDA 矩陣乘法

使用global memory

報錯

  • 錯誤 17 error : no instance of overloaded function “cudaMalloc” matches the argument list E:\Niki\MVDR_BTR\MVDR_BTR\MatrixMulOnDevice.cu 16
    修正:把cudaMalloc(&Nd, size);改成cudaMalloc((void**)&Nd, size);

  • 錯誤 17 error : argument of type “float” is incompatible with parameter of type “void *” E:\Niki\MVDR_BTR\MVDR_BTR\MatrixMulOnDevice.cu 17
    修正:把float Md, *Nd, *Pd; 改成float *Md, *Nd, *Pd;

編譯通過的程式碼


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width);

// Edge length, in threads, of the square thread blocks used by
// MatrixMulOnDevice (16 x 16 = 256 threads per block).
static const int kMatrixMulTile = 16;

// Reports a failed CUDA runtime call on stderr.
//   status : return code of the call being checked
//   what   : short label naming the call, used only in the message
// Returns true when status is cudaSuccess, false otherwise.
static bool MatrixMulCheck(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "MatrixMulOnDevice: %s failed: %s\n",
            what, cudaGetErrorString(status));
    return false;
}

// Computes P = M * N on the GPU using global memory only.
//   M, N  : host input matrices, Width x Width floats, row-major
//   P     : host output matrix, Width x Width floats, row-major
//   Width : number of rows/columns of each matrix
// Does nothing when Width <= 0 or any pointer is NULL. If a CUDA call fails,
// a message is written to stderr, the device buffers are released and P is
// left untouched.
void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
{
    if (Width <= 0 || M == NULL || N == NULL || P == NULL) {
        return;
    }

    // size_t arithmetic: Width * Width * sizeof(float) can exceed INT_MAX.
    const size_t size = (size_t)Width * (size_t)Width * sizeof(float);

    float* Md = NULL;
    float* Nd = NULL;
    float* Pd = NULL;

    // Allocate the device buffers and load M and N. The && chain stops at the
    // first failing call, so later steps never run on a bad buffer.
    bool ok =
        MatrixMulCheck(cudaMalloc((void**)&Md, size), "cudaMalloc(Md)") &&
        MatrixMulCheck(cudaMalloc((void**)&Nd, size), "cudaMalloc(Nd)") &&
        MatrixMulCheck(cudaMalloc((void**)&Pd, size), "cudaMalloc(Pd)") &&
        MatrixMulCheck(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice),
                       "cudaMemcpy(M -> Md)") &&
        MatrixMulCheck(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice),
                       "cudaMemcpy(N -> Nd)");

    if (ok) {
        // A single Width x Width block only works while Width * Width stays
        // within the per-block thread limit, so cover the matrix with a grid
        // of fixed-size tiles instead (ceil-div; the kernel guards the edge).
        const unsigned int blocksPerSide =
            (unsigned int)((Width + kMatrixMulTile - 1) / kMatrixMulTile);
        dim3 dimBlock(kMatrixMulTile, kMatrixMulTile);
        dim3 dimGrid(blocksPerSide, blocksPerSide);

        MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);

        // Launch-configuration errors are only visible via cudaGetLastError();
        // the blocking copy that follows reports kernel execution errors.
        ok = MatrixMulCheck(cudaGetLastError(), "MatrixMulKernel launch") &&
             MatrixMulCheck(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost),
                            "cudaMemcpy(Pd -> P)");
    }

    // Release device matrices on every path; cudaFree(NULL) performs no operation.
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}

// Matrix multiplication kernel: each thread computes one element of Pd.
//   Md, Nd : device input matrices, Width x Width floats, row-major
//   Pd     : device output matrix, Width x Width floats, row-major
//   Width  : number of rows/columns of each matrix
// Launch layout: 2D blocks in a 2D grid; x indexes columns, y indexes rows.
// Any grid that covers at least Width x Width threads is valid (including a
// single Width x Width block); threads outside the matrix exit immediately.
// Uses no shared memory.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // Global 2D coordinates of the output element owned by this thread.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid rarely divides the matrix evenly; skip the padding threads.
    if (row >= Width || col >= Width) {
        return;
    }

    // size_t offsets so row * Width cannot overflow int for large matrices.
    const size_t w = (size_t)Width;
    const size_t rowBase = (size_t)row * w;

    // Dot product of row `row` of Md with column `col` of Nd.
    float Pvalue = 0.0f;
    for (size_t k = 0; k < w; ++k) {
        Pvalue += Md[rowBase + k] * Nd[k * w + (size_t)col];
    }

    // Each thread writes exactly one element of the result.
    Pd[rowBase + (size_t)col] = Pvalue;
}

測試

新增主函式程式碼:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
using namespace std;

void MatrixMulOnDevice(float* M, float* N, float* P, int Width);

// Test driver: fills two Width x Width matrices with known values, multiplies
// them with MatrixMulOnDevice and prints the product one row per line.
int main()
{
    // Matrices are stored row-major in flat arrays whose extent is derived
    // from Width, so the two can never disagree.
    const int Width = 8;
    float M[Width * Width];
    float N[Width * Width];
    // Zero-initialised so the printout below never reads indeterminate
    // values, even if the device call does not write P.
    float P[Width * Width] = {0.0f};

    for (int row = 0; row < Width; ++row)
    {
        for (int col = 0; col < Width; ++col)
        {
            M[row * Width + col] = (float)(row + col + 1);
            N[row * Width + col] = (float)(row + 1);
        }
    }

    MatrixMulOnDevice(M, N, P, Width);

    printf("矩陣相乘結果為:\n");
    for (int row = 0; row < Width; ++row)
    {
        for (int col = 0; col < Width; ++col)
        {
            printf(" %f \t", P[row * Width + col]);
        }
        printf("\n");
    }

    return 0;
}