CUDA 矩陣乘法

使用global memory


  • 錯誤 17 error : no instance of overloaded function “cudaMalloc” matches the argument list E:\Niki\MVDR_BTR\MVDR_BTR\MatrixMulOnDevice.cu 16
    修正:把 cudaMalloc(&Nd, size); 改成 cudaMalloc((void**)&Nd, size);

  • 錯誤 17 error : argument of type “float” is incompatible with parameter of type “void *” E:\Niki\MVDR_BTR\MVDR_BTR\MatrixMulOnDevice.cu 17
    修正:把 float Md, *Nd, *Pd; 改成 float *Md, *Nd, *Pd;(Md 原本被宣告成 float 而不是 float*,所以無法傳給需要指標的參數)


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width);

// Edge length of the square thread block used to tile the output matrix
// (16 x 16 = 256 threads per block, well below the 1024-thread block limit).
static const int kMatrixMulTileWidth = 16;

// Matrix multiplication kernel - per thread code.
//
// Each thread computes one element of Pd = Md * Nd, where all three matrices
// are Width x Width and stored row-major in flat arrays.
//
// Launch layout: a 2D grid of 2D blocks whose combined extent covers at least
// Width threads in both x (column) and y (row). Threads that fall outside the
// matrix exit immediately, so the grid does not have to divide Width evenly.
// A single Width x Width block (the original launch configuration) still
// produces the same result. No shared memory is used.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // Global 2D position of this thread in the output matrix.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the tail of the grid: surplus threads must not touch memory.
    if (row >= Width || col >= Width) {
        return;
    }

    // size_t offsets so that row * Width cannot overflow int for large Width.
    const size_t width  = (size_t)Width;
    const size_t rowOff = (size_t)row * width;

    // Pvalue accumulates the dot product of row `row` of Md
    // and column `col` of Nd.
    float Pvalue = 0.0f;
    for (size_t k = 0; k < width; ++k) {
        Pvalue += Md[rowOff + k] * Nd[k * width + (size_t)col];
    }

    // Each thread writes exactly one element of the result.
    Pd[rowOff + (size_t)col] = Pvalue;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool MatrixMulCudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "MatrixMulOnDevice: %s failed: %s\n",
            what, cudaGetErrorString(status));
    return false;
}

// Computes P = M * N on the GPU using global memory only.
//
// M, N : host pointers to Width x Width row-major input matrices.
// P    : host pointer to a Width x Width row-major output buffer.
// Width: matrix edge length; values <= 0 (or null pointers) are a no-op.
//
// On any CUDA failure a message is written to stderr, device memory is
// released, and P is left unmodified.
void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
{
    if (Width <= 0 || M == NULL || N == NULL || P == NULL) {
        return;
    }

    // size_t avoids the int overflow of Width * Width * sizeof(float).
    const size_t size = (size_t)Width * (size_t)Width * sizeof(float);
    float *Md = NULL, *Nd = NULL, *Pd = NULL;

    // Allocate device buffers and load M and N into device memory.
    // Short-circuit evaluation stops at the first failing call.
    bool ok =
        MatrixMulCudaOk(cudaMalloc((void**)&Md, size), "cudaMalloc(Md)") &&
        MatrixMulCudaOk(cudaMalloc((void**)&Nd, size), "cudaMalloc(Nd)") &&
        MatrixMulCudaOk(cudaMalloc((void**)&Pd, size), "cudaMalloc(Pd)") &&
        MatrixMulCudaOk(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice),
                        "cudaMemcpy(M -> Md)") &&
        MatrixMulCudaOk(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice),
                        "cudaMemcpy(N -> Nd)");

    if (ok) {
        // Setup the execution configuration: tile the output with fixed-size
        // blocks (ceil-div) instead of one Width x Width block, which cannot
        // be launched once Width * Width exceeds the per-block thread limit.
        const unsigned int tiles =
            (unsigned int)((Width + kMatrixMulTileWidth - 1) / kMatrixMulTileWidth);
        dim3 dimBlock(kMatrixMulTileWidth, kMatrixMulTileWidth);
        dim3 dimGrid(tiles, tiles);

        // Launch the device computation threads.
        MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);

        // A launch does not return an error itself; query it explicitly.
        // The blocking cudaMemcpy below then surfaces any execution fault
        // while reading P back from the device.
        ok = MatrixMulCudaOk(cudaGetLastError(), "MatrixMulKernel launch") &&
             MatrixMulCudaOk(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(Pd -> P)");
    }

    // Free device matrices (cudaFree on a null pointer is a no-op).
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
using namespace std;

void MatrixMulOnDevice(float* M, float* N, float* P, int Width);

// Demo driver: builds two 8 x 8 test matrices, multiplies them on the GPU
// via MatrixMulOnDevice, and prints the product one row per line.
int main()
{
    // All matrices are Width x Width, stored row-major in flat arrays.
    const int Width = 8;
    float M[Width * Width];
    float N[Width * Width];
    // Zero-initialized so the printout is well-defined even if the device
    // computation does not fill it.
    float P[Width * Width] = {0.0f};

    // Fill the inputs: M[i][j] = i + j + 1, N[i][j] = i + 1.
    for (int i = 0; i < Width; ++i) {
        for (int j = 0; j < Width; ++j) {
            M[i * Width + j] = (float)(i + j + 1);
            N[i * Width + j] = (float)(i + 1);
        }
    }

    MatrixMulOnDevice(&M[0], &N[0], &P[0], Width);

    // Print the result matrix, one row per output line.
    for (int i = 0; i < Width; ++i) {
        for (int j = 0; j < Width; ++j) {
            printf(" %f \t", P[i * Width + j]);
        }
        printf("\n");
    }

    return 0;
}