1. 程式人生 > >CUDA+OpenCV實現光線追蹤(有無constant)

CUDA+OpenCV實現光線追蹤(有無constant)

常量記憶體是NVIDIA提供的一個64KB大小的記憶體空間,它的處理方式和普通的全域性記憶體和共享記憶體都不一樣,是有cuda專門提供的。

執行緒束的概念:執行緒束是指一個包含32個執行緒的集合,在程式中的每一行,執行緒束中的每個執行緒都將在不同的資料上執行相同的指令。

因此,常量記憶體的作用是,能夠將單次記憶體的讀取操作廣播到每個半執行緒束(即16個執行緒),所以如果在半執行緒束中的每個執行緒都從常量記憶體的相同地址上讀取資料,那麼GPU只會產生一次讀取請求,並將其廣播,顯而易見,這種方式的記憶體流量只是使用全域性記憶體流量的1/16。這是常量記憶體的第一個好處,第二個好處則是由於這塊記憶體的內容是不會發生變化的,因此硬體將主動把這個常量記憶體資料快取到GPU上,這樣第一次從敞亮記憶體的某個地址上讀取後,其他半執行緒束請求同一個地址時,將直接在GPU上命中快取,因此也減少了額外的記憶體流量。

使用常量記憶體只需加上:__constant__修飾符,當從主機記憶體複製記憶體到GPU上常量記憶體時,不用cudaMemcpy()而用cudaMemcpyToSymbol(),這樣就複製到常量記憶體裡了。

無constant:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "H:\cuda_by_example\common\book.h"
#include "H:\cuda_by_example\common\cpu_bitmap.h"
#include "device_functions.h"
#include <stdio.h>
  
#define DIM 1024  
  
#define rnd( x ) (x * rand() / RAND_MAX)  
#define INF     2e10f  
  
//資料結構對球面建模
struct Sphere {  
    float   r,b,g;  
    float   radius;  
    float   x,y,z;  
	//hit方法,計算光線是否與球面相交,若相交則返回光線到命中球面處的距離
    __device__ float hit( float ox, float oy, float *n ) {  
        float dx = ox - x;  
        float dy = oy - y;  
        if (dx*dx + dy*dy < radius*radius) {  
            float dz = sqrtf( radius*radius - dx*dx - dy*dy );  
            *n = dz / sqrtf( radius * radius );  
            return dz + z;  
        }  
        return -INF;  
    }  
};  
#define SPHERES 30  
  
  //核函式內容
__global__ void kernel( Sphere *s, unsigned char *ptr ) {  
    //將threadIdx/BlockIdx對映到畫素位置
    int x = threadIdx.x + blockIdx.x * blockDim.x;  
    int y = threadIdx.y + blockIdx.y * blockDim.y;  
    int offset = x + y * blockDim.x * gridDim.x;  
	//讓影象座標偏移DIM/2,使z軸穿過影象中心
    float   ox = (x - DIM/2);  
    float   oy = (y - DIM/2);  
    //初始化背景顏色為黑色
    float   r=0, g=0, b=0;  
    float   maxz = -INF;  
	//對每一個球面陣列進行迭代
    for(int i=0; i<SPHERES; i++) {  
        float   n;  
        float   t = s[i].hit( ox, oy, &n );  
		//如果比上一次命中距離更接近,我將這個距離儲存為最接近距離,並且儲存球面顏色值
        if (t > maxz) {  
            float fscale = n;  
            r = s[i].r * fscale;  
            g = s[i].g * fscale;  
            b = s[i].b * fscale;  
			//更新距離
            maxz = t;  
        }  
    }   
    //判斷球面相交情況後,將當前顏色儲存到輸出影象中
    ptr[offset*4 + 0] = (int)(r * 255);  
    ptr[offset*4 + 1] = (int)(g * 255);  
    ptr[offset*4 + 2] = (int)(b * 255);  
    ptr[offset*4 + 3] = 255;  
}  
  
  
// globals needed by the update routine  
struct DataBlock {  
    unsigned char   *dev_bitmap;  
    Sphere          *s;  
};  
  
int main( void ) {  
    DataBlock   data;  
    //記錄起始時間
    cudaEvent_t     start, stop;  
    HANDLE_ERROR( cudaEventCreate( &start ) );  
    HANDLE_ERROR( cudaEventCreate( &stop ) );  
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );  
  
    CPUBitmap bitmap( DIM, DIM, &data );  
    unsigned char   *dev_bitmap;  
    Sphere          *s;  
  
  
    // allocate memory on the GPU for the output bitmap  
    HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,  
                              bitmap.image_size() ) );  
    // allocate memory for the Sphere dataset  
    HANDLE_ERROR( cudaMalloc( (void**)&s,  
                              sizeof(Sphere) * SPHERES ) );  
  
    // allocate temp memory, initialize it, copy to  
    // memory on the GPU, then free our temp memory  
	//生成球面的中心座標顏色和半徑
    Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );  
    for (int i=0; i<SPHERES; i++) {  
        temp_s[i].r = rnd( 1.0f );
        temp_s[i].g = rnd( 1.0f );  
        temp_s[i].b = rnd( 1.0f );  
        temp_s[i].x = rnd( 1000.0f ) - 500;  
        temp_s[i].y = rnd( 1000.0f ) - 500;  
        temp_s[i].z = rnd( 1000.0f ) - 500;  
        temp_s[i].radius = rnd( 100.0f ) + 20;  
    }  
    HANDLE_ERROR( cudaMemcpy( s, temp_s,  
                                sizeof(Sphere) * SPHERES,  
                                cudaMemcpyHostToDevice ) );  
    free( temp_s );  
      
    // generate a bitmap from our sphere data  
    dim3    grids(DIM/16,DIM/16);  
    dim3    threads(16,16);  
    kernel<<<grids,threads>>>( s, dev_bitmap );  
  
    // copy our bitmap back from the GPU for display  
    HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,  
                              bitmap.image_size(),  
                              cudaMemcpyDeviceToHost ) );  
  
    // get stop time, and display the timing results  
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );  
    HANDLE_ERROR( cudaEventSynchronize( stop ) );  
    float   elapsedTime;  
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,  
                                        start, stop ) );  
    printf( "Time to generate:  %3.1f ms\n", elapsedTime );  
  
    HANDLE_ERROR( cudaEventDestroy( start ) );  
    HANDLE_ERROR( cudaEventDestroy( stop ) );  
  
    HANDLE_ERROR( cudaFree( dev_bitmap ) );  
    HANDLE_ERROR( cudaFree( s ) );  
  
    // display  
     bitmap.display_and_exit();  
}  

__device__是在裝置內呼叫,global呼叫device,將在所有核中使用hit,如果去掉__device__將出錯,因為hit是在主機內的

有constant:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "H:\cuda_by_example\common\book.h"
#include "H:\cuda_by_example\common\cpu_bitmap.h"
#include "device_functions.h"
#include <stdio.h>
#define DIM 1024  
  
#define rnd( x ) (x * rand() / RAND_MAX)  
#define INF     2e10f  
  
struct Sphere {  
    float   r,b,g;  
    float   radius;  
    float   x,y,z;  
    __device__ float hit( float ox, float oy, float *n ) {  
        float dx = ox - x;  
        float dy = oy - y;  
        if (dx*dx + dy*dy < radius*radius) {  
            float dz = sqrtf( radius*radius - dx*dx - dy*dy );  
            *n = dz / sqrtf( radius * radius );  
            return dz + z;  
        }  
        return -INF;  
    }  
};  
#define SPHERES 30  
__constant__ Sphere s[SPHERES];  
  
__global__ void kernel( unsigned char *ptr ) {  
    // map from threadIdx/BlockIdx to pixel posiytion  
    int x = threadIdx.x + blockIdx.x * blockDim.x;  
    int y = threadIdx.y + blockIdx.y * blockDim.y;  
    int offset = x + y * blockDim.x * gridDim.x;  
    float   ox = (x - DIM/2);  
    float   oy = (y - DIM/2);  
  
    float   r=0, g=0, b=0;  
    float   maxz = -INF;  
    for(int i=0; i<SPHERES; i++) {  
        float   n;  
        float   t = s[i].hit( ox, oy, &n );  
        if (t > maxz) {  
            float fscale = n;  
            r = s[i].r * fscale;  
            g = s[i].g * fscale;  
            b = s[i].b * fscale;  
            maxz = t;  
        }  
    }   
  
    ptr[offset*4 + 0] = (int)(r * 255);  
    ptr[offset*4 + 1] = (int)(g * 255);  
    ptr[offset*4 + 2] = (int)(b * 255);  
    ptr[offset*4 + 3] = 255;  
}  
 
struct DataBlock{
	unsigned char *dev_bitmap;
};
 
int main(){
	DataBlock data;
	// capture the start time and start to record it
	cudaEvent_t start,stop;
	HANDLE_ERROR(cudaEventCreate(&start));
	HANDLE_ERROR(cudaEventCreate(&stop));
	HANDLE_ERROR(cudaEventRecord(start,0));
 
	CPUBitmap bitmap(DIM,DIM,&data);
	unsigned char *dev_bitmap;
	
	//allocate the memory on the GPU for the output bitmap
 
	HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap,bitmap.image_size()));
 
	Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere)*SPHERES); 
	    for (int i=0; i<SPHERES; i++) {  
        temp_s[i].r = rnd( 1.0f );  
        temp_s[i].g = rnd( 1.0f );  
        temp_s[i].b = rnd( 1.0f );  
        temp_s[i].x = rnd( 1000.0f ) - 500;  
        temp_s[i].y = rnd( 1000.0f ) - 500;  
        temp_s[i].z = rnd( 1000.0f ) - 500;  
        temp_s[i].radius = rnd( 100.0f ) + 20;  
    }  
    HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,sizeof(Sphere) * SPHERES) );  
	free(temp_s);
 
	//generate a bitmap from our sphere data
	dim3 grids(DIM/16,DIM/16);
	dim3 threads(16,16);
	kernel<<<grids,threads>>>(dev_bitmap);
 
	//copy the bitmap back from GPU to CPU for display
	HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),cudaMemcpyDeviceToHost));
 
	HANDLE_ERROR(cudaEventRecord(stop,0));//stop the time record
	HANDLE_ERROR(cudaEventSynchronize(stop));
	float elapsedTime;
	HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,start,stop));
	printf( "Time to generate:  %3.1f ms\n", elapsedTime ); 
 
	HANDLE_ERROR(cudaEventDestroy(start));
	HANDLE_ERROR(cudaEventDestroy(stop));
 
	HANDLE_ERROR(cudaFree(dev_bitmap));
 
	bitmap.display_and_exit();
}

執行效果圖,spherenumber=100時