CUDA+OpenCV實現光線追蹤(有無constant)
阿新 • • 發佈:2018-12-08
常量記憶體是NVIDIA提供的一個64KB大小的記憶體空間,它的處理方式和普通的全域性記憶體和共享記憶體都不一樣,是有cuda專門提供的。
執行緒束的概念:執行緒束是指一個包含32個執行緒的集合,在程式中的每一行,執行緒束中的每個執行緒都將在不同的資料上執行相同的指令。
因此,常量記憶體的作用是,能夠將單次記憶體的讀取操作廣播到每個半執行緒束(即16個執行緒),所以如果在半執行緒束中的每個執行緒都從常量記憶體的相同地址上讀取資料,那麼GPU只會產生一次讀取請求,並將其廣播,顯而易見,這種方式的記憶體流量只是使用全域性記憶體流量的1/16。這是常量記憶體的第一個好處,第二個好處則是由於這塊記憶體的內容是不會發生變化的,因此硬體將主動把這個常量記憶體資料快取到GPU上,這樣第一次從敞亮記憶體的某個地址上讀取後,其他半執行緒束請求同一個地址時,將直接在GPU上命中快取,因此也減少了額外的記憶體流量。
使用常量記憶體只需加上:__constant__修飾符,當從主機記憶體複製記憶體到GPU上常量記憶體時,不用cudaMemcpy()而用cudaMemcpyToSymbol(),這樣就複製到常量記憶體裡了。
無constant:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include "H:\cuda_by_example\common\book.h" #include "H:\cuda_by_example\common\cpu_bitmap.h" #include "device_functions.h" #include <stdio.h> #define DIM 1024 #define rnd( x ) (x * rand() / RAND_MAX) #define INF 2e10f //資料結構對球面建模 struct Sphere { float r,b,g; float radius; float x,y,z; //hit方法,計算光線是否與球面相交,若相交則返回光線到命中球面處的距離 __device__ float hit( float ox, float oy, float *n ) { float dx = ox - x; float dy = oy - y; if (dx*dx + dy*dy < radius*radius) { float dz = sqrtf( radius*radius - dx*dx - dy*dy ); *n = dz / sqrtf( radius * radius ); return dz + z; } return -INF; } }; #define SPHERES 30 //核函式內容 __global__ void kernel( Sphere *s, unsigned char *ptr ) { //將threadIdx/BlockIdx對映到畫素位置 int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; //讓影象座標偏移DIM/2,使z軸穿過影象中心 float ox = (x - DIM/2); float oy = (y - DIM/2); //初始化背景顏色為黑色 float r=0, g=0, b=0; float maxz = -INF; //對每一個球面陣列進行迭代 for(int i=0; i<SPHERES; i++) { float n; float t = s[i].hit( ox, oy, &n ); //如果比上一次命中距離更接近,我將這個距離儲存為最接近距離,並且儲存球面顏色值 if (t > maxz) { float fscale = n; r = s[i].r * fscale; g = s[i].g * fscale; b = s[i].b * fscale; //更新距離 maxz = t; } } //判斷球面相交情況後,將當前顏色儲存到輸出影象中 ptr[offset*4 + 0] = (int)(r * 255); ptr[offset*4 + 1] = (int)(g * 255); ptr[offset*4 + 2] = (int)(b * 255); ptr[offset*4 + 3] = 255; } // globals needed by the update routine struct DataBlock { unsigned char *dev_bitmap; Sphere *s; }; int main( void ) { DataBlock data; //記錄起始時間 cudaEvent_t start, stop; HANDLE_ERROR( cudaEventCreate( &start ) ); HANDLE_ERROR( cudaEventCreate( &stop ) ); HANDLE_ERROR( cudaEventRecord( start, 0 ) ); CPUBitmap bitmap( DIM, DIM, &data ); unsigned char *dev_bitmap; Sphere *s; // allocate memory on the GPU for the output bitmap HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap, bitmap.image_size() ) ); // allocate memory for the Sphere dataset HANDLE_ERROR( cudaMalloc( (void**)&s, sizeof(Sphere) * SPHERES ) ); // allocate temp memory, initialize it, copy to // memory on the GPU, then free our temp memory //生成球面的中心座標顏色和半徑 Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES ); for (int i=0; i<SPHERES; i++) { temp_s[i].r = rnd( 1.0f ); temp_s[i].g = rnd( 1.0f ); temp_s[i].b = rnd( 1.0f ); temp_s[i].x = rnd( 1000.0f ) - 500; temp_s[i].y = rnd( 1000.0f ) - 500; temp_s[i].z = rnd( 1000.0f ) - 500; temp_s[i].radius = rnd( 100.0f ) + 20; } HANDLE_ERROR( cudaMemcpy( s, temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyHostToDevice ) ); free( temp_s ); // generate a bitmap from our sphere data dim3 grids(DIM/16,DIM/16); dim3 threads(16,16); kernel<<<grids,threads>>>( s, dev_bitmap ); // copy our bitmap back from the GPU for display HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost ) ); // get stop time, and display the timing results HANDLE_ERROR( cudaEventRecord( stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( stop ) ); float elapsedTime; HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) ); printf( "Time to generate: %3.1f ms\n", elapsedTime ); HANDLE_ERROR( cudaEventDestroy( start ) ); HANDLE_ERROR( cudaEventDestroy( stop ) ); HANDLE_ERROR( cudaFree( dev_bitmap ) ); HANDLE_ERROR( cudaFree( s ) ); // display bitmap.display_and_exit(); }
__device__是在裝置內呼叫,global呼叫device,將在所有核中使用hit,如果去掉__device__將出錯,因為hit是在主機內的
有constant:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include "H:\cuda_by_example\common\book.h" #include "H:\cuda_by_example\common\cpu_bitmap.h" #include "device_functions.h" #include <stdio.h> #define DIM 1024 #define rnd( x ) (x * rand() / RAND_MAX) #define INF 2e10f struct Sphere { float r,b,g; float radius; float x,y,z; __device__ float hit( float ox, float oy, float *n ) { float dx = ox - x; float dy = oy - y; if (dx*dx + dy*dy < radius*radius) { float dz = sqrtf( radius*radius - dx*dx - dy*dy ); *n = dz / sqrtf( radius * radius ); return dz + z; } return -INF; } }; #define SPHERES 30 __constant__ Sphere s[SPHERES]; __global__ void kernel( unsigned char *ptr ) { // map from threadIdx/BlockIdx to pixel posiytion int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float ox = (x - DIM/2); float oy = (y - DIM/2); float r=0, g=0, b=0; float maxz = -INF; for(int i=0; i<SPHERES; i++) { float n; float t = s[i].hit( ox, oy, &n ); if (t > maxz) { float fscale = n; r = s[i].r * fscale; g = s[i].g * fscale; b = s[i].b * fscale; maxz = t; } } ptr[offset*4 + 0] = (int)(r * 255); ptr[offset*4 + 1] = (int)(g * 255); ptr[offset*4 + 2] = (int)(b * 255); ptr[offset*4 + 3] = 255; } struct DataBlock{ unsigned char *dev_bitmap; }; int main(){ DataBlock data; // capture the start time and start to record it cudaEvent_t start,stop; HANDLE_ERROR(cudaEventCreate(&start)); HANDLE_ERROR(cudaEventCreate(&stop)); HANDLE_ERROR(cudaEventRecord(start,0)); CPUBitmap bitmap(DIM,DIM,&data); unsigned char *dev_bitmap; //allocate the memory on the GPU for the output bitmap HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap,bitmap.image_size())); Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere)*SPHERES); for (int i=0; i<SPHERES; i++) { temp_s[i].r = rnd( 1.0f ); temp_s[i].g = rnd( 1.0f ); temp_s[i].b = rnd( 1.0f ); temp_s[i].x = rnd( 1000.0f ) - 500; temp_s[i].y = rnd( 1000.0f ) - 500; temp_s[i].z = rnd( 1000.0f ) - 500; temp_s[i].radius = rnd( 100.0f ) + 20; } HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,sizeof(Sphere) * SPHERES) ); free(temp_s); //generate a bitmap from our sphere data dim3 grids(DIM/16,DIM/16); dim3 threads(16,16); kernel<<<grids,threads>>>(dev_bitmap); //copy the bitmap back from GPU to CPU for display HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),cudaMemcpyDeviceToHost)); HANDLE_ERROR(cudaEventRecord(stop,0));//stop the time record HANDLE_ERROR(cudaEventSynchronize(stop)); float elapsedTime; HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,start,stop)); printf( "Time to generate: %3.1f ms\n", elapsedTime ); HANDLE_ERROR(cudaEventDestroy(start)); HANDLE_ERROR(cudaEventDestroy(stop)); HANDLE_ERROR(cudaFree(dev_bitmap)); bitmap.display_and_exit(); }
執行效果圖,spherenumber=100時