
《CUDA By Example》【Chapter 09】Atomics

9.1 Overview

  • Understand the compute capabilities of different NVIDIA GPUs
  • Understand what atomic operations are and why they are needed
  • Learn how to perform computations with atomic operations in CUDA C kernels

9.2 Compute Capability

Just as CPUs with different architectures offer different features and instruction sets (such as MMX, SSE, and SSE2), the same is true of the GPUs that CUDA supports. NVIDIA refers to the collection of features a GPU supports as its compute capability.

9.2.1 NVIDIA GPU Compute Capabilities

Compute capabilities include 1.0, 1.1, 1.2, 1.3, and 2.0, and a higher compute capability is a superset of a lower one.
This chapter is concerned with the hardware's ability to perform atomic operations on memory. Starting with compute capability 1.2, both shared memory atomics and global memory atomics are supported.
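
For reference, the compute capability of an installed GPU can be queried at runtime with cudaGetDeviceProperties(). The following host-side sketch is not part of the book's example code; it simply prints the major.minor version of device 0:

#include <stdio.h>

int main( void ) {
    cudaDeviceProp  prop;
    cudaGetDeviceProperties( &prop, 0 );    // query device 0
    // shared memory and global memory atomics both require
    // compute capability 1.2 or higher (global memory atomics
    // alone were introduced in 1.1)
    printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
    return 0;
}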

9.2.2 Compiling for a Minimum Compute Capability

Tell the compiler that the code requires a given compute capability (for example, 1.2 or 1.1) or higher:

nvcc -arch=sm_12
nvcc -arch=sm_11
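
For example, assuming the histogram program is saved in a file named hist_gpu.cu (the file name here is only an illustration), the full command line might look like this:

nvcc -arch=sm_12 hist_gpu.cu -o hist_gpu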

9.3 Introduction to Atomic Operations
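
An atomic operation performs a read-modify-write sequence on a memory location that cannot be interrupted by other threads. A plain increment such as x++ is really three separate steps (read, add, write), so when many threads update the same location concurrently, some updates can be lost. A minimal sketch of the idea, using an illustrative kernel that is not taken from the book's source:

__global__ void count_kernel( unsigned int *counter ) {
    // counter[0]++ would be three steps (read, add, write) and
    // concurrent threads could overwrite each other's updates;
    // atomicAdd performs the read-modify-write as one
    // uninterruptible operation
    atomicAdd( &counter[0], 1 );
}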

9.4 Computing Histograms

9.4.1 Computing a Histogram on the CPU

hist_cpu.cu


#include "../common/book.h"

#define SIZE    (100*1024*1024)

int main( void ) {
    unsigned char *buffer =
                     (unsigned char*)big_random_block( SIZE );

    // capture the start time
    clock_t start, stop;
    start = clock();

    unsigned int histo[256];
    for (int i=0; i<256; i++)
        histo[i] = 0;

    for (int i=0; i<SIZE; i++)
        histo[buffer[i]]++;

    stop = clock();
    float   elapsedTime = (float)(stop - start) /
                          (float)CLOCKS_PER_SEC * 1000.0f;
    printf( "Time to generate: %3.1f ms\n", elapsedTime );

    long histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum: %ld\n", histoCount );

    free( buffer );
    return 0;
}

9.4.2 Computing Histograms on the GPU

Using only global memory atomic operations can actually hurt performance. The kernel performs very little computation, so the slowdown is almost certainly caused by the atomic operations on global memory: when thousands of threads try to access a handful of memory locations, heavy contention results, and to guarantee that each increment is atomic, the hardware serializes all operations on the same memory location.

#include "../common/book.h"

#define SIZE    (100*1024*1024)


__global__ void histo_kernel( unsigned char *buffer,
                              long size,
                              unsigned int *histo ) {

    // clear out the accumulation buffer called temp
    // since we are launched with 256 threads, it is easy
    // to clear that memory with one write per thread
    __shared__  unsigned int temp[256];
    temp[threadIdx.x] = 0;
    __syncthreads();

    // calculate the starting index and the offset to the next
    // block that each thread will be processing
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (i < size) {
        atomicAdd( &temp[buffer[i]], 1 );
        i += stride;
    }
    // sync the data from the above writes to shared memory
    // then add the shared memory values to the values from
    // the other thread blocks using global memory
    // atomic adds
    // same as before, since we have 256 threads, updating the
    // global histogram is just one write per thread!
    __syncthreads();
    atomicAdd( &(histo[threadIdx.x]), temp[threadIdx.x] );
}

int main( void ) {
    unsigned char *buffer =
                     (unsigned char*)big_random_block( SIZE );

    // capture the start time
    // starting the timer here so that we include the cost of
    // all of the operations on the GPU.  if the data were
    // already on the GPU and we just timed the kernel
    // the timing would drop from 74 ms to 15 ms.  Very fast.
    cudaEvent_t     start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // allocate memory on the GPU for the file's data
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
    HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                              cudaMemcpyHostToDevice ) );

    HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
                              256 * sizeof( int ) ) );
    HANDLE_ERROR( cudaMemset( dev_histo, 0,
                              256 * sizeof( int ) ) );

    // kernel launch - 2x the number of mps gave best timing
    cudaDeviceProp  prop;
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks*2,256>>>( dev_buffer,
                                    SIZE, dev_histo );

    unsigned int    histo[256];
    HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
                              256 * sizeof( int ),
                              cudaMemcpyDeviceToHost ) );

    // get stop time, and display the timing results
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    float   elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time to generate:  %3.1f ms\n", elapsedTime );

    long histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // verify that we have the same counts via CPU
    for (int i=0; i<SIZE; i++)
        histo[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0)
            printf( "Failure at %d!\n", i );
    }

    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    cudaFree( dev_histo );
    cudaFree( dev_buffer );
    free( buffer );
    return 0;
}

Using shared memory atomics together with global memory atomics. The performance problem in the code above comes from the atomic operations, and interestingly the fix is to perform even more atomic operations. The version below introduces shared memory: each block first accumulates a partial histogram in shared memory, then merges it into the global histogram. This performs much better than using global memory atomics alone.

#include "../common/book.h"

#define SIZE    (100*1024*1024)


__global__ void histo_kernel( unsigned char *buffer,
                              long size,
                              unsigned int *histo ) {
    // calculate the starting index and the offset to the next
    // block that each thread will be processing
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (i < size) {
        atomicAdd( &histo[buffer[i]], 1 );
        i += stride;
    }
}

int main( void ) {
    unsigned char *buffer =
                     (unsigned char*)big_random_block( SIZE );

    // capture the start time
    // starting the timer here so that we include the cost of
    // all of the operations on the GPU.
    cudaEvent_t     start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // allocate memory on the GPU for the file's data
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
    HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
                              cudaMemcpyHostToDevice ) );

    HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
                              256 * sizeof( int ) ) );
    HANDLE_ERROR( cudaMemset( dev_histo, 0,
                              256 * sizeof( int ) ) );

    // kernel launch - 2x the number of mps gave best timing
    cudaDeviceProp  prop;
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks*2,256>>>( dev_buffer, SIZE, dev_histo );

    unsigned int    histo[256];
    HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
                              256 * sizeof( int ),
                              cudaMemcpyDeviceToHost ) );

    // get stop time, and display the timing results
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    float   elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time to generate:  %3.1f ms\n", elapsedTime );

    long histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // verify that we have the same counts via CPU
    for (int i=0; i<SIZE; i++)
        histo[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0)
            printf( "Failure at %d!  Off by %d\n", i, histo[i] );
    }

    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    cudaFree( dev_histo );
    cudaFree( dev_buffer );
    free( buffer );
    return 0;
}

9.5 Summary

Sometimes relying on atomic operations causes performance problems that can only be solved by partially restructuring the algorithm. For the histogram, a two-phase algorithm was used to reduce the degree of contention on global memory accesses. In general, strategies that reduce memory contention in this way tend to pay off.