1. 程式人生 > 其它 >cuda共享記憶體與原子操作,實現計算字元直方圖

cuda共享記憶體與原子操作,實現計算字元直方圖

寫在前面

這個程式再《CUDA by Example》所給出的程式程式碼是有問題的,原先的例子程式碼將所有的資料加到直方圖陣列的第一個元素上,其他數字為零,至少我的是這樣的,所以我就自己寫了一個,並且優化了使用cpu初始化原始資料的效率(STL的多執行緒)。

程式碼在這裡

#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<cstdio>
#include<iostream>
#include<thread>
#include<vector>
// Number of input bytes to histogram (100 MiB).
static const int SIZE_DATA = 100 * 1024 * 1024;

// Number of histogram bins: one per possible unsigned char value.
constexpr unsigned int HISTO_BINS = 256;

// Evaluates a CUDA runtime call and terminates with a readable message on failure.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_err_ = (call);                                 \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cuda_check_err_));                     \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Allocates `size` bytes with malloc() and fills them with pseudo-random values
// using several host threads in parallel.
//
// Returns the buffer (the caller releases it with free()), or nullptr when
// `size` is not positive or the allocation fails.
unsigned char * h_getrndarray(int size)
{
    if (size <= 0)
        return nullptr;

    const size_t total = static_cast<size_t>(size);
    unsigned char *buffer = static_cast<unsigned char*>(malloc(total));
    if (buffer == nullptr)
        return nullptr;

    // hardware_concurrency() is allowed to return 0 when the value is unknown;
    // fall back to a single worker instead of dividing by zero.
    size_t worker_count = std::thread::hardware_concurrency();
    if (worker_count == 0)
        worker_count = 1;
    if (worker_count > total)
        worker_count = total;
    const size_t chunk = total / worker_count;

    // Each worker owns its own engine: rand() keeps hidden global state and is
    // not safe to call concurrently from several threads.
    auto fill_range = [](unsigned char *data, size_t begin, size_t end, unsigned int seed) {
        std::mt19937 engine(seed);
        for (size_t i = begin; i < end; ++i)
            data[i] = static_cast<unsigned char>(engine() & 0xFFu);
    };

    std::vector<std::thread> workers;
    workers.reserve(worker_count);
    for (size_t t = 0; t < worker_count; ++t) {
        const size_t begin = t * chunk;
        // The last worker also takes the remainder so that no byte is left
        // uninitialized when `size` is not a multiple of the worker count.
        const size_t end = (t + 1 == worker_count) ? total : begin + chunk;
        workers.emplace_back(fill_range, buffer, begin, end,
                             12345u + static_cast<unsigned int>(t));
    }
    for (auto &worker : workers)
        worker.join();
    return buffer;
}

// Accumulates a 256-bin histogram of data[0 .. size) into d_histo.
//
// Launch layout: 1D grid of 1D blocks; any grid size and any block size work.
// Shared memory: HISTO_BINS unsigned ints (static), used as a per-block
// histogram so that most atomics hit shared rather than global memory.
// Preconditions: d_histo points to HISTO_BINS device counters that were zeroed
// before the launch; each block must see fewer than 2^32 elements, since the
// per-block counters are 32-bit.
__global__ void histo_kernel(unsigned char* data, size_t*d_histo, long size)
{
    // atomicAdd has no size_t overload on platforms where size_t is
    // `unsigned long`; the counters are accessed as unsigned long long instead.
    static_assert(sizeof(size_t) == sizeof(unsigned long long),
                  "histo_kernel requires 64-bit size_t counters");

    __shared__ unsigned int temp[HISTO_BINS];

    // Strided loops keep the kernel correct when blockDim.x != HISTO_BINS.
    for (unsigned int bin = threadIdx.x; bin < HISTO_BINS; bin += blockDim.x)
        temp[bin] = 0;
    __syncthreads();

    const size_t count = size > 0 ? static_cast<size_t>(size) : 0;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count; i += stride)
        atomicAdd(&temp[data[i]], 1u);
    __syncthreads();

    // Merge this block's partial histogram into the global one.
    unsigned long long *global_bins = reinterpret_cast<unsigned long long*>(d_histo);
    for (unsigned int bin = threadIdx.x; bin < HISTO_BINS; bin += blockDim.x)
        atomicAdd(&global_bins[bin], static_cast<unsigned long long>(temp[bin]));
}

// Builds random input, computes its byte histogram on the GPU, prints it, then
// recomputes the histogram on the CPU and checks that both results agree.
int main()
{
    const size_t buffer_sizeof = sizeof(unsigned char) * SIZE_DATA;
    const size_t histo_sizeof = sizeof(size_t) * HISTO_BINS;

    unsigned char *buffer = h_getrndarray(SIZE_DATA);
    if (buffer == nullptr) {
        fprintf(stderr, "failed to allocate %zu bytes of host memory\n", buffer_sizeof);
        return EXIT_FAILURE;
    }

    size_t histo[HISTO_BINS] = {};
    unsigned char *d_buffer = nullptr;
    size_t *d_histo = nullptr;

    CUDA_CHECK(cudaMalloc(&d_buffer, buffer_sizeof));
    CUDA_CHECK(cudaMemcpy(d_buffer, buffer, buffer_sizeof, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc(&d_histo, histo_sizeof));
    CUDA_CHECK(cudaMemset(d_histo, 0, histo_sizeof)); // zero the device counters

    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    const int blocks = prop.multiProcessorCount * 2;

    histo_kernel<<<blocks, HISTO_BINS>>>(d_buffer, d_histo, SIZE_DATA);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors
    // The blocking copy below also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(histo, d_histo, histo_sizeof, cudaMemcpyDeviceToHost));

    // An int index is required here: a char can never reach 256, so a char
    // loop counter would never terminate.
    for (int i = 0; i < static_cast<int>(HISTO_BINS); ++i)
        std::cout << i << " : " << histo[i] << " times." << std::endl;

    // CPU reference, accumulated into its own array so that it does not get
    // added on top of the GPU result.
    size_t cpu_histo[HISTO_BINS] = {};
    for (size_t i = 0; i < static_cast<size_t>(SIZE_DATA); ++i)
        ++cpu_histo[buffer[i]];
    for (auto &count : cpu_histo)
        std::cout << count << ", " << std::endl;

    bool match = true;
    for (unsigned int bin = 0; bin < HISTO_BINS; ++bin) {
        if (histo[bin] != cpu_histo[bin]) {
            match = false;
            std::cout << "mismatch at bin " << bin << ": gpu=" << histo[bin]
                      << " cpu=" << cpu_histo[bin] << std::endl;
        }
    }
    if (match)
        std::cout << "GPU and CPU histograms match." << std::endl;

    CUDA_CHECK(cudaFree(d_histo));
    CUDA_CHECK(cudaFree(d_buffer));
    free(buffer);
    return match ? EXIT_SUCCESS : EXIT_FAILURE;
}

這是部分的結果:
在這裡插入圖片描述