【演算法】cuda 快排

阿新 • • 發佈：2018-11-07

核心程式碼：

// Forward declaration so the helper below can launch the kernel recursively.
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth);

// Sorts data[left..right] (inclusive) by launching a child quicksort grid in
// its own non-blocking stream. If the stream cannot be created or the launch
// is rejected, the range is sorted inline instead, so a failed device-side
// launch can never leave a partition silently unsorted.
static __device__ void cdp_sort_subrange(unsigned int *data, int left, int right, int depth)
{
    cudaStream_t stream;

    if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess)
    {
        selection_sort(data, left, right);
        return;
    }

    // Discard any stale per-thread error so the check after the launch
    // reflects this launch only. A false "failure" would make this thread
    // sort the range while the child grid sorts it too.
    cudaGetLastError();

    cdp_simple_quicksort<<< 1, 1, 0, stream >>>(data, left, right, depth);
    const cudaError_t launch_status = cudaGetLastError();
    cudaStreamDestroy(stream);

    if (launch_status != cudaSuccess)
    {
        selection_sort(data, left, right);
    }
}

// Quicksort of data[left..right] (both bounds inclusive) that recursively
// launches one child grid per partition (CUDA Dynamic Parallelism).
//
//   data  : device array being sorted in place
//   left  : index of the first element of the range
//   right : index of the last element of the range
//   depth : current recursion level, 0 for the top-level launch
//
// The kernel does not use thread or block indices; it is launched as <<<1, 1>>>.
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
    // If we're too deep or there are few elements left, fall back to the
    // single-threaded selection sort.
    if (depth >= MAX_DEPTH || right-left <= INSERTION_SORT)
    {
        selection_sort(data, left, right);
        return;
    }

    unsigned int *lptr = data+left;
    unsigned int *rptr = data+right;
    // Midpoint written as left + half the span so the index cannot overflow int.
    unsigned int  pivot = data[left + (right-left)/2];

    // Do the partitioning.
    while (lptr <= rptr)
    {
        // Find the next left- and right-hand values to swap
        unsigned int lval = *lptr;
        unsigned int rval = *rptr;

        // Move the left pointer as long as the pointed element is smaller than the pivot.
        while (lval < pivot)
        {
            lptr++;
            lval = *lptr;
        }

        // Move the right pointer as long as the pointed element is larger than the pivot.
        while (rval > pivot)
        {
            rptr--;
            rval = *rptr;
        }

        // If the swap points are valid, do the swap!
        if (lptr <= rptr)
        {
            *lptr++ = rval;
            *rptr-- = lval;
        }
    }

    // Now the recursive part: [left, nright] and [nleft, right] remain to be sorted.
    int nright = static_cast<int>(rptr - data);
    int nleft  = static_cast<int>(lptr - data);

    // Sort the left part in a child grid.
    if (left < nright)
    {
        cdp_sort_subrange(data, left, nright, depth+1);
    }

    // Sort the right part in a child grid.
    if (nleft < right)
    {
        cdp_sort_subrange(data, nleft, right, depth+1);
    }
}

完整程式碼：

/*
* Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <iostream>
#include <cstdio>
#include <helper_cuda.h>
#include <helper_string.h>

#define MAX_DEPTH       16
#define INSERTION_SORT  32

////////////////////////////////////////////////////////////////////////////////
// In-place selection sort of data[left..right], both bounds inclusive.
// The quicksort kernel calls it once recursion is too deep or the range is
// small. An empty range (right < left) leaves the array untouched.
////////////////////////////////////////////////////////////////////////////////
__device__ void selection_sort(unsigned int *data, int left, int right)
{
    int slot = left;

    while (slot <= right)
    {
        // Start by assuming the element already in the slot is the smallest.
        int best = slot;
        unsigned smallest = data[slot];

        // Scan the rest of the range for a strictly smaller value.
        for (int probe = slot + 1; probe <= right; ++probe)
        {
            const unsigned candidate = data[probe];

            if (candidate >= smallest)
            {
                continue;
            }

            smallest = candidate;
            best = probe;
        }

        // Move the minimum into the slot; skip the writes when it is already there.
        if (best != slot)
        {
            data[best] = data[slot];
            data[slot] = smallest;
        }

        ++slot;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Very basic quicksort algorithm, recursively launching the next level.
////////////////////////////////////////////////////////////////////////////////

// Forward declaration so the helper below can launch the kernel recursively.
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth);

// Sorts data[left..right] (inclusive) by launching a child quicksort grid in
// its own non-blocking stream. If the stream cannot be created or the launch
// is rejected, the range is sorted inline instead, so a failed device-side
// launch can never leave a partition silently unsorted.
static __device__ void cdp_sort_subrange(unsigned int *data, int left, int right, int depth)
{
    cudaStream_t stream;

    if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess)
    {
        selection_sort(data, left, right);
        return;
    }

    // Discard any stale per-thread error so the check after the launch
    // reflects this launch only. A false "failure" would make this thread
    // sort the range while the child grid sorts it too.
    cudaGetLastError();

    cdp_simple_quicksort<<< 1, 1, 0, stream >>>(data, left, right, depth);
    const cudaError_t launch_status = cudaGetLastError();
    cudaStreamDestroy(stream);

    if (launch_status != cudaSuccess)
    {
        selection_sort(data, left, right);
    }
}

// Quicksort of data[left..right] (both bounds inclusive) that recursively
// launches one child grid per partition (CUDA Dynamic Parallelism).
//
//   data  : device array being sorted in place
//   left  : index of the first element of the range
//   right : index of the last element of the range
//   depth : current recursion level, 0 for the top-level launch
//
// The kernel does not use thread or block indices; it is launched as <<<1, 1>>>.
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
    // If we're too deep or there are few elements left, fall back to the
    // single-threaded selection sort.
    if (depth >= MAX_DEPTH || right-left <= INSERTION_SORT)
    {
        selection_sort(data, left, right);
        return;
    }

    unsigned int *lptr = data+left;
    unsigned int *rptr = data+right;
    // Midpoint written as left + half the span so the index cannot overflow int.
    unsigned int  pivot = data[left + (right-left)/2];

    // Do the partitioning.
    while (lptr <= rptr)
    {
        // Find the next left- and right-hand values to swap
        unsigned int lval = *lptr;
        unsigned int rval = *rptr;

        // Move the left pointer as long as the pointed element is smaller than the pivot.
        while (lval < pivot)
        {
            lptr++;
            lval = *lptr;
        }

        // Move the right pointer as long as the pointed element is larger than the pivot.
        while (rval > pivot)
        {
            rptr--;
            rval = *rptr;
        }

        // If the swap points are valid, do the swap!
        if (lptr <= rptr)
        {
            *lptr++ = rval;
            *rptr-- = lval;
        }
    }

    // Now the recursive part: [left, nright] and [nleft, right] remain to be sorted.
    int nright = static_cast<int>(rptr - data);
    int nleft  = static_cast<int>(lptr - data);

    // Sort the left part in a child grid.
    if (left < nright)
    {
        cdp_sort_subrange(data, left, nright, depth+1);
    }

    // Sort the right part in a child grid.
    if (nleft < right)
    {
        cdp_sort_subrange(data, nleft, right, depth+1);
    }
}

////////////////////////////////////////////////////////////////////////////////
// Call the quicksort kernel from the host.
//
//   data   : device pointer to the array to sort in place
//   nitems : number of elements in the array
//
// Blocks until the whole device-side recursion has finished. Any launch or
// execution error is reported through checkCudaErrors.
////////////////////////////////////////////////////////////////////////////////
void run_qsort(unsigned int *data, unsigned int nitems)
{
    // Prepare CDP for the max depth 'MAX_DEPTH'.
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));

    // The kernel takes inclusive bounds. Converting before subtracting makes an
    // empty input yield right == -1 without relying on unsigned wrap-around.
    int left = 0;
    int right = static_cast<int>(nitems) - 1;
    std::cout << "Launching kernel on the GPU" << std::endl;
    cdp_simple_quicksort<<< 1, 1 >>>(data, left, right, 0);

    // A kernel launch returns no status itself: fetch launch errors here,
    // then let the synchronize call surface errors raised during execution.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
}

////////////////////////////////////////////////////////////////////////////////
// Fill the host buffer 'dst' with 'nitems' pseudo-random values, each in the
// range [0, nitems). The generator is re-seeded with a constant on every call,
// so repeated calls produce the same sequence.
////////////////////////////////////////////////////////////////////////////////
void initialize_data(unsigned int *dst, unsigned int nitems)
{
    // Constant seed keeps the demo input reproducible.
    srand(2047);

    unsigned int *const stop = dst + nitems;

    for (unsigned int *cursor = dst ; cursor != stop ; ++cursor)
    {
        const unsigned int sample = static_cast<unsigned int>(rand());
        *cursor = sample % nitems;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Copy the 'n' sorted values back from the device and verify that they are in
// non-decreasing order. Prints "OK" on success; on the first out-of-order pair
// it prints the offending values and terminates the process with EXIT_FAILURE.
////////////////////////////////////////////////////////////////////////////////
void check_results(int n, unsigned int *results_d)
{
    // Host-side staging buffer for the device results.
    unsigned int *host_copy = new unsigned[n];
    checkCudaErrors(cudaMemcpy(host_copy, results_d, n*sizeof(unsigned), cudaMemcpyDeviceToHost));

    int idx = 1;

    while (idx < n)
    {
        const unsigned int prev = host_copy[idx-1];
        const unsigned int curr = host_copy[idx];

        if (prev > curr)
        {
            std::cout << "Invalid item[" << idx-1 << "]: " << prev << " greater than " << curr << std::endl;
            exit(EXIT_FAILURE);
        }

        ++idx;
    }

    std::cout << "OK" << std::endl;
    delete[] host_copy;
}

////////////////////////////////////////////////////////////////////////////////
// Main entry point.
//
// Command-line options (parsed with the sample helper functions):
//   help / h          print usage and exit
//   v                 print the generated input values
//   num_items=<n>     number of items to sort (default 128, must be >= 1)
//
// Generates the input on the host, sorts it on the GPU and validates the
// result. Exits with EXIT_WAIVED when the device is below compute
// capability 3.5, which this sample requires for CUDA Dynamic Parallelism.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    int num_items = 128;
    bool verbose = false;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of items to sort" << std::endl;
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "v"))
    {
        verbose = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "num_items"))
    {
        num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items");

        // A single item is accepted, so the message states the real bound.
        if (num_items < 1)
        {
            std::cerr << "ERROR: num_items has to be greater than 0" << std::endl;
            exit(EXIT_FAILURE);
        }
    }

    // Find/set device and get device properties
    int device = -1;
    cudaDeviceProp deviceProp;
    device = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));

    // Device-side kernel launches need compute capability 3.5 or newer.
    if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5)))
    {
        printf("GPU %d - %s  does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name);
        exit(EXIT_WAIVED);
    }

    // Create input data
    unsigned int *h_data = 0;
    unsigned int *d_data = 0;

    // Allocate CPU memory and initialize data.
    std::cout << "Initializing data:" << std::endl;
    h_data =(unsigned int *)malloc(num_items*sizeof(unsigned int));

    // Fail cleanly instead of writing through a null pointer below.
    if (!h_data)
    {
        std::cerr << "ERROR: failed to allocate host memory for " << num_items << " items" << std::endl;
        exit(EXIT_FAILURE);
    }

    initialize_data(h_data, num_items);

    if (verbose)
    {
        for (int i=0 ; i<num_items ; i++)
            std::cout << "Data [" << i << "]: " << h_data[i] << std::endl;
    }

    // Allocate GPU memory.
    checkCudaErrors(cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int)));
    checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));

    // Execute
    std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
    run_qsort(d_data, num_items);

    // Check result
    std::cout << "Validating results: ";
    check_results(num_items, d_data);

    free(h_data);
    checkCudaErrors(cudaFree(d_data));

    exit(EXIT_SUCCESS);
}

選自cuda 自帶的例子。

【演算法】cuda 快排

核心程式碼： __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth) { // If we're too deep or there are few el

【演算法】最快最簡單的排序——桶排序

在我們生活的這個世界中到處都是被排序過的。站隊的時候會按照身高排序，考試的名次需要按照分數排序，網上購物的時候會按照價格排序，電子郵箱中的郵件按照時間排序……總之很多東西都需要排序，可以說排序是無處不在。現在我們舉個具體的例子來介紹一下排序演算法。首先出場的我們的主人

【演算法】排序02——歸併排序介紹及其在分治演算法思想上與快排的區別（含歸併程式碼）

1、歸併排序是什麼？歸併排序和快速排序一樣，都採用了分治演算法的思想，時間複雜度都為 O(n log n)，但其空間複雜度更大一點，為 O(n)，不過相對的，歸併是一種穩定排序，這一點和快排是不同的。歸併排序的思想流程: 先分，我們先舉例一個序列 [ 5 6 9 8 7 4

【演算法】以給定值x為基準將連結串列分割成兩部分，所有小於x的結點排在大於或等於x的節點之前

/* * 直接建立兩個連結串列：一個連結串列存放小於x的元素，另一個存放大於或等於x的元素。 * 然後迭代訪問整個連結串列，將元素插入before或者after連結串列前端！！！一旦抵達連結串列末端，則表明拆分完畢，最後合併兩個連結串列。 */

【演算法】一個比系統自帶的sqrt函式快四倍的sqrt

看完之後，感覺碼程式碼的能力不如數學好。演算法還是很重要的！再加上程式設計師的創造性，一定會有更好的方法出現。　　我們平時經常會有一些資料運算的操作，需要呼叫sqrt，exp，abs等函式，那麼你有沒有想過：這些函式系統是如何實現的？就拿最常用的sqrt函式

【演算法】基於優先順序的排班演算法實現

場景：在大學的裡，有不少社團組織會要組織中的成員值班，當然這個值班時間是學生無課的時間才會被安排值班。假設現有如下需求：一天中有3個時間段要有人值班，每週週一到週五都要值班，就是共有15個值班段，每個時間段值班的人都不一樣，現有40個學生，要求根據這些學生的無課表情況安排

【演算法】字串迴圈移位後是否包含

問題給定兩個字串s1和s2，要求判斷s2是否能夠被通過s1做迴圈移位（rotate）得到的字串包含。例如，s1=AABCD和s2=CDAA，返回true；給定s1=ABCD和s2=ACBD，返回false。解法一最直接最笨的方法就對s1進行迴圈移動，再

12、【演算法】查詢演算法總結

一、順序查詢 1、定義順序查詢屬於無序查詢，從資料結構的一端開始，順序掃描，依次將掃描到的節點關鍵字與給定值K相比，若相等，則表示查詢成功，若掃描結束，仍未找到關鍵字與給定值K相等，則表示查詢失敗。時間複雜度分析查詢成功時：平均查詢長度為（N+1）/2

13、【演算法】演算法複雜度分析

一、演算法的時間複雜度分析 1、時間複雜度的定義在進行演算法分析時，演算法中基本操作語句重複執行的次數是問題規模n的某個函式，用T(n)表示，若有某個輔助函式f(n)，使得當n趨近於無窮大時，T（n)/f(n)的極限值為不等於零的常數，則稱f(n)是T(n)的同數量級函式，

11、【演算法】排序演算法總結

常見排序演算法總結一、氣泡排序 1、定義氣泡排序是一種比較簡單的排序演算法，它會遍歷若干次要排序的數列，每次遍歷時，它都會從前往後依次地比較兩個相鄰的數的大小；如果前者比後者大，則交換它們的位置。這樣一次遍歷之後，最大的元素就在數列的末尾了。採用相同的方法在

【演算法】第三章作業實踐報告

【演算法】實踐第三章作業 1. 實踐題目最大子段和 2. 問題描述給定n個整數（可能為負數）組成的序列a[1],a[2],a[3],…,a[n]，求該序列如a[i]+a[i+1]+…+a[j]的子段和的最大值。當所給的整數均為負數時，定義子段和為0。

【演算法】-003 三次貝塞爾曲線的交點

【演算法】-003 三次貝塞爾曲線的交點最近在工作中遇到一個問題，想通過計算兩條三次貝塞爾曲線的交點位置。嘗試了列舉法之後覺得計算速度太慢，於是來找其他演算法。文章目錄【演算法】-003 三次貝塞爾曲線的交點 1、列舉法求貝塞爾曲線交

【演算法】-002 小數四捨五入截斷

【演算法】-002 小數四捨五入截斷實際應用中，由於採集裝置的精度問題，雖然測量值在有效範圍內，但可能資料尾部會處在一直跳變的過程中。為了將不穩定的資料位捨去，因此需要將測試資料進行帶四捨五入的截斷。【演算法】-002 小數四捨五入截斷

【演算法】二叉樹前序、中序、後序遍歷相互求法（轉）

二叉樹前序、中序、後序遍歷相互求法原文地址今天來總結下二叉樹前序、中序、後序遍歷相互求法，即如果知道兩個的遍歷，如何求第三種遍歷方法，比較笨的方法是畫出來二叉樹，然後根據各種遍歷不同的特性來求，也可以程式設計求出，下面我們分別說明。

【演算法】二叉樹的遞迴和非遞迴遍歷（轉）

原文地址【寫在前面】　　二叉樹是一種非常重要的資料結構，很多其它資料結構都是基於二叉樹的基礎演變而來的。對於二叉樹，有前序、中序以及後序三種遍歷方法。因為樹的定義本身就是遞迴定義，因此採用遞迴的方法去實現樹的三種遍歷不僅容易理解而且程式碼很簡潔。而對於樹的遍歷若採用非遞迴的方法，就要採

【演算法】貪心演算法

1. 定義貪心演算法（又稱貪婪演算法）是指，在對問題求解時，總是做出在當前看來是最好的選擇。也就是說，不從整體最優上加以考慮，他所做出的是在某種意義上的區域性最優解。 2. 基本要素（1）貪心選擇貪心選擇是指所求問題的整體最優解可以通過一系列區域性最優的選擇，即貪心選擇

【演算法】演算法分析

相關資料來源於網路，侵刪歉。如果文章中存在錯誤，請下方評論告知我，謝謝！用科學的方法分析演算法觀察真實世界的特徵；根據觀察提出假設模型；根據模型預測未來的事件；繼續觀察驗證預測的準確性；反覆如此直到確認預測和觀察一致。原則

【演算法】二叉樹的廣度遍歷

廣度優先遍歷的核心思想如下：從根節點開始遍歷，然後遍歷其子節點，再從左至右地依次遍歷其孫子節點，以此類推，直到完成整棵二叉樹的遍歷。 50 20 …

【演算法】LeetCode演算法題-Length Of Last Word

這是悅樂書的第155次更新，第157篇原創 01 看題和準備今天介紹的是LeetCode演算法題中Easy級別的第14題（順位題號是58）。給定一個字串，包含大寫字母、小寫字母和空格，返回最後一個單詞的長度，如果最後一個單詞不存在則返回0。另外，單詞不包含空格。例如：

【演算法】LeetCode演算法題-Maximum Subarray

這是悅樂書的第154次更新，第156篇原創 01 看題和準備今天介紹的是LeetCode演算法題中Easy級別的第13題（順位題號是53）。給定一個整數陣列nums，找出一個最大和，此和是由陣列中索引連續的元素組成，至少包含一個元素。例如：輸入：[-2, 1, -

【演算法】cuda 快排

相關推薦