
GPU decoding of video streams with ffmpeg (Part 2): the GPU decoder

1. Basic call flow of the GPU decoder

To do video-stream decoding on the GPU you first have to understand CUDA's own decoding flow, because the two share the same underlying implementation and differ only in how they are called from above.

So what does the CUDA decoding flow look like?

Download Video_Codec_SDK_8.0.14 from https://developer.nvidia.com/nvidia-video-codec-sdk

and unpack it.

The Samples directory contains several small examples targeting different use cases. If you are not sure which one to reference, check the documentation: the doc directory contains NVENC_VideoEncoder_API_ProgGuide.pdf.

Since what I need here is video-stream decoding, the best demo to study is NvTranscoder.

Inside NvTranscoder, the main files to focus on are the following:

NvTranscoder.cpp implements the main function

VideoDecoder.cpp implements the decoding

FrameQueue.cpp implements the queue through which the GPU-decoded frames are handed back

 

First, look at the main code of NvTranscoder.cpp (it is fairly long-winded; read all of it if you are interested):

int main(int argc, char* argv[])
{
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    typedef HMODULE CUDADRIVER;
#else
    typedef void *CUDADRIVER;
#endif
    CUDADRIVER hHandleDriver = 0;

    __cu(cuInit(0, __CUDA_API_VERSION, hHandleDriver));
    __cu(cuvidInit(0));

    EncodeConfig encodeConfig = { 0 };
    encodeConfig.endFrameIdx = INT_MAX;
    encodeConfig.bitrate = 5000000;
    encodeConfig.rcMode = NV_ENC_PARAMS_RC_CONSTQP;
    encodeConfig.gopLength = NVENC_INFINITE_GOPLENGTH;
    encodeConfig.codec = NV_ENC_H264;
    encodeConfig.fps = 0;
    encodeConfig.qp = 28;
    encodeConfig.i_quant_factor = DEFAULT_I_QFACTOR;
    encodeConfig.b_quant_factor = DEFAULT_B_QFACTOR;  
    encodeConfig.i_quant_offset = DEFAULT_I_QOFFSET;
    encodeConfig.b_quant_offset = DEFAULT_B_QOFFSET;   
    encodeConfig.presetGUID = NV_ENC_PRESET_DEFAULT_GUID;
    encodeConfig.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;

    NVENCSTATUS nvStatus = CNvHWEncoder::ParseArguments(&encodeConfig, argc, argv);
    if (nvStatus != NV_ENC_SUCCESS)
    {
        PrintHelp();
        return 1;
    }

    if (!encodeConfig.inputFileName || !encodeConfig.outputFileName)
    {
        PrintHelp();
        return 1;
    }

    encodeConfig.fOutput = fopen(encodeConfig.outputFileName, "wb");
    if (encodeConfig.fOutput == NULL)
    {
        PRINTERR("Failed to create \"%s\"\n", encodeConfig.outputFileName);
        return 1;
    }

    //init cuda
    CUcontext cudaCtx;
    CUdevice device;
    __cu(cuDeviceGet(&device, encodeConfig.deviceID));
    __cu(cuCtxCreate(&cudaCtx, CU_CTX_SCHED_AUTO, device));

    CUcontext curCtx;
    CUvideoctxlock ctxLock;
    __cu(cuCtxPopCurrent(&curCtx));
    __cu(cuvidCtxLockCreate(&ctxLock, curCtx));

    CudaDecoder* pDecoder   = new CudaDecoder;
    FrameQueue* pFrameQueue = new CUVIDFrameQueue(ctxLock);
    pDecoder->InitVideoDecoder(encodeConfig.inputFileName, ctxLock, pFrameQueue, encodeConfig.width, encodeConfig.height);

    int decodedW, decodedH, decodedFRN, decodedFRD, isProgressive;
    pDecoder->GetCodecParam(&decodedW, &decodedH, &decodedFRN, &decodedFRD, &isProgressive);
    if (decodedFRN <= 0 || decodedFRD <= 0) {
        decodedFRN = 30;
        decodedFRD = 1;
    }

    if(encodeConfig.width <= 0 || encodeConfig.height <= 0) {
        encodeConfig.width  = decodedW;
        encodeConfig.height = decodedH;
    }

    float fpsRatio = 1.f;
    if (encodeConfig.fps <= 0) {
        encodeConfig.fps = decodedFRN / decodedFRD;
    }
    else {
        fpsRatio = (float)encodeConfig.fps * decodedFRD / decodedFRN;
    }

    encodeConfig.pictureStruct = (isProgressive ? NV_ENC_PIC_STRUCT_FRAME : 0);
    pFrameQueue->init(encodeConfig.width, encodeConfig.height);

    VideoEncoder* pEncoder = new VideoEncoder(ctxLock);
    assert(pEncoder->GetHWEncoder());

    nvStatus = pEncoder->GetHWEncoder()->Initialize(cudaCtx, NV_ENC_DEVICE_TYPE_CUDA);
    if (nvStatus != NV_ENC_SUCCESS)
        return 1;

    encodeConfig.presetGUID = pEncoder->GetHWEncoder()->GetPresetGUID(encodeConfig.encoderPreset, encodeConfig.codec);

    printf("Encoding input           : \"%s\"\n", encodeConfig.inputFileName);
    printf("         output          : \"%s\"\n", encodeConfig.outputFileName);
    printf("         codec           : \"%s\"\n", encodeConfig.codec == NV_ENC_HEVC ? "HEVC" : "H264");
    printf("         size            : %dx%d\n", encodeConfig.width, encodeConfig.height);
    printf("         bitrate         : %d bits/sec\n", encodeConfig.bitrate);
    printf("         vbvMaxBitrate   : %d bits/sec\n", encodeConfig.vbvMaxBitrate);
    printf("         vbvSize         : %d bits\n", encodeConfig.vbvSize);
    printf("         fps             : %d frames/sec\n", encodeConfig.fps);
    printf("         rcMode          : %s\n", encodeConfig.rcMode == NV_ENC_PARAMS_RC_CONSTQP ? "CONSTQP" :
                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR ? "VBR" :
                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_CBR ? "CBR" :
                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR_MINQP ? "VBR MINQP (deprecated)" :
                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_CBR_LOWDELAY_HQ ? "CBR_LOWDELAY_HQ" :
                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_CBR_HQ ? "CBR_HQ" :
                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR_HQ ? "VBR_HQ" : "UNKNOWN");
    if (encodeConfig.gopLength == NVENC_INFINITE_GOPLENGTH)
        printf("         goplength       : INFINITE GOP \n");
    else
        printf("         goplength       : %d \n", encodeConfig.gopLength);
    printf("         B frames        : %d \n", encodeConfig.numB);
    printf("         QP              : %d \n", encodeConfig.qp);
    printf("         preset          : %s\n", (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_HQ_GUID) ? "LOW_LATENCY_HQ" :
        (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_HP_GUID) ? "LOW_LATENCY_HP" :
        (encodeConfig.presetGUID == NV_ENC_PRESET_HQ_GUID) ? "HQ_PRESET" :
        (encodeConfig.presetGUID == NV_ENC_PRESET_HP_GUID) ? "HP_PRESET" :
        (encodeConfig.presetGUID == NV_ENC_PRESET_LOSSLESS_HP_GUID) ? "LOSSLESS_HP" : "LOW_LATENCY_DEFAULT");
    printf("\n");

    nvStatus = pEncoder->GetHWEncoder()->CreateEncoder(&encodeConfig);
    if (nvStatus != NV_ENC_SUCCESS)
        return 1;

    nvStatus = pEncoder->AllocateIOBuffers(&encodeConfig);
    if (nvStatus != NV_ENC_SUCCESS)
        return 1;

    unsigned long long lStart, lEnd, lFreq;
    NvQueryPerformanceCounter(&lStart);

    //start decoding thread
#ifdef _WIN32
    HANDLE decodeThread = CreateThread(NULL, 0, DecodeProc, (LPVOID)pDecoder, 0, NULL);
#else
    pthread_t pid;
    pthread_create(&pid, NULL, DecodeProc, (void*)pDecoder);
#endif

    //start encoding thread
    int frmProcessed = 0;
    int frmActual = 0;
    while(!(pFrameQueue->isEndOfDecode() && pFrameQueue->isEmpty()) ) {

        CUVIDPARSERDISPINFO pInfo;
        if(pFrameQueue->dequeue(&pInfo)) {
            CUdeviceptr dMappedFrame = 0;
            unsigned int pitch;
            CUVIDPROCPARAMS oVPP = { 0 };
            oVPP.progressive_frame = pInfo.progressive_frame;
            oVPP.second_field = 0;
            oVPP.top_field_first = pInfo.top_field_first;
            oVPP.unpaired_field = (pInfo.progressive_frame == 1 || pInfo.repeat_first_field <= 1);

            cuvidMapVideoFrame(pDecoder->GetDecoder(), pInfo.picture_index, &dMappedFrame, &pitch, &oVPP);

            EncodeFrameConfig stEncodeConfig = { 0 };
            NV_ENC_PIC_STRUCT picType = (pInfo.progressive_frame || pInfo.repeat_first_field >= 2 ? NV_ENC_PIC_STRUCT_FRAME :
                (pInfo.top_field_first ? NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM : NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP));

            stEncodeConfig.dptr = dMappedFrame;
            stEncodeConfig.pitch = pitch;
            stEncodeConfig.width = encodeConfig.width;
            stEncodeConfig.height = encodeConfig.height;

            int dropOrDuplicate = MatchFPS(fpsRatio, frmProcessed, frmActual);
            for (int i = 0; i <= dropOrDuplicate; i++) {
                pEncoder->EncodeFrame(&stEncodeConfig, picType);
                frmActual++;
            }
            frmProcessed++;

            cuvidUnmapVideoFrame(pDecoder->GetDecoder(), dMappedFrame);
            pFrameQueue->releaseFrame(&pInfo);
       }
    }

    pEncoder->EncodeFrame(NULL, NV_ENC_PIC_STRUCT_FRAME, true);

#ifdef _WIN32
    WaitForSingleObject(decodeThread, INFINITE);
#else
    pthread_join(pid, NULL);
#endif

    if (pEncoder->GetEncodedFrames() > 0)
    {
        NvQueryPerformanceCounter(&lEnd);
        NvQueryPerformanceFrequency(&lFreq);
        double elapsedTime = (double)(lEnd - lStart)/(double)lFreq;
        printf("Total time: %fms, Decoded Frames: %d, Encoded Frames: %d, Average FPS: %f\n",
        elapsedTime * 1000,
        pDecoder->m_decodedFrames,
        pEncoder->GetEncodedFrames(),
        (float)pEncoder->GetEncodedFrames() / elapsedTime);
    }

    pEncoder->Deinitialize();
    delete pDecoder;
    delete pEncoder;
    delete pFrameQueue;

    cuvidCtxLockDestroy(ctxLock);
    __cu(cuCtxDestroy(cudaCtx));

    return 0;
}
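
Both the sample above and my trimmed-down version below launch a decoding thread running DecodeProc. In the SDK sample that thread body is essentially a hand-off to the decoder object; the following is only a minimal sketch of the non-Windows variant, assuming the CudaDecoder class from VideoDecoder.cpp:

static void* DecodeProc(void* arg)
{
    CudaDecoder* pDecoder = (CudaDecoder*)arg;
    // Start() puts the video source into the started state and blocks until it hits
    // end-of-stream; while it runs, the source/parser callbacks fire and decoded
    // frames land in the FrameQueue that the main thread consumes.
    pDecoder->Start();
    return NULL;
}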


 

Below is my trimmed-down version of the main flow:

int main(int argc, char* argv[])
{
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    typedef HMODULE CUDADRIVER;
#else
    typedef void *CUDADRIVER;
#endif
    CUDADRIVER hHandleDriver = 0;

    __cu(cuInit(0, __CUDA_API_VERSION, hHandleDriver)); // initialize the CUDA environment (mandatory)
    __cu(cuvidInit(0));                                 // initialize the cuvid decoder library


    //init cuda
    CUcontext cudaCtx;
    CUdevice device;
    __cu(cuDeviceGet(&device, deviceID));                   // get the GPU device handle; deviceID is the card index (0 for a single card, 0/1 for two cards, and so on)
    __cu(cuCtxCreate(&cudaCtx, CU_CTX_SCHED_AUTO, device)); // create a CUDA context on that card

    CUcontext curCtx;
    CUvideoctxlock ctxLock;
    __cu(cuCtxPopCurrent(&curCtx));             // pop the context created above off the current CPU thread
    __cu(cuvidCtxLockCreate(&ctxLock, curCtx)); // create the lock that serializes access to the GPU context
    CudaDecoder* pDecoder   = new CudaDecoder;              // create the CUDA decoder object (worth a close look)
    FrameQueue* pFrameQueue = new CUVIDFrameQueue(ctxLock); // create the decoded-frame output queue
    pDecoder->InitVideoDecoder(encodeConfig.inputFileName, ctxLock, pFrameQueue, encodeConfig.width, encodeConfig.height); // initialize the decoder (worth a close look); encodeConfig comes from the full sample above


    pFrameQueue->init(encodeConfig.width, encodeConfig.height); // initialize the decoded-frame output queue

    //start the decoding thread
#ifdef _WIN32
    HANDLE decodeThread = CreateThread(NULL, 0, DecodeProc, (LPVOID)pDecoder, 0, NULL);
#else
    pthread_t pid;
    pthread_create(&pid, NULL, DecodeProc, (void*)pDecoder);
#endif

    //main-thread loop (this is where the original sample does its encoding)
    int frmProcessed = 0;
    int frmActual = 0;
    //pull the decoded frames out of the output queue
    while(!(pFrameQueue->isEndOfDecode() && pFrameQueue->isEmpty()) ) {

        CUVIDPARSERDISPINFO pInfo;
        if(pFrameQueue->dequeue(&pInfo)) {
            CUdeviceptr dMappedFrame = 0;
            unsigned int pitch;
            CUVIDPROCPARAMS oVPP = { 0 };
            oVPP.progressive_frame = pInfo.progressive_frame;
            oVPP.second_field = 0;
            oVPP.top_field_first = pInfo.top_field_first;
            oVPP.unpaired_field = (pInfo.progressive_frame == 1 || pInfo.repeat_first_field <= 1);
            // map the decoded frame: dMappedFrame is its address in GPU memory, pitch is the row stride in bytes
            cuvidMapVideoFrame(pDecoder->GetDecoder(), pInfo.picture_index, &dMappedFrame, &pitch, &oVPP);
            // the decoded data still lives in GPU memory, so work out its size (NV12 is 12 bits per pixel)
            unsigned int nv12_size = pitch * (pDecoder->iHeight + pDecoder->iHeight/2);
            // and copy it from GPU memory into pa->pFrameBuffer (a host buffer allocated elsewhere)
            CUresult oResult = cuMemcpyDtoH(pa->pFrameBuffer, dMappedFrame, nv12_size);
    
            // unmap the frame, releasing the GPU-side output surface
            cuvidUnmapVideoFrame(pDecoder->GetDecoder(), dMappedFrame);
            pFrameQueue->releaseFrame(&pInfo);
       }
    }


#ifdef _WIN32
    WaitForSingleObject(decodeThread, INFINITE);
#else
    pthread_join(pid, NULL);
#endif
    delete pDecoder;
    delete pFrameQueue;

    cuvidCtxLockDestroy(ctxLock);
    __cu(cuCtxDestroy(cudaCtx));

    return 0;
}

 

Within all of this, the decoder call flow is the part that deserves the most attention.

First the decoder object is constructed with new CudaDecoder.

InitVideoDecoder then creates three objects: the video source, the decoder and the parser; a sketch of the corresponding cuvid calls is shown below.
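
A minimal sketch of what that initialization boils down to with the raw cuvid API. The member names (m_videoSource, m_videoParser, m_videoDecoder, m_ctxLock), videoPath, and the callback names are stand-ins for whatever VideoDecoder.cpp actually declares, and only the fields that matter are kept; in some samples the decoder is only created inside the sequence callback, once the real stream dimensions are known.

// Inside something like CudaDecoder::InitVideoDecoder (requires nvcuvid.h):

// 1. Video source: reads the file/stream and hands compressed packets to a callback.
CUVIDSOURCEPARAMS oSourceParams;
memset(&oSourceParams, 0, sizeof(oSourceParams));
oSourceParams.pUserData           = this;             // handed back to every callback
oSourceParams.pfnVideoDataHandler = HandleVideoData;  // receives CUVIDSOURCEDATAPACKETs
oSourceParams.pfnAudioDataHandler = NULL;
cuvidCreateVideoSource(&m_videoSource, videoPath, &oSourceParams);

// 2. Parser: splits the elementary stream into pictures and fires per-picture callbacks.
CUVIDPARSERPARAMS oParserParams;
memset(&oParserParams, 0, sizeof(oParserParams));
oParserParams.CodecType              = cudaVideoCodec_H264;
oParserParams.ulMaxNumDecodeSurfaces = 8;
oParserParams.pUserData              = this;
oParserParams.pfnSequenceCallback    = HandleVideoSequence;  // stream format is known
oParserParams.pfnDecodePicture       = HandlePictureDecode;  // a picture is ready to decode
oParserParams.pfnDisplayPicture      = HandlePictureDisplay; // a picture is ready to display
cuvidCreateVideoParser(&m_videoParser, &oParserParams);

// 3. Decoder: the actual hardware decode session (codec, size, output surface format).
CUVIDDECODECREATEINFO oDecodeInfo;
memset(&oDecodeInfo, 0, sizeof(oDecodeInfo));
oDecodeInfo.CodecType           = cudaVideoCodec_H264;
oDecodeInfo.ulWidth             = width;
oDecodeInfo.ulHeight            = height;
oDecodeInfo.ulNumDecodeSurfaces = 8;
oDecodeInfo.ChromaFormat        = cudaVideoChromaFormat_420;
oDecodeInfo.OutputFormat        = cudaVideoSurfaceFormat_NV12;
oDecodeInfo.DeinterlaceMode     = cudaVideoDeinterlaceMode_Weave;
oDecodeInfo.ulTargetWidth       = width;
oDecodeInfo.ulTargetHeight      = height;
oDecodeInfo.ulNumOutputSurfaces = 2;
oDecodeInfo.ulCreationFlags     = cudaVideoCreate_PreferCUVID;
oDecodeInfo.vidLock             = m_ctxLock;
cuvidCreateDecoder(&m_videoDecoder, &oDecodeInfo);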

 

Once the source object has loaded data it fires a callback that carries a packet in CUVIDSOURCEDATAPACKET format; the packet is handed to the parser, the parser passes each picture to the decoder, and the decoder pushes the decoded frames into the queue, from which they are sent on to the main thread. A sketch of these callbacks follows.
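
A minimal sketch of that callback chain. The callback signatures come from nvcuvid.h; the member and queue method names (m_videoParser, m_videoDecoder, m_pFrameQueue, waitUntilFrameAvailable, enqueue) follow the SDK sample's FrameQueue and are assumptions if your own class looks different:

// Video source callback: one compressed packet read from the input.
static int CUDAAPI HandleVideoData(void* pUserData, CUVIDSOURCEDATAPACKET* pPacket)
{
    CudaDecoder* pThis = (CudaDecoder*)pUserData;
    // Forward the packet to the parser; the parser will fire the two callbacks below.
    cuvidParseVideoData(pThis->m_videoParser, pPacket);
    return 1;
}

// Parser callback: a picture is ready to be decoded (decode order).
static int CUDAAPI HandlePictureDecode(void* pUserData, CUVIDPICPARAMS* pPicParams)
{
    CudaDecoder* pThis = (CudaDecoder*)pUserData;
    // Wait until the target decode surface is free, then kick off the hardware decode.
    pThis->m_pFrameQueue->waitUntilFrameAvailable(pPicParams->CurrPicIdx);
    cuvidDecodePicture(pThis->m_videoDecoder, pPicParams);
    return 1;
}

// Parser callback: a decoded picture is ready for display (display order).
static int CUDAAPI HandlePictureDisplay(void* pUserData, CUVIDPARSERDISPINFO* pDispInfo)
{
    CudaDecoder* pThis = (CudaDecoder*)pUserData;
    // Push the frame into the queue; the main thread dequeues, maps and copies it.
    pThis->m_pFrameQueue->enqueue(pDispInfo);
    return 1;
}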

 

Having walked through the above, you should have a rough picture of the flow:

required GPU initialization → initialize the decoder, the parser and the video source → run → process the output data

2. Hooking up our own decoder

Now to our own requirement. Mine is to move the ffmpeg decoding path onto the GPU, so let's look at the official documentation first.

First, there are some requirements for using the SDK:

NVIDIA Video Codec SDK 8.0

System Requirements

* NVIDIA Kepler/Maxwell/Pascal GPU with hardware video accelerators - Refer to the NVIDIA Video SDK developer zone web page (https://developer.nvidia.com/nvidia-video-codec-sdk) for GPUs which
support encoding and decoding acceleration.
* Windows: Driver version 378.66 or higher
* Linux:   Driver version 378.13 or higher
* CUDA 7.5 Toolkit (optional)

[Windows Configuration Requirements]
- DirectX SDK is needed. You can download the latest SDK from Microsoft's DirectX website
- The CUDA 7.5 Toolkit is optional to install (see below on how to get it)
- CUDA toolkit is used for building CUDA kernels that can interop with NVENC.

The following environment variables need to be set to build the sample applications included with the SDK
* For Windows
  - DXSDK_DIR: pointing to the DirectX SDK root directory

[Linux Configuration Requirements]    
* For Linux
  - X11 and OpenGL, GLUT, GLEW libraries for video playback and display 
  - The CUDA 7.5 Toolkit is optional to install (see below on how to get it)
  - CUDA toolkit is used for building CUDA kernels that can interop with NVENC.  


I checked, and my Linux machine basically meets these requirements.

 

Verifying feasibility

The hints in Using_FFmpeg_with_NVIDIA_GPU_Hardware_Acceleration.pdf show that you can simply build ffmpeg and use its built-in CUDA decoder to test decoding, though there are version requirements.

Matching things up: I am on SDK 8.0, so I use ffmpeg 3.4.

Build:

./configure --enable-shared --enable-cuda --enable-cuvid --enable-nvenc --enable-nonfree --enable-libnpp --extra-cflags=-I/usr/local/cuda/include --extra-ldflags=-L/usr/local/cuda/lib64 --prefix=/home/user/mjl/algo/ffmpeg/build


make -j 4  (four build threads recommended; with eight threads "not found" errors may appear)


Verify:

 ffmpeg -y -hwaccel cuvid -c:v h264_cuvid -vsync 0 -i input.mp4 -vf scale_npp=1920:1072 -vcodec h264_nvenc output0.264 -vf scale_npp=1280:720 -vcodec h264_nvenc output1.264
At first this reported the error: Unknown decoder 'h264_cuvid'

Note that the command must be run with superuser privileges, because only the superuser can access the GPU.

After that the output files were produced normally, which proves the approach is feasible.

As for ffmpeg's own built-in CUVID decoder, I have never fully understood it: ffmpeg registers all of its codecs during initialization, but how to invoke the GPU decoder cleanly from the application layer is still unclear to me; I would be glad to discuss this.

Here I hook the CUDA decoder up directly myself, which also makes it easier to control the data.

 

 

avformat_network_init();
    av_register_all();                        // 1. register all codec modules; in 3.3 and later this includes the GPU decode modules

    std::string tempfile = "xxxx";            // URL of the video stream

    avformat_open_input(&format_context_, tempfile.c_str(), nullptr, nullptr);  // open the stream (needed before step 2)
    avformat_find_stream_info(format_context_, nullptr);    // 2. probe a short chunk of the stream to work out its basic format
    if (AVMEDIA_TYPE_VIDEO == enc->codec_type && video_stream_index_ < 0)        // 3. pick out the video stream
    codec_ = avcodec_find_decoder(enc->codec_id);            // 4. find the matching decoder
    codec_context_ = avcodec_alloc_context3(codec_);         // 5. allocate the decoder context

    av_read_frame(format_context_, &packet_);                // 6. read one packet

    avcodec_send_packet(codec_context_, &packet_);           // 7. submit the packet for decoding
    avcodec_receive_frame(codec_context_, yuv_frame_);       // 8. receive the decoded frame

    sws_scale(y2r_sws_context_, yuv_frame_->data, yuv_frame_->linesize, 0, codec_context_->height, rgb_data_, rgb_line_size_);  // 9. convert the pixel format (YUV to RGB)

As mentioned in Part 1, steps 4, 7, 8 and 9 are the ones that need to change.

The data is still pulled by ffmpeg, which means we do not need CUDA's own video source; we only need to hook up the decoder and the parser (it would be even better if the data pulling could also be done on the GPU).

The data that comes out of ffmpeg is an AVPacket, while the CUDA decoder expects a CUVIDSOURCEDATAPACKET, so a format conversion is involved; a sketch follows.
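
A minimal sketch of that conversion. FeedPacketToCuvid is just an illustrative helper name, and for H.264 carried in MP4/FLV the packet should already have gone through the Annex B bitstream filter described below:

// Feed one demuxed ffmpeg packet to the cuvid parser (requires nvcuvid.h and libavcodec/avcodec.h).
void FeedPacketToCuvid(CUvideoparser parser, const AVPacket* pkt)
{
    CUVIDSOURCEDATAPACKET cupkt;
    memset(&cupkt, 0, sizeof(cupkt));

    if (pkt && pkt->size > 0) {
        cupkt.payload_size = pkt->size;
        cupkt.payload      = pkt->data;
        if (pkt->pts != AV_NOPTS_VALUE) {
            cupkt.flags     = CUVID_PKT_TIMESTAMP;
            cupkt.timestamp = pkt->pts;   // note: cuvid timestamps follow the parser's ulClockRate, not the stream time_base
        }
    } else {
        // An empty packet tells the parser the stream has ended so it flushes its pipeline.
        cupkt.flags = CUVID_PKT_ENDOFSTREAM;
    }

    cuvidParseVideoData(parser, &cupkt);
}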

Early on I found this post online: https://www.cnblogs.com/dwdxdy/archive/2013/08/07/3244723.html , where the author shows his implementation of the format-conversion part.

I tried it, and it does not work: there was no decoded output at all!

This post, https://www.cnblogs.com/betterwgo/p/6613641.html , is more thorough, but part of its code is fairly dated and I still could not get it to run. Even so, I really respect the author; getting that far and sharing it is already very good!

Here is the code I modified on top of his; I did not use his approach shown below:

        //h264bsfc = av_bitstream_filter_init("h264_mp4toannexb");
        //av_bsf_alloc(av_bsf_get_by_name("h264_mp4toannexb"), &bsf);

Instead I switched to the av_bsf_send_packet and av_bsf_receive_packet approach; a sketch of that flow follows.
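
A minimal sketch of the av_bsf_send_packet / av_bsf_receive_packet flow, assuming the format_context_, video_stream_index_ and packet_ from the skeleton above, a CUvideoparser named parser, and the hypothetical FeedPacketToCuvid helper sketched earlier; error handling is trimmed:

// One-time setup: create the h264_mp4toannexb bitstream filter for the video stream.
const AVBitStreamFilter* filter = av_bsf_get_by_name("h264_mp4toannexb");
AVBSFContext* bsf_ctx = NULL;
av_bsf_alloc(filter, &bsf_ctx);
avcodec_parameters_copy(bsf_ctx->par_in, format_context_->streams[video_stream_index_]->codecpar);
av_bsf_init(bsf_ctx);

// Per packet: push the MP4-style packet in, pull Annex B packets out,
// and hand each one to the cuvid parser.
av_bsf_send_packet(bsf_ctx, &packet_);
AVPacket filtered;
av_init_packet(&filtered);
filtered.data = NULL;
filtered.size = 0;
while (av_bsf_receive_packet(bsf_ctx, &filtered) == 0) {
    FeedPacketToCuvid(parser, &filtered);
    av_packet_unref(&filtered);
}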

With that, the decoding part is in place; I will post the full source code when I have time.