視訊流GPU解碼在ffmpeg的實現(二)-GPU解碼器
阿新 • 發佈:2018-12-12
1.gpu解碼器的基本呼叫流程 要做視訊流解碼,必須要了解cuda自身的解碼流,因為二者是一樣的底層實現,不一樣的上層呼叫 那cuda的解碼流程是如何的呢 在 https://developer.nvidia.com/nvidia-video-codec-sdk 下載 Video_Codec_SDK_8.0.14 解壓開來 在sampls裡面有幾個針對不同場景應用的小例子,如果不知道自己該參考哪一個,就需要去看開發文件,doc裡面有一個 NVENC_VideoEncoder_API_ProgGuide.pdf 文件 由於我這裡使用的是視訊流解碼,所以最好去檢視NvTranscoder這個demo. 在NvTranscoder裡面主要關注紅框中的這幾個檔案 NvTranscoder.cpp實現了主函式 VideoDecoder.cpp實現瞭解碼 FrameQueue.cpp實現了gpu解碼後的資料回撥 先看NvTranscoder.cpp的主要程式碼(比較冗餘,有興趣可以全部看) int main(int argc, char* argv[]) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) typedef HMODULE CUDADRIVER; #else typedef void *CUDADRIVER; #endif CUDADRIVER hHandleDriver = 0; __cu(cuInit(0, __CUDA_API_VERSION, hHandleDriver)); __cu(cuvidInit(0)); EncodeConfig encodeConfig = { 0 }; encodeConfig.endFrameIdx = INT_MAX; encodeConfig.bitrate = 5000000; encodeConfig.rcMode = NV_ENC_PARAMS_RC_CONSTQP; encodeConfig.gopLength = NVENC_INFINITE_GOPLENGTH; encodeConfig.codec = NV_ENC_H264; encodeConfig.fps = 0; encodeConfig.qp = 28; encodeConfig.i_quant_factor = DEFAULT_I_QFACTOR; encodeConfig.b_quant_factor = DEFAULT_B_QFACTOR; encodeConfig.i_quant_offset = DEFAULT_I_QOFFSET; encodeConfig.b_quant_offset = DEFAULT_B_QOFFSET; encodeConfig.presetGUID = NV_ENC_PRESET_DEFAULT_GUID; encodeConfig.pictureStruct = NV_ENC_PIC_STRUCT_FRAME; NVENCSTATUS nvStatus = CNvHWEncoder::ParseArguments(&encodeConfig, argc, argv); if (nvStatus != NV_ENC_SUCCESS) { PrintHelp(); return 1; } if (!encodeConfig.inputFileName || !encodeConfig.outputFileName) { PrintHelp(); return 1; } encodeConfig.fOutput = fopen(encodeConfig.outputFileName, "wb"); if (encodeConfig.fOutput == NULL) { PRINTERR("Failed to create \"%s\"\n", encodeConfig.outputFileName); return 1; } //init cuda CUcontext cudaCtx; CUdevice device; __cu(cuDeviceGet(&device, encodeConfig.deviceID)); __cu(cuCtxCreate(&cudaCtx, CU_CTX_SCHED_AUTO, device)); CUcontext curCtx; CUvideoctxlock ctxLock; __cu(cuCtxPopCurrent(&curCtx)); __cu(cuvidCtxLockCreate(&ctxLock, curCtx)); 
CudaDecoder* pDecoder = new CudaDecoder; FrameQueue* pFrameQueue = new CUVIDFrameQueue(ctxLock); pDecoder->InitVideoDecoder(encodeConfig.inputFileName, ctxLock, pFrameQueue, encodeConfig.width, encodeConfig.height); int decodedW, decodedH, decodedFRN, decodedFRD, isProgressive; pDecoder->GetCodecParam(&decodedW, &decodedH, &decodedFRN, &decodedFRD, &isProgressive); if (decodedFRN <= 0 || decodedFRD <= 0) { decodedFRN = 30; decodedFRD = 1; } if(encodeConfig.width <= 0 || encodeConfig.height <= 0) { encodeConfig.width = decodedW; encodeConfig.height = decodedH; } float fpsRatio = 1.f; if (encodeConfig.fps <= 0) { encodeConfig.fps = decodedFRN / decodedFRD; } else { fpsRatio = (float)encodeConfig.fps * decodedFRD / decodedFRN; } encodeConfig.pictureStruct = (isProgressive ? NV_ENC_PIC_STRUCT_FRAME : 0); pFrameQueue->init(encodeConfig.width, encodeConfig.height); VideoEncoder* pEncoder = new VideoEncoder(ctxLock); assert(pEncoder->GetHWEncoder()); nvStatus = pEncoder->GetHWEncoder()->Initialize(cudaCtx, NV_ENC_DEVICE_TYPE_CUDA); if (nvStatus != NV_ENC_SUCCESS) return 1; encodeConfig.presetGUID = pEncoder->GetHWEncoder()->GetPresetGUID(encodeConfig.encoderPreset, encodeConfig.codec); printf("Encoding input : \"%s\"\n", encodeConfig.inputFileName); printf(" output : \"%s\"\n", encodeConfig.outputFileName); printf(" codec : \"%s\"\n", encodeConfig.codec == NV_ENC_HEVC ? "HEVC" : "H264"); printf(" size : %dx%d\n", encodeConfig.width, encodeConfig.height); printf(" bitrate : %d bits/sec\n", encodeConfig.bitrate); printf(" vbvMaxBitrate : %d bits/sec\n", encodeConfig.vbvMaxBitrate); printf(" vbvSize : %d bits\n", encodeConfig.vbvSize); printf(" fps : %d frames/sec\n", encodeConfig.fps); printf(" rcMode : %s\n", encodeConfig.rcMode == NV_ENC_PARAMS_RC_CONSTQP ? "CONSTQP" : encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR ? "VBR" : encodeConfig.rcMode == NV_ENC_PARAMS_RC_CBR ? "CBR" : encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR_MINQP ? 
"VBR MINQP (deprecated)" : encodeConfig.rcMode == NV_ENC_PARAMS_RC_CBR_LOWDELAY_HQ ? "CBR_LOWDELAY_HQ" : encodeConfig.rcMode == NV_ENC_PARAMS_RC_CBR_HQ ? "CBR_HQ" : encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR_HQ ? "VBR_HQ" : "UNKNOWN"); if (encodeConfig.gopLength == NVENC_INFINITE_GOPLENGTH) printf(" goplength : INFINITE GOP \n"); else printf(" goplength : %d \n", encodeConfig.gopLength); printf(" B frames : %d \n", encodeConfig.numB); printf(" QP : %d \n", encodeConfig.qp); printf(" preset : %s\n", (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_HQ_GUID) ? "LOW_LATENCY_HQ" : (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_HP_GUID) ? "LOW_LATENCY_HP" : (encodeConfig.presetGUID == NV_ENC_PRESET_HQ_GUID) ? "HQ_PRESET" : (encodeConfig.presetGUID == NV_ENC_PRESET_HP_GUID) ? "HP_PRESET" : (encodeConfig.presetGUID == NV_ENC_PRESET_LOSSLESS_HP_GUID) ? "LOSSLESS_HP" : "LOW_LATENCY_DEFAULT"); printf("\n"); nvStatus = pEncoder->GetHWEncoder()->CreateEncoder(&encodeConfig); if (nvStatus != NV_ENC_SUCCESS) return 1; nvStatus = pEncoder->AllocateIOBuffers(&encodeConfig); if (nvStatus != NV_ENC_SUCCESS) return 1; unsigned long long lStart, lEnd, lFreq; NvQueryPerformanceCounter(&lStart); //start decoding thread #ifdef _WIN32 HANDLE decodeThread = CreateThread(NULL, 0, DecodeProc, (LPVOID)pDecoder, 0, NULL); #else pthread_t pid; pthread_create(&pid, NULL, DecodeProc, (void*)pDecoder); #endif //start encoding thread int frmProcessed = 0; int frmActual = 0; while(!(pFrameQueue->isEndOfDecode() && pFrameQueue->isEmpty()) ) { CUVIDPARSERDISPINFO pInfo; if(pFrameQueue->dequeue(&pInfo)) { CUdeviceptr dMappedFrame = 0; unsigned int pitch; CUVIDPROCPARAMS oVPP = { 0 }; oVPP.progressive_frame = pInfo.progressive_frame; oVPP.second_field = 0; oVPP.top_field_first = pInfo.top_field_first; oVPP.unpaired_field = (pInfo.progressive_frame == 1 || pInfo.repeat_first_field <= 1); cuvidMapVideoFrame(pDecoder->GetDecoder(), pInfo.picture_index, &dMappedFrame, &pitch, &oVPP); 
EncodeFrameConfig stEncodeConfig = { 0 }; NV_ENC_PIC_STRUCT picType = (pInfo.progressive_frame || pInfo.repeat_first_field >= 2 ? NV_ENC_PIC_STRUCT_FRAME : (pInfo.top_field_first ? NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM : NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP)); stEncodeConfig.dptr = dMappedFrame; stEncodeConfig.pitch = pitch; stEncodeConfig.width = encodeConfig.width; stEncodeConfig.height = encodeConfig.height; int dropOrDuplicate = MatchFPS(fpsRatio, frmProcessed, frmActual); for (int i = 0; i <= dropOrDuplicate; i++) { pEncoder->EncodeFrame(&stEncodeConfig, picType); frmActual++; } frmProcessed++; cuvidUnmapVideoFrame(pDecoder->GetDecoder(), dMappedFrame); pFrameQueue->releaseFrame(&pInfo); } } pEncoder->EncodeFrame(NULL, NV_ENC_PIC_STRUCT_FRAME, true); #ifdef _WIN32 WaitForSingleObject(decodeThread, INFINITE); #else pthread_join(pid, NULL); #endif if (pEncoder->GetEncodedFrames() > 0) { NvQueryPerformanceCounter(&lEnd); NvQueryPerformanceFrequency(&lFreq); double elapsedTime = (double)(lEnd - lStart)/(double)lFreq; printf("Total time: %fms, Decoded Frames: %d, Encoded Frames: %d, Average FPS: %f\n", elapsedTime * 1000, pDecoder->m_decodedFrames, pEncoder->GetEncodedFrames(), (float)pEncoder->GetEncodedFrames() / elapsedTime); } pEncoder->Deinitialize(); delete pDecoder; delete pEncoder; delete pFrameQueue; cuvidCtxLockDestroy(ctxLock); __cu(cuCtxDestroy(cudaCtx)); return 0; } 下面這個是我的主要流程精簡版 int main(int argc, char* argv[]) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) typedef HMODULE CUDADRIVER; #else typedef void *CUDADRIVER; #endif CUDADRIVER hHandleDriver = 0; __cu(cuInit(0, __CUDA_API_VERSION, hHandleDriver));//初始化cuda環境,必須的 __cu(cuvidInit(0)); //初始化解碼器 //init cuda CUcontext cudaCtx; CUdevice device; __cu(cuDeviceGet(&device, deviceID)); //得到顯示卡操作物件,deviceID是顯示卡的id,一般說來如果一張顯示卡,id就是0,兩張就是0,1 __cu(cuCtxCreate(&cudaCtx, CU_CTX_SCHED_AUTO, device)); //建立對應顯示卡的執行環境 CUcontext curCtx; CUvideoctxlock ctxLock; 
__cu(cuCtxPopCurrent(&curCtx));//彈出當前CPU執行緒的裡面的可用的cuda環境,也就是上面建立的環境 __cu(cuvidCtxLockCreate(&ctxLock, curCtx));//為gpu上鎖 CudaDecoder* pDecoder = new CudaDecoder;//建立cuda解碼物件(重點檢視) FrameQueue* pFrameQueue = new CUVIDFrameQueue(ctxLock);//建立解碼輸出佇列 pDecoder->InitVideoDecoder(encodeConfig.inputFileName, ctxLock, pFrameQueue, encodeConfig.width, encodeConfig.height);//初始化解碼器(重點檢視) pFrameQueue->init(encodeConfig.width, encodeConfig.height);//初始化解碼輸出佇列 //啟動解碼執行緒 #ifdef _WIN32 HANDLE decodeThread = CreateThread(NULL, 0, DecodeProc, (LPVOID)pDecoder, 0, NULL); #else pthread_t pid; pthread_create(&pid, NULL, DecodeProc, (void*)pDecoder); #endif //start encoding thread int frmProcessed = 0; int frmActual = 0; //從解碼輸出佇列裡面拉取解出來的資料 while(!(pFrameQueue->isEndOfDecode() && pFrameQueue->isEmpty()) ) { CUVIDPARSERDISPINFO pInfo; if(pFrameQueue->dequeue(&pInfo)) { CUdeviceptr dMappedFrame = 0; unsigned int pitch; CUVIDPROCPARAMS oVPP = { 0 }; oVPP.progressive_frame = pInfo.progressive_frame; oVPP.second_field = 0; oVPP.top_field_first = pInfo.top_field_first; oVPP.unpaired_field = (pInfo.progressive_frame == 1 || pInfo.repeat_first_field <= 1); //獲取資料在GPU中的地址dMappedFrame,大小為pitch個 cuvidMapVideoFrame(pDecoder->GetDecoder(), pInfo.picture_index, &dMappedFrame, &pitch, &oVPP); //因為解碼後的資料地址還是在GPU中,所有需要找到 unsigned int nv12_size = pitch * (pDecoder->iHeight + pDecoder->iHeight/2); // 12bpp //從GPU記憶體拷貝到pa->pFrameBuffer(CPU的記憶體地址) oResult = cuMemcpyDtoH(pa->pFrameBuffer, dMappedFrame, nv12_size); //釋放GPU中的記憶體 cuvidUnmapVideoFrame(pDecoder->GetDecoder(), dMappedFrame); pFrameQueue->releaseFrame(&pInfo); } } #ifdef _WIN32 WaitForSingleObject(decodeThread, INFINITE); #else pthread_join(pid, NULL); #endif delete pDecoder; delete pFrameQueue; cuvidCtxLockDestroy(ctxLock); __cu(cuCtxDestroy(cudaCtx)); return 0; } 其中的解碼器的流程呼叫是重點關注的 new解碼器 View Code 初始化解碼器,這裡建立了三個物件,一個是源,一個是解碼器,一個是解析器, View Code 源物件載入資料後會回撥,裡面有CUVIDSOURCEDATAPACKET格式的資料包,資料包會給解析器,解析器回傳資料給解碼器,解碼器把資料回傳給佇列,發往主執行緒 View Code 
看了以上流程,估計有一個大概的流程在心裡了, 必要的gpu初始化------》初始化解碼器,解析器,源直譯器------》執行-----》處理輸出資料 2.自己解碼器的呼叫對接 現在輪到我們自己的需求,我的需求就是實現那個ffmpeg的解碼GPU化,先看看官方文件 首先用這個必須有一些要求 複製程式碼 NVIDIA Video Codec SDK 8.0 System Requirements * NVIDIA Kepler/Maxwell/Pascal GPU with hardware video accelerators - Refer to the NVIDIA Video SDK developer zone web page (https://developer.nvidia.com/nvidia-video-codec-sdk) for GPUs which support encoding and decoding acceleration. * Windows: Driver version 378.66 or higher * Linux: Driver version 378.13 or higher * CUDA 7.5 Toolkit (optional) [Windows Configuration Requirements] - DirectX SDK is needed. You can download the latest SDK from Microsoft's DirectX website - The CUDA 7.5 Toolkit is optional to install (see below on how to get it) - CUDA toolkit is used for building CUDA kernels that can interop with NVENC. The following environment variables need to be set to build the sample applications included with the SDK * For Windows - DXSDK_DIR: pointing to the DirectX SDK root directory [Linux Configuration Requirements] * For Linux - X11 and OpenGL, GLUT, GLEW libraries for video playback and display - The CUDA 7.5 Toolkit is optional to install (see below on how to get it) - CUDA toolkit is used for building CUDA kernels that can interop with NVENC. 
我看下了我的linux基本滿足條件 驗證可行性 再看Using_FFmpeg_with_NVIDIA_GPU_Hardware_Acceleration.pdf裡面的提示可以直接編譯ffmpeg,使用它自帶的cuda解碼器來測試解碼,不過也是有要求的 對號入座,我用的是8.0,所以使用ffmpeg3.4 編譯 ./configure --enable-shared --enable-cuda --enable-cuvid --enable-nvenc --enable-nonfree --enable-libnpp --extra-cflags=-I/usr/local/cuda/include --extra-ldflags=-L/usr/local/cuda/lib64 --prefix=/home/user/mjl/algo/ffmpeg/build make -j 4(建議用四執行緒,八執行緒可能出現找不到的錯誤) 驗證 ffmpeg -y -hwaccel cuvid -c:v h264_cuvid -vsync 0 -i input.mp4 -vf scale_npp=1920:1072 -vcodec h264_nvenc output0.264 -vf scale_npp=1280:720 -vcodec h264_nvenc output1.264 報錯:Unknown decoder 'h264_cuvid' 注意一定要在超級管理員許可權下面執行,因為只有超級管理員才能訪問gpu 正常輸出了檔案,證明可行 關於它自帶的解碼器,我一直不是很瞭解,ffmpeg在初始化的時候統一註冊了各種編解碼器,但是如何在上層簡單的呼叫,一直不明白,這點可以大家交流 我這裡是自己直接對接,也便於控制資料 avformat_network_init(); av_register_all();//1.註冊各種編碼解碼模組,如果3.3及以上版本,裡面包含GPU解碼模組 std::string tempfile = "xxxx";//視訊流地址 avformat_find_stream_info(format_context_, nullptr)//2.拉取一小段資料流分析,便於得到資料的基本格式 if (AVMEDIA_TYPE_VIDEO == enc->codec_type && video_stream_index_ < 0)//3.篩選出視訊流 codec_ = avcodec_find_decoder(enc->codec_id);//4.找到對應的解碼器 codec_context_ = avcodec_alloc_context3(codec_);//5.建立解碼器對應的結構體 av_read_frame(format_context_, &packet_); //6.讀取資料包 avcodec_send_packet(codec_context_, &packet_) //7.發出解碼 avcodec_receive_frame(codec_context_, yuv_frame_) //8.接收解碼 sws_scale(y2r_sws_context_, yuv_frame_->data, yuv_frame_->linesize, 0, codec_context_->height, rgb_data_, rgb_line_size_) //9.資料格式轉換 在第一節中說過,4,7,8,9步驟需要修改 資料還是由ffmpeg拉取,也就是說不需要cuda自帶的源獲取器,只需要對接解碼器和解析器(如果拉取資料也可以用GPU會更好) 而在ffmpeg中出來的資料格式是AVPacket,而cuda解碼器需要的格式是CUVIDSOURCEDATAPACKET,所以涉及到格式的轉換 開始的時候我在網上資料發現一個 https://www.cnblogs.com/dwdxdy/archive/2013/08/07/3244723.html 這位兄弟的格式轉換部分是這樣實現的 我試過,不行的,沒有任何解碼輸出! https://www.cnblogs.com/betterwgo/p/6613641.html 這位兄弟比較全面,但是其中的 View Code 這部分程式碼比較陳舊,還是沒能正常執行起來,不過很敬佩這兄弟,能分享到這一步,已經很不錯了! 
這是我在他的基礎上修改的程式碼,沒有用他的下面這種方式 //h264bsfc = av_bitstream_filter_init("h264_mp4toannexb"); //av_bsf_alloc(av_bsf_get_by_name("h264_mp4toannexb"), &bsf); 改用了av_bsf_send_packet和av_bsf_receive_packet方式,下面是我的程式碼 View Code 於是,解碼部分就已經實現,有空再貼出全部原始碼。