ffmpeg綜合應用示例（四）——攝像頭直播的視音訊同步

阿新 • • 發佈：2019-01-18

在ffmpeg超詳細綜合教程——攝像頭直播文中完成了一個讀取PC攝像頭視訊資料並以RTMP協議傳送為直播流的示例，但是並沒有實現對音訊的支援，所以在這篇文章中對該示例做進一步的完善並且詳細分析直播流的視音訊同步問題，同樣，也會給出程式碼示例。

對於直播流來說，這裡只考慮傳送端的同步問題，而其中的原理其實很簡單，概括起來分為如下幾個步驟：

1、解析視音訊流，將視訊流和音訊流的時間戳用同樣的時間基準表示

2、比較轉換後的兩個時間戳，找出較小值，對應傳送偏慢的流

3、讀取、轉碼、傳送相應的流，同時，若該流的轉碼時間很快，超前於wall clock，則還需要進行相應的延時

4、迴圈重複以上過程

本文的程式碼是在此前文章的基礎上做的修改，主要是兩大部分，一是音訊轉碼的內容，二是視音訊同步的內容。

音訊轉碼的基本流程

首先是一些音訊輸入輸出的基本設定，非常簡單和常見，如下

// Open the audio input; device_name_a must hold the name of your own audio device
    /* A non-zero result from avformat_open_input() means the input could not be opened. */
    if (0 != avformat_open_input(&ifmt_ctx_a, device_name_a, ifmt, &device_param))
    {
        printf("Couldn't open input audio stream.（無法開啟輸入流）\n");
        return -1;
    }
……
//input audio initialize
    /* Probe the opened audio input so its stream list and codec parameters are filled in. */
    if (avformat_find_stream_info(ifmt_ctx_a, NULL) < 0)
    {
        printf("Couldn't find audio stream information.（無法獲取流資訊）\n");
        return -1;
    }

    /* Pick the first stream whose codec type is audio; audioindex stays -1 if none exists. */
    audioindex = -1;
    for (i = 0; i < ifmt_ctx_a->nb_streams; i++)
    {
        if (ifmt_ctx_a->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO)
        {
            audioindex = i;
            break;
        }
    }
    if (audioindex == -1)
    {
        /* This is the audio path, so both halves of the message must name the audio stream. */
        printf("Couldn't find a audio stream.（沒有找到音訊流）\n");
        return -1;
    }

    /* Open the decoder that matches the codec id of the selected audio stream. */
    if (avcodec_open2(ifmt_ctx_a->streams[audioindex]->codec, avcodec_find_decoder(ifmt_ctx_a->streams[audioindex]->codec->codec_id), NULL) < 0)
    {
        printf("Could not open audio codec.（無法開啟解碼器）\n");
        return -1;
    }
……
 //output audio encoder initialize
    /* Look up the AAC encoder used for the outgoing audio stream. */
    pCodec_a = avcodec_find_encoder(AV_CODEC_ID_AAC);
    if (!pCodec_a){
        printf("Can not find output audio encoder! (沒有找到合適的編碼器！)\n");
        return -1;
    }
    pCodecCtx_a = avcodec_alloc_context3(pCodec_a);
    /* The context is dereferenced right below, so an allocation failure must be caught here. */
    if (!pCodecCtx_a){
        printf("Could not allocate output audio encoder context!\n");
        return -1;
    }
    /* Stereo output at the input's sample rate, using the encoder's first listed sample format. */
    pCodecCtx_a->channels = 2;
    pCodecCtx_a->channel_layout = av_get_default_channel_layout(2);
    pCodecCtx_a->sample_rate = ifmt_ctx_a->streams[audioindex]->codec->sample_rate;
    pCodecCtx_a->sample_fmt = pCodec_a->sample_fmts[0];
    pCodecCtx_a->bit_rate = 32000;
    /* One tick of the encoder time base corresponds to one audio sample. */
    pCodecCtx_a->time_base.num = 1;
    pCodecCtx_a->time_base.den = pCodecCtx_a->sample_rate;
    /** Allow the use of the experimental AAC encoder */
    pCodecCtx_a->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
    /* Some formats want stream headers to be separate. */
    if (ofmt_ctx->oformat->flags & AVFMT_GLOBALHEADER)
        pCodecCtx_a->flags |= CODEC_FLAG_GLOBAL_HEADER;
    if (avcodec_open2(pCodecCtx_a, pCodec_a, NULL) < 0){
        printf("Failed to open ouput audio encoder! (編碼器開啟失敗！)\n");
        return -1;
    }

    //Add a new stream to output,should be called by the user before avformat_write_header() for muxing
    audio_st = avformat_new_stream(ofmt_ctx, pCodec_a);
    if (audio_st == NULL){
        printf("Could not create output audio stream!\n");
        return -1;
    }
    audio_st->time_base.num = 1;
    audio_st->time_base.den = pCodecCtx_a->sample_rate;
    /*
     * NOTE(review): this replaces whatever codec context avformat_new_stream()
     * attached to the stream - confirm the previous one does not need freeing.
     */
    audio_st->codec = pCodecCtx_a;

接下來，考慮到輸入音訊的sample format可能需要進行轉換，則需要用到swresample庫的功能

首先做好相應的初始化

// Initialize the resampler to be able to convert audio sample formats
	aud_convert_ctx = swr_alloc_set_opts(NULL,
		av_get_default_channel_layout(pCodecCtx_a->channels),
		pCodecCtx_a->sample_fmt,
		pCodecCtx_a->sample_rate,
		av_get_default_channel_layout(ifmt_ctx_a->streams[audioindex]->codec->channels),
		ifmt_ctx_a->streams[audioindex]->codec->sample_fmt,
		ifmt_ctx_a->streams[audioindex]->codec->sample_rate,
		0, NULL);
swr_init(aud_convert_ctx);

此外，我參照transcode_aac.c的做法，使用FIFO buffer儲存從輸入端解碼得到的音訊取樣資料，這些資料在後續將被轉換sample format並進行編碼，由此即完成了一個音訊轉碼功能，與前面文章中的視訊轉碼還是比較類似的。

此外，還需要另外一個buffer來儲存轉換格式之後的音訊資料。

//Initialize the FIFO buffer to store audio samples to be encoded. 
    AVAudioFifo *fifo = NULL;
	/* Start with room for a single sample; the FIFO is grown before each write. */
	fifo = av_audio_fifo_alloc(pCodecCtx_a->sample_fmt, pCodecCtx_a->channels, 1);
	if (!fifo) {
		printf("Could not allocate FIFO\n");
		return AVERROR(ENOMEM);
	}

	//Initialize the buffer to store converted samples to be encoded.
	uint8_t **converted_input_samples = NULL;
	/**
	* Allocate as many pointers as there are audio channels.
	* Each pointer will later point to the audio samples of the corresponding
	* channels (although it may be NULL for interleaved formats).
	*
	* The element size must be that of one pointer (uint8_t *), i.e.
	* sizeof(*converted_input_samples). Using sizeof(**converted_input_samples)
	* would reserve only one byte per channel and overflow the array as soon
	* as the channel pointers are stored.
	*/
	if (!(converted_input_samples = (uint8_t**)calloc(pCodecCtx_a->channels,
		sizeof(*converted_input_samples)))) {
		printf("Could not allocate converted input sample pointers\n");
		return AVERROR(ENOMEM);
	}

至此，一些基本的初始化工作就完成了，現在我們先不看視音訊同步的內容，只看音訊轉碼的部分。程式碼中出現的幾個變數可以先忽略不看，即aud_next_pts、vid_next_pts和encode_audio這三個變數。

看過我的視訊直播教程文章的朋友應該會發現這裡計算pts的方法和那裡類似。即先通過sample rate算出每兩個音訊sample之間的時間間隔，再通過計數當前已編碼的音訊sample總數（nb_samples變數的作用）來算出當前編碼音訊幀的時間戳。

如果和視訊的流程做一個類比的話，大概是下面這個關係：framerate相當於sample rate；framecnt相當於nb_samples。

同時也能看到，這裡的延時方法和之前的方法不一樣；和上面提到的幾個變數一樣，我們暫且不管這裡，先專心學習音訊轉碼的基本流程。

//audio transcoding here
        /*
         * Transcode one encoder frame worth of audio:
         *   1. read + decode input packets, convert them to the encoder's sample
         *      format and queue them in the FIFO until one output frame is available;
         *   2. pull one frame from the FIFO, encode it, timestamp it and write it.
         * Every error path releases the frame/sample buffers it owns before returning.
         */
        const int output_frame_size = pCodecCtx_a->frame_size;
        /* Becomes 1 once the input hit EOF and the decoder has no buffered frames left. */
        int audio_input_finished = 0;

        /**
        * Make sure that there is one frame worth of samples in the FIFO
        * buffer so that the encoder can do its work.
        * Since the decoder's and the encoder's frame size may differ, we
        * need to FIFO buffer to store as many frames worth of input samples
        * that they make up at least one frame worth of output samples.
        */
        while (av_audio_fifo_size(fifo) < output_frame_size) {
            /**
            * Decode one frame worth of audio samples, convert it to the
            * output sample format and put it into the FIFO buffer.
            */
            AVFrame *input_frame = av_frame_alloc();
            if (!input_frame)
            {
                ret = AVERROR(ENOMEM);
                return ret;
            }

            /** Packet used for temporary storage. */
            AVPacket input_packet;
            av_init_packet(&input_packet);
            input_packet.data = NULL;
            input_packet.size = 0;

            /** Read one audio frame from the input file into a temporary packet. */
            if ((ret = av_read_frame(ifmt_ctx_a, &input_packet)) < 0) {
                /** If we are at the end of the file, flush the decoder below. */
                if (ret == AVERROR_EOF)
                {
                    encode_audio = 0;
                }
                else
                {
                    printf("Could not read audio frame\n");
                    av_frame_free(&input_frame);
                    return ret;
                }
            }

            /**
            * Decode the audio frame stored in the temporary packet.
            * The input audio stream decoder is used to do this.
            * If we are at the end of the file, pass an empty packet to the decoder
            * to flush it.
            */
            if ((ret = avcodec_decode_audio4(ifmt_ctx_a->streams[audioindex]->codec, input_frame,
                &dec_got_frame_a, &input_packet)) < 0) {
                printf("Could not decode audio frame\n");
                av_packet_unref(&input_packet);
                av_frame_free(&input_frame);
                return ret;
            }
            av_packet_unref(&input_packet);

            /** If there is decoded data, convert and store it */
            if (dec_got_frame_a) {
                /**
                * Allocate memory for the samples of all channels in one consecutive
                * block for convenience. The block is owned by converted_input_samples[0]
                * and is released again at the end of this iteration.
                */
                if ((ret = av_samples_alloc(converted_input_samples, NULL,
                    pCodecCtx_a->channels,
                    input_frame->nb_samples,
                    pCodecCtx_a->sample_fmt, 0)) < 0) {
                    printf("Could not allocate converted input samples\n");
                    av_frame_free(&input_frame);
                    return ret;
                }

                /**
                * Convert the input samples to the desired output sample format.
                * This requires a temporary storage provided by converted_input_samples.
                */
                if ((ret = swr_convert(aud_convert_ctx,
                    converted_input_samples, input_frame->nb_samples,
                    (const uint8_t**)input_frame->extended_data, input_frame->nb_samples)) < 0) {
                    printf("Could not convert input samples\n");
                    av_freep(&converted_input_samples[0]);
                    av_frame_free(&input_frame);
                    return ret;
                }

                /**
                * Make the FIFO as large as it needs to be to hold both,
                * the old and the new samples.
                */
                if ((ret = av_audio_fifo_realloc(fifo, av_audio_fifo_size(fifo) + input_frame->nb_samples)) < 0) {
                    printf("Could not reallocate FIFO\n");
                    av_freep(&converted_input_samples[0]);
                    av_frame_free(&input_frame);
                    return ret;
                }

                /** Store the new samples in the FIFO buffer. */
                if (av_audio_fifo_write(fifo, (void **)converted_input_samples,
                    input_frame->nb_samples) < input_frame->nb_samples) {
                    printf("Could not write data to FIFO\n");
                    av_freep(&converted_input_samples[0]);
                    av_frame_free(&input_frame);
                    return AVERROR_EXIT;
                }

                /* The FIFO keeps its own copy, so the conversion buffer can go. */
                av_freep(&converted_input_samples[0]);
            }
            else if (!encode_audio) {
                /*
                 * EOF was reached and flushing the decoder produced nothing more:
                 * the FIFO can never fill up, so stop instead of spinning forever.
                 */
                audio_input_finished = 1;
            }

            av_frame_free(&input_frame);

            if (audio_input_finished)
                break;
        }

        /**
        * If we have enough samples for the encoder, we encode them.
        * At the end of the file, we pass the remaining samples to
        * the encoder.
        */
        if (av_audio_fifo_size(fifo) >= output_frame_size ||
            (audio_input_finished && av_audio_fifo_size(fifo) > 0))
            /**
            * Take one frame worth of audio samples from the FIFO buffer,
            * encode it and write it to the output file.
            */
        {
            /** Temporary storage of the output samples of the frame written to the file. */
            AVFrame *output_frame = av_frame_alloc();
            if (!output_frame)
            {
                ret = AVERROR(ENOMEM);
                return ret;
            }
            /**
            * Use the maximum number of possible samples per frame.
            * If there is less than the maximum possible frame size in the FIFO
            * buffer use this number. Otherwise, use the maximum possible frame size
            */
            const int frame_size = FFMIN(av_audio_fifo_size(fifo),
                pCodecCtx_a->frame_size);

            /**
            * Set the frame's parameters, especially its size and format.
            * av_frame_get_buffer needs this to allocate memory for the
            * audio samples of the frame.
            * Default channel layouts based on the number of channels
            * are assumed for simplicity.
            */
            output_frame->nb_samples = frame_size;
            output_frame->channel_layout = pCodecCtx_a->channel_layout;
            output_frame->format = pCodecCtx_a->sample_fmt;
            output_frame->sample_rate = pCodecCtx_a->sample_rate;

            /**
            * Allocate the samples of the created frame. This call will make
            * sure that the audio frame can hold as many samples as specified.
            */
            if ((ret = av_frame_get_buffer(output_frame, 0)) < 0) {
                printf("Could not allocate output frame samples\n");
                av_frame_free(&output_frame);
                return ret;
            }

            /**
            * Read as many samples from the FIFO buffer as required to fill the frame.
            * The samples are stored in the frame temporarily.
            */
            if (av_audio_fifo_read(fifo, (void **)output_frame->data, frame_size) < frame_size) {
                printf("Could not read data from FIFO\n");
                av_frame_free(&output_frame);
                return AVERROR_EXIT;
            }

            /** Packet used for temporary storage. */
            AVPacket output_packet;
            av_init_packet(&output_packet);
            output_packet.data = NULL;
            output_packet.size = 0;

            /** Count the samples handed to the encoder; timestamps are derived from this total. */
            nb_samples += output_frame->nb_samples;

            /**
            * Encode the audio frame and store it in the temporary packet.
            * The output audio stream encoder is used to do this.
            */
            if ((ret = avcodec_encode_audio2(pCodecCtx_a, &output_packet,
                output_frame, &enc_got_frame_a)) < 0) {
                printf("Could not encode frame\n");
                av_packet_unref(&output_packet);
                av_frame_free(&output_frame);
                return ret;
            }

            /** Write one audio frame from the temporary packet to the output file. */
            if (enc_got_frame_a) {

                /* NOTE(review): assumes the audio stream is output stream #1 - confirm against stream creation order. */
                output_packet.stream_index = 1;

                AVRational time_base = ofmt_ctx->streams[1]->time_base;
                /* One tick per encoded sample. */
                AVRational sample_time_base = { 1, pCodecCtx_a->sample_rate };

                /*
                 * Same idea as nb_samples * calc_duration (sample count times the
                 * duration of one sample), but rescaled exactly. Rounding the
                 * per-sample duration to whole AV_TIME_BASE ticks first would
                 * accumulate an error that grows with every sample.
                 */
                output_packet.pts = av_rescale_q(nb_samples, sample_time_base, time_base);
                output_packet.dts = output_packet.pts;
                /* The duration must be expressed in the stream time base, like pts/dts. */
                output_packet.duration = av_rescale_q(output_frame->nb_samples, sample_time_base, time_base);

                /* Audio progress in time_base_q units, compared against vid_next_pts for sync. */
                aud_next_pts = av_rescale_q(nb_samples, sample_time_base, time_base_q);

                int64_t pts_time = av_rescale_q(output_packet.pts, time_base, time_base_q);
                int64_t now_time = av_gettime() - start_time;

                /*
                 * If encoding ran ahead of the wall clock, wait - but only when the
                 * wait would not push audio past the current video progress.
                 */
                if ((pts_time > now_time) && ((aud_next_pts + pts_time - now_time) < vid_next_pts))
                    av_usleep(pts_time - now_time);

                if ((ret = av_interleaved_write_frame(ofmt_ctx, &output_packet)) < 0) {
                    printf("Could not write frame\n");
                    av_packet_unref(&output_packet);
                    av_frame_free(&output_frame);
                    return ret;
                }

                av_packet_unref(&output_packet);
            }
            av_frame_free(&output_frame);
        }

視音訊的同步

現在我們來正式看看如何做視音訊的同步，首先我們定義幾個變數

<span style="white-space:pre">	</span>int aud_next_pts = 0;//視訊流目前的pts,可以理解為目前的進度
	int vid_next_pts = 0;//音訊流目前的pts
	int encode_video = 1, encode_audio = 1;//是否要編碼視訊、音訊

則相應的視音訊同步方法如下。

1、首先確定視訊、音訊二者中至少有一個是需要進行轉碼的，

2、比較兩個流的進度，使用av_compare_ts函式，注意：此時的vid_next_pts和aud_next_pts的time base都是ffmpeg內部基準，即

// Common time base of 1/AV_TIME_BASE seconds per tick; vid_next_pts and aud_next_pts are both expressed in it.
AVRational time_base_q = { 1, AV_TIME_BASE };

3、對進度落後的流進行轉碼，並相應地對進度進行更新。對於視訊，有 vid_next_pts=framecnt*calc_duration;，對於音訊，有 aud_next_pts = nb_samples*calc_duration;這裡framecnt和nb_samples都相當於計數器，而calc_duration是對應流每兩個frame或sample之間的時間間隔，也是以ffmpeg內部時間基準為單位的。
4、若轉碼進度很快完成，則不能急於寫入輸出流，而是需要先進行延時，但是也要確定延時後的時間不會超過另一個流的進度

綜上，流程如下

 //start decode and encode
    /* Wall-clock reference: packet timestamps are compared against the time elapsed since here. */
    int64_t start_time = av_gettime();
    /* Keep going while at least one of the two streams still has to be transcoded. */
    while (encode_video || encode_audio)
    {
        /*
         * Serve video when it is the only stream left, or when its progress is
         * not ahead of the audio progress (both in time_base_q units).
         */
        if (encode_video &&
            (!encode_audio || av_compare_ts(vid_next_pts, time_base_q,
            aud_next_pts, time_base_q) <= 0))
        {
            /* ... transcode one video frame here ... */
            /* ... once transcoding is done: ... */
            vid_next_pts = framecnt*calc_duration; //general timebase

            //Delay
            /* Wait only if we are ahead of the wall clock and the wait keeps video behind audio. */
            int64_t pts_time = av_rescale_q(enc_pkt.pts, time_base, time_base_q);
            int64_t now_time = av_gettime() - start_time;
            if ((pts_time > now_time) && ((vid_next_pts + pts_time - now_time) < aud_next_pts))
                av_usleep(pts_time - now_time);
            /* ... write the video packet to the output stream here ... */
        }
        else
        {
            /* ... transcode one audio frame here ... */
            /* ... once transcoding is done: ... */
            aud_next_pts = nb_samples*calc_duration;

            /* Wait only if we are ahead of the wall clock and the wait keeps audio behind video. */
            int64_t pts_time = av_rescale_q(output_packet.pts, time_base, time_base_q);
            int64_t now_time = av_gettime() - start_time;
            if ((pts_time > now_time) && ((aud_next_pts + pts_time - now_time) < vid_next_pts))
                av_usleep(pts_time - now_time);
            /* ... write the audio packet to the output stream here ... */
        }
    }

至此，即完成了視音訊的同步。最後再完成一些flush encoder的工作即可。

此外，還有一個坑，在使用dshow裝置推流時，經常會報出real time buffer too full dropping frames的錯誤資訊，其原因在這篇文章裡有寫到，可以通過新增rtbufsize引數來解決，位元速率越高對應的rtbufsize就需要越高，但過高的rtbufsize會帶來視訊的延時，若要保持同步，可能就需要對音訊人為增加一定的延時。而根據我的測試，即使不新增rtbufsize引數，雖然會報出錯誤資訊，但並不影響直播流的觀看或錄製，而且可以保持同步。這就是一個trade off的問題了。

最後，本專案完整原始碼github地址。歡迎指出錯誤並一起交流討論。

各位看官，如果您覺得本人的部落格對您有所幫助，可以掃描如下二維碼進行打賞，打賞多少您隨意~

ffmpeg綜合應用示例（四）——攝像頭直播的視音訊同步

音訊轉碼的基本流程

視音訊的同步

ffmpeg綜合應用示例（四）——攝像頭直播的視音訊同步

Qt移動應用開發（四）：應用粒子特效

Docker入門與應用系列（四）網絡管理

Qt與FFmpeg聯合開發指南（四）——編碼（2）：完善功能和基礎封裝

Linux shell腳本示例（四）

[原創]分布式系統之緩存的微觀應用經驗談（四）【交互場景篇】

Zookeeper C API應用示例（3）——配置管理（非同步API）

Zookeeper應用示例（2）——叢集管理

Zookeeper C API應用示例（1）——配置管理（同步API）

[原創]分散式系統之快取的微觀應用經驗談（四）【互動場景篇】

《大話設計模式》Java程式碼示例（四）之代理模式

redis學習之應用示例（六）

Linux應用隨筆（四）檔案換行問題解決方案

Spring4的知識應用總結（四）——Bean的生命週期和註解方式配置

Android系統應用開發（四）系統語言以及新增字型庫

編寫你的應用程式（四）、音訊

prometheus+grafana構建應用監控（四）

神經網路（四）：應用示例之分類

C#基礎知識-流程控制的應用（四）

性能測試（四）應用領域

ffmpeg綜合應用示例（四）——攝像頭直播的視音訊同步

音訊轉碼的基本流程

視音訊的同步

相關推薦