說話人識別中的VAD
分享一下我老師大神的人工智慧教程!零基礎,通俗易懂!http://blog.csdn.net/jiangjunshow
也歡迎大家轉載本篇文章。分享知識,造福人民,實現我們中華民族偉大復興!
本文根據kaldi中的vad的演算法 kaldi/src/ivector/voice-activity-detection.cc以及網上的一些資源來總結一下這個知識點。
首先VAD的全稱是:Voice Activity Detection(語音啟用檢測),常用於語音通訊中檢測是否存在語音,以便在靜音段停止傳輸、
避免頻寬資源的浪費,這裡我們只討論在說話人識別中需要區分背景噪音來構建UBM模型。
下面直接看kaldi的原始碼,注意看註釋
run.sh中呼叫下面的compute_vad_decision.sh
Usage: $0 [options] <data-dir> <log-dir> <path-to-vad-dir>
[plain]
- sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
- data/train exp/make_vad $vaddir
Usage: compute-vad [options] <feats-rspecifier> <vad-wspecifier>
輸入的是每一個feats檔案,由於上邊的nj是40,所以這裡JOB取值為1~40,輸入mfcc.ark,輸出vad.ark
compute-vad --config=$vad_config scp:$sdata/JOB/feats.scp ark,scp:$vaddir/vad_${name}.JOB.ark,$vaddir/vad_${name}.JOB.scp
compute-vad 是 kaldi/src/ivectorbin/compute-vad.cc,下面是 compute-vad.cc 中的邏輯:
- for (;!feat_reader.Done(); feat_reader.Next()) {
- #讀取每一句話
- std::string utt = feat_reader.Key();
- Matrix<BaseFloat> feat(feat_reader.Value());
- if (feat.NumRows() == 0) {
- KALDI_WARN << "Empty feature matrix for utterance " << utt;
- num_err++;
- continue;
- }
- #宣告一個vector, 維數 = 一句話的幀數
- Vector<BaseFloat> vad_result(feat.NumRows());
- #然後是計算vad,一個可選引數集合,mfcc的matrix, 返回的結果vertor, 看下一個的原始碼片段
- ComputeVadEnergy(opts, feat, &vad_result);
- double sum = vad_result.Sum();
- if (sum == 0.0) {
- KALDI_WARN << "No frames were judged voiced for utterance " << utt;
- num_unvoiced++;
- } else {
- num_done++;
- }
- tot_decision += vad_result.Sum();
- tot_length += vad_result.Dim();
- if (!(omit_unvoiced_utts && sum == 0)) {
- vad_writer.Write(utt, vad_result);
- }
- }
下面這個是計算vad結果的函式: kaldi / src / ivector / voice-activity-detection.cc
[cpp] view plain copy
- #include "ivector/voice-activity-detection.h"
- #include "matrix/matrix-functions.h"
- namespace kaldi {
- void ComputeVadEnergy(const VadEnergyOptions &opts,
- const MatrixBase<BaseFloat> &feats,
- Vector<BaseFloat> *output_voiced) {
- #feats是mfcc的特徵矩陣
- int32 T = feats.NumRows();
- output_voiced->Resize(T);
- if (T == 0) {
- KALDI_WARN << "Empty features";
- return;
- }
- #定義一個維度為T的vector
- Vector<BaseFloat> log_energy(T);
- #激昂feats的第0列as log_energy的value
- log_energy.CopyColFromMat(feats, 0); // column zero is log-energy.
- #讀取配置檔案中的噪聲的閾值: <span style="font-family: Menlo; font-size: 11px;">--vad-energy-threshold=5.5, 若小於這個值則為噪音,若大於則為語音訊號
- BaseFloat energy_threshold = opts.vad_energy_threshold;
- #讀取配置檔案中:
- if (opts.vad_energy_mean_scale != 0.0) {
- KALDI_ASSERT(opts.vad_energy_mean_scale > 0.0);
- energy_threshold += opts.vad_energy_mean_scale * log_energy.Sum() / T;
- }
- KALDI_ASSERT(opts.vad_frames_context >= 0);
- KALDI_ASSERT(opts.vad_proportion_threshold > 0.0 &&
- opts.vad_proportion_threshold < 1.0);
- for (int32 t = 0; t < T; t++) {
- const BaseFloat *log_energy_data = log_energy.Data();
- int32 num_count = 0, den_count = 0, context = opts.vad_frames_context;
- for (int32 t2 = t - context; t2 <= t + context; t2++) {
- if (t2 >= 0 && t2 < T) {
- den_count++;
- if (log_energy_data[t] > energy_threshold)
- num_count++;
- }
- }
- if (num_count >= den_count * opts.vad_proportion_threshold)
- (*output_voiced)(t) = 1.0;
- else
- (*output_voiced)(t) = 0.0;
- }
- }
- }
下面我將給出一個實際的計算過程的demo:
其中raw_mfcc_train1.txt 和 vad_train1.txt分別是在mfcc目錄下執行:
./../../../../src/bin/copy-vector ark:vad_train.1.ark ark,t:- > vad_train1.txt
./../../../../src/featbin/copy-feats ark:raw_mfcc_train.1.ark ark,t:- > raw_mfcc_train1.txt
import numpy as np


def read_feats(filename):
    """Parse a Kaldi text-format feature archive (copy-feats ark,t output).

    The text format looks like::

        utt_id  [
          v11 v12 ... v1d
          ...
          vT1 vT2 ... vTd ]

    Args:
        filename: path to the text-format archive.

    Returns:
        A list with one entry per utterance; each entry is a matrix
        represented as a list of frame rows (lists of floats).
    """
    all_xs = []  # one matrix (list of rows) per utterance
    arr = []     # rows accumulated for the current utterance
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(filename, 'r') as f:
        for line in f:
            # Header lines such as "utt_id  [" carry no feature values.
            if '[' in line:
                continue
            tokens = line.strip().split(' ')
            # A standalone ']' token marks the last row of the utterance;
            # like the original, drop the final token in that case.
            closing = ']' in tokens
            if closing:
                tokens = tokens[:-1]
            # float() instead of eval(): eval on file contents is a code
            # injection risk and far slower; values are plain floats.
            row = [float(tok) for tok in tokens if tok != '']
            arr.append(row)
            if closing:
                all_xs.append(arr)
                arr = []
    return all_xs
- mfcc_filename = 'raw_mfcc_train1.txt'
- all_feats = read_feats(mfcc_filename)
- vad_energy_threshold = 5.5
- vad_energy_mean_scale = 0.5
- vad_frames_context = 5
- vad_proportion_threshold =