程式人生 > thchs30 steps/make_mfcc.sh 詳解

thchs30 steps/make_mfcc.sh 詳解

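這個指令碼最多接受三個引數(後兩個可以省略),以 thchs30 為例:1. data/mfcc/train 2. exp/make_mfcc/train 3. mfcc/train
1. data/mfcc/train:資料預處理後產生的檔案所在目錄,內含 phone.txt、spk2utt、text、utt2spk、wav.scp、word.txt
2. exp/make_mfcc/train:儲存程式執行日誌的目錄(省略時預設為 <data-dir>/log)
3. mfcc/train:存放提取出的特徵檔案的目錄(省略時預設為 <data-dir>/data)
其中 1 是輸入目錄,2、3 是輸出目錄;此外指令碼最後也會把彙整後的 feats.scp 寫回 1 之中。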

#!/bin/bash

# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
#
# Extracts MFCC features for every utterance of a data directory, running
# $nj jobs in parallel through $cmd.
#
# Usage: make_mfcc.sh [options] <data-dir> [<log-dir> [<mfcc-dir>]]
#   <data-dir>  input directory; must contain wav.scp (and optionally
#               segments, spk2warp or utt2warp). feats.scp is written here.
#   <log-dir>   where per-job logs and split lists go
#               (default: <data-dir>/log).
#   <mfcc-dir>  where the raw_mfcc_<name>.<job>.{ark,scp} files are written
#               (default: <data-dir>/data).
#
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
nj=4
cmd=run.pl
mfcc_config=conf/mfcc.conf
compress=true
write_utt2num_frames=false  # if true writes utt2num_frames
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Load path.sh (if present) and parse_options.sh, which lets --nj, --cmd,
# etc. override the configuration variables above.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly one to three positional arguments are accepted.
if [ $# -lt 1 ] || [ $# -gt 3 ]; then
  echo "Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]";
  echo "e.g.: $0 data/train exp/make_mfcc/train mfcc"
  echo "Note: <log-dir> defaults to <data-dir>/log, and <mfccdir> defaults to <data-dir>/data"
  echo "Options: "
  echo "  --mfcc-config <config-file>                      # config passed to compute-mfcc-feats "
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --write-utt2num-frames <true|false>     # If true, write utt2num_frames file."
  exit 1;
fi

data=$1                  # e.g. data/mfcc/train
if [ $# -ge 2 ]; then
  logdir=$2              # e.g. exp/make_mfcc/train
else
  logdir=$data/log
fi
if [ $# -ge 3 ]; then
  mfccdir=$3             # e.g. mfcc/train
else
  mfccdir=$data/data
fi

# make $mfccdir an absolute pathname.
mfccdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' "$mfccdir" "${PWD}")

# use "name" as part of name of the archive.
name=$(basename "$data")   # e.g. data/mfcc/train -> train

# Create the feature and log directories.
mkdir -p "$mfccdir" || exit 1;
mkdir -p "$logdir" || exit 1;

# If a previous run left a feats.scp behind, move it into a backup directory.
if [ -f "$data/feats.scp" ]; then
  mkdir -p "$data/.backup"
  echo "$0: moving $data/feats.scp to $data/.backup"
  mv "$data/feats.scp" "$data/.backup"
fi

scp=$data/wav.scp          # list of audio inputs

# Check that wav.scp and the MFCC config file both exist.
# ($required is intentionally unquoted: it is a space-separated list.)
required="$scp $mfcc_config"

for f in $required; do
  if [ ! -f "$f" ]; then
    echo "make_mfcc.sh: no such file $f"
    exit 1;
  fi
done

# Let validate_data_dir.sh check the contents of $data (text and feats are
# not required at this stage).
utils/validate_data_dir.sh --no-text --no-feats $data || exit 1;

# Optional VTLN warp factors, per speaker or per utterance.
if [ -f "$data/spk2warp" ]; then
  echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
  vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
elif [ -f "$data/utt2warp" ]; then
  echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
  vtln_opts="--vtln-map=ark:$data/utt2warp"
fi

# One .ark archive (holding the MFCC features) per parallel job.
for n in $(seq $nj); do
  # the next command does nothing unless $mfccdir/storage/ exists, see
  # utils/create_data_link.pl for more info.
  utils/create_data_link.pl $mfccdir/raw_mfcc_$name.$n.ark
done

if $write_utt2num_frames; then
  write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
else
  write_num_frames_opt=
fi

# NOTE: $cmd, $vtln_opts and $write_num_frames_opt below are deliberately
# left unquoted; each may expand to several words (or to nothing).
if [ -f "$data/segments" ]; then
  # A segments file exists: split it across jobs and cut the segments out of
  # the recordings before computing features.
  echo "$0 [info]: segments file exists: using that."

  split_segments=""
  for n in $(seq $nj); do
    split_segments="$split_segments $logdir/segments.$n"
  done

  utils/split_scp.pl $data/segments $split_segments || exit 1;
  rm $logdir/.error 2>/dev/null

  $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
    extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
    compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- \| \
    copy-feats --compress=$compress $write_num_frames_opt ark:- \
      ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
     || exit 1;

else
  # No segments file: wav.scp is indexed directly by utterance, so split
  # wav.scp itself across the jobs.
  echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."
  split_scps=""
  for n in $(seq $nj); do
    # e.g. exp/make_mfcc/train/wav_train.1.scp
    split_scps="$split_scps $logdir/wav_${name}.$n.scp"
  done

  utils/split_scp.pl $scp $split_scps || exit 1;   # here scp=$data/wav.scp

  # add ,p to the input rspecifier so that we can just skip over
  # utterances that have bad wave data.

  # Feature extraction proper, launched through $cmd (run.pl by default).
  $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
    compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config \
     scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
      copy-feats $write_num_frames_opt --compress=$compress ark:- \
      ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
      || exit 1;
fi
# At this point $mfccdir holds raw_mfcc_$name.N.ark / raw_mfcc_$name.N.scp
# for each job N.

# If an error marker exists, show the tail of the first job's log and abort.
# NOTE(review): nothing in this script creates $logdir/.error.$name (and the
# segments branch removes $logdir/.error, a different name), so this check
# looks unreachable unless an external tool writes the marker -- confirm.
if [ -f "$logdir/.error.$name" ]; then
  echo "Error producing mfcc features for $name:"
  tail $logdir/make_mfcc_${name}.1.log
  exit 1;
fi

# concatenate the .scp files together.
for n in $(seq $nj); do
  cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1;
done > $data/feats.scp || exit 1

if $write_utt2num_frames; then
  for n in $(seq $nj); do
    cat $logdir/utt2num_frames.$n || exit 1;
  done > $data/utt2num_frames || exit 1
  rm $logdir/utt2num_frames.*
fi

# Remove the intermediate per-job lists; whichever branch did not run has no
# files to delete, hence the silenced error output.
rm $logdir/wav_${name}.*.scp  $logdir/segments.* 2>/dev/null

# Compare the number of feature entries with the number of utterances.
nf=$(wc -l < "$data/feats.scp")
nu=$(wc -l < "$data/utt2spk")
if [ $nf -ne $nu ]; then
  echo "It seems not all of the feature files were successfully processed ($nf != $nu);"
  echo "consider using utils/fix_data_dir.sh $data"
fi

# Fail hard if fewer than 95% of the utterances got features.
if [ $nf -lt $((nu - (nu / 20))) ]; then
  echo "Less than 95% the features were successfully generated.  Probably a serious error."
  exit 1;
fi

echo "Succeeded creating MFCC features for $name"