ROS中階筆記(六):機器人感知—機器語音
ROS中階筆記(六):機器人感知—機器語音
目錄
1 語音識別理論
2 常用語音功能包
- pocketsphinx:整合CMU Sphinx和Festival開源專案中的程式碼,實現語音識別的功能
- audio-common:提供了文字轉語音的功能實現完成"機器人說話"的想法
- AIML:人工智慧標記語言,Artificial Intelligence Markup Language是一種建立自然語言軟體代理的XML語言
3 科大訊飛SDK
登入科大訊飛開放平臺的官方網站:
下載的SDK檔案(Linux_lat···檔案)——解壓——示例在samples資料夾下面(比如iat_record_sample,剛下載完成功能包之後,需要在當前目錄下面編譯,通過make編譯)
編譯之後的檔案放在bin資料夾下面的,可以看到有一個生成的可執行檔案iat_record_sample;
3.1 使用前提
將科大訊飛SDK的庫檔案拷貝到系統目錄下
$ sudo cp libmsc.so /usr/lib/libmsc.so # (注意選擇相應的處理器架構) # SDK的庫檔案在libs中有(x86和x64,在這裡面放置的庫檔案)
科大訊飛的SDK帶有ID號,每個人每次下載後的ID都不相同,更換SDK之後需要修改程式碼中的APPID,你也可以直接使用本課程的libmsc.so檔案,否則需要將原始碼中的APPID修改為自己下載SDK中的ID。
3.2 語音聽寫
iat_publish.cpp
/* * 語音聽寫(iFly Auto Transform)技術能夠實時地將語音轉換成對應的文字。 */ #include <stdlib.h> #include <stdio.h> #include <string.h> #include <unistd.h> #include "robot_voice/qisr.h" #include "robot_voice/msp_cmn.h" #include "robot_voice/msp_errors.h" #include "robot_voice/speech_recognizer.h" #include <iconv.h> #include "ros/ros.h" #include "std_msgs/String.h" #define FRAME_LEN 640 #define BUFFER_SIZE 4096 int wakeupFlag = 0 ; int resultFlag = 0 ; static void show_result(char *string, char is_over) { resultFlag=1; printf("\rResult: [ %s ]", string); if(is_over) putchar('\n'); } static char *g_result = NULL; static unsigned int g_buffersize = BUFFER_SIZE; void on_result(const char *result, char is_last) { if (result) { size_t left = g_buffersize - 1 - strlen(g_result); size_t size = strlen(result); if (left < size) { g_result = (char*)realloc(g_result, g_buffersize + BUFFER_SIZE); if (g_result) g_buffersize += BUFFER_SIZE; else { printf("mem alloc failed\n"); return; } } strncat(g_result, result, size); show_result(g_result, is_last); } } void on_speech_begin() { if (g_result) { free(g_result); } g_result = (char*)malloc(BUFFER_SIZE); g_buffersize = BUFFER_SIZE; memset(g_result, 0, g_buffersize); printf("Start Listening...\n"); } void on_speech_end(int reason) { if (reason == END_REASON_VAD_DETECT) printf("\nSpeaking done \n"); else printf("\nRecognizer error %d\n", reason); } /* demo recognize the audio from microphone */ static void demo_mic(const char* session_begin_params) { int errcode; int i = 0; struct speech_rec iat; struct speech_rec_notifier recnotifier = { on_result, on_speech_begin, on_speech_end }; errcode = sr_init(&iat, session_begin_params, SR_MIC, &recnotifier); if (errcode) { printf("speech recognizer init failed\n"); return; } errcode = sr_start_listening(&iat); if (errcode) { printf("start listen failed %d\n", errcode); } /* demo 10 seconds recording */ while(i++ < 10) sleep(1); errcode = sr_stop_listening(&iat); if (errcode) { printf("stop listening failed %d\n", 
errcode); } sr_uninit(&iat); } /* main thread: start/stop record ; query the result of recgonization. * record thread: record callback(data write) * helper thread: ui(keystroke detection) */ void WakeUp(const std_msgs::String::ConstPtr& msg) { printf("waking up\r\n"); usleep(700*1000); wakeupFlag=1; } int main(int argc, char* argv[]) { // 初始化ROS ros::init(argc, argv, "voiceRecognition"); ros::NodeHandle n; ros::Rate loop_rate(10); // 宣告Publisher和Subscriber // 訂閱喚醒語音識別的訊號 ros::Subscriber wakeUpSub = n.subscribe("voiceWakeup", 1000, WakeUp); // 訂閱喚醒語音識別的訊號 ros::Publisher voiceWordsPub = n.advertise<std_msgs::String>("voiceWords", 1000); ROS_INFO("Sleeping..."); int count=0; while(ros::ok()) { // 語音識別喚醒 if (wakeupFlag){ ROS_INFO("Wakeup..."); int ret = MSP_SUCCESS; const char* login_params = "appid = 594a7b46, work_dir = ."; const char* session_begin_params = "sub = iat, domain = iat, language = zh_cn, " "accent = mandarin, sample_rate = 16000, " "result_type = plain, result_encoding = utf8"; ret = MSPLogin(NULL, NULL, login_params); if(MSP_SUCCESS != ret){ MSPLogout(); printf("MSPLogin failed , Error code %d.\n",ret); } printf("Demo recognizing the speech from microphone\n"); printf("Speak in 10 seconds\n"); demo_mic(session_begin_params); printf("10 sec passed\n"); wakeupFlag=0; MSPLogout(); } // 語音識別完成 if(resultFlag){ resultFlag=0; std_msgs::String msg; msg.data = g_result; voiceWordsPub.publish(msg); } ros::spinOnce(); loop_rate.sleep(); count++; } exit: MSPLogout(); // Logout... return 0; }
subscriber(訂閱者):用來接收語音喚醒訊號,接收到喚醒訊號之後,會將wakeupFlag變數置位;
主迴圈中呼叫SDK的語音聽寫功能,識別成功後置位resultFlag變數,通過Publisher將識別出來的字串釋出。
第一步,在CMakeLists.txt中加入編譯規則
add_executable(iat_publish
src/iat_publish.cpp
src/speech_recognizer.c
src/linuxrec.c)
target_link_libraries(
iat_publish
${catkin_LIBRARIES}
libmsc.so -ldl -lpthread -lm -lrt -lasound
)
編譯過程需要連結SDK中的libmsc.so庫,此前已經將此庫拷貝到系統路徑下,所以此處不需要新增完整路徑。
第二步,語音聽寫示例
$ roscore
$ rosrun robot_voice iat_publish
$ rostopic pub /voiceWakeup std_msgs/String "data: 'any string'"
'any string' ''裡面可以隨便填
3.3 語音合成
語音合成(Text To Speech,TTS)技術能夠自動將任意文字實時轉換為連續的自然語音,是一種能夠在任何時間、任何地點,向任何人提供語音資訊服務的高效便捷手段,非常符合資訊時代海量資料、動態更新和個性化查詢的需求。
tts_subscribe.cpp
/*
* 語音合成(Text To Speech,TTS)技術能夠自動將任意文字實時轉換為連續的
* 自然語音,是一種能夠在任何時間、任何地點,向任何人提供語音資訊服務的
* 高效便捷手段,非常符合資訊時代海量資料、動態更新和個性化查詢的需求。
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include "robot_voice/qtts.h"
#include "robot_voice/msp_cmn.h"
#include "robot_voice/msp_errors.h"
#include "ros/ros.h"
#include "std_msgs/String.h"
#include <sstream>
#include <sys/types.h>
#include <sys/stat.h>
/* wav音訊頭部格式 */
/* Canonical 44-byte RIFF/WAVE header layout for raw PCM audio */
typedef struct _wave_pcm_hdr
{
char riff[4]; // = "RIFF"
int size_8; // = FileSize - 8
char wave[4]; // = "WAVE"
char fmt[4]; // = "fmt "
int fmt_size; // = size of the following fmt chunk: 16
short int format_tag; // = PCM: 1
short int channels; // = number of channels: 1
int samples_per_sec; // = sample rate: 8000 | 6000 | 11025 | 16000
int avg_bytes_per_sec; // = bytes per second: samples_per_sec * bits_per_sample / 8
short int block_align; // = bytes per sample frame: wBitsPerSample / 8
short int bits_per_sample; // = bits per sample: 8 | 16
char data[4]; // = "data";
int data_size; // = raw payload length: FileSize - 44
} wave_pcm_hdr;
/* 預設wav音訊頭部資料 */
/* Default WAV header: mono 16-bit PCM at 16 kHz; the two size fields start
 * at 0 and are patched by text_to_speech() once synthesis completes. */
wave_pcm_hdr default_wav_hdr =
{
{ 'R', 'I', 'F', 'F' },
0, // size_8: patched later
{'W', 'A', 'V', 'E'},
{'f', 'm', 't', ' '},
16, // fmt_size
1, // format_tag: PCM
1, // channels: mono
16000, // samples_per_sec
32000, // avg_bytes_per_sec = 16000 * 16 / 8
2, // block_align
16, // bits_per_sample
{'d', 'a', 't', 'a'},
0 // data_size: patched later
};
/* 文字合成 */
/**
 * Synthesize src_text into a 16 kHz mono 16-bit PCM WAV file.
 *
 * @param src_text  UTF-8 text to synthesize (must be non-NULL)
 * @param des_path  output WAV file path (must be non-NULL)
 * @param params    QTTSSessionBegin session parameter string
 * @return MSP_SUCCESS on success; an MSC error code on SDK failure;
 *         -1 for bad arguments or file-open failure.
 */
int text_to_speech(const char* src_text, const char* des_path, const char* params)
{
    int ret = -1;
    FILE* fp = NULL;
    const char* sessionID = NULL;
    unsigned int audio_len = 0;
    wave_pcm_hdr wav_hdr = default_wav_hdr;
    int synth_status = MSP_TTS_FLAG_STILL_HAVE_DATA;

    if (NULL == src_text || NULL == des_path)
    {
        printf("params is error!\n");
        return ret;
    }
    fp = fopen(des_path, "wb");
    if (NULL == fp)
    {
        printf("open %s error.\n", des_path);
        return ret;
    }
    /* Begin the synthesis session */
    sessionID = QTTSSessionBegin(params, &ret);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionBegin failed, error code: %d.\n", ret);
        fclose(fp);
        return ret;
    }
    ret = QTTSTextPut(sessionID, src_text, (unsigned int)strlen(src_text), NULL);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSTextPut failed, error code: %d.\n",ret);
        QTTSSessionEnd(sessionID, "TextPutError");
        fclose(fp);
        return ret;
    }
    printf("正在合成 ...\n");
    /* Write a placeholder WAV header (16 kHz); size fields are patched below */
    fwrite(&wav_hdr, sizeof(wav_hdr), 1, fp);
    while (1)
    {
        /* Fetch the next chunk of synthesized audio */
        const void* data = QTTSAudioGet(sessionID, &audio_len, &synth_status, &ret);
        if (MSP_SUCCESS != ret)
            break;
        if (NULL != data)
        {
            fwrite(data, audio_len, 1, fp);
            wav_hdr.data_size += audio_len;  // accumulate payload size
        }
        if (MSP_TTS_FLAG_DATA_END == synth_status)
            break;
        printf(">");
        usleep(150 * 1000);  // avoid busy-polling the SDK
    } /* see the iFlytek MSC API docs for synth_status values */
    printf("\n");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSAudioGet failed, error code: %d.\n",ret);
        QTTSSessionEnd(sessionID, "AudioGetError");
        fclose(fp);
        return ret;
    }
    /* Patch the header size fields now that the payload length is known */
    wav_hdr.size_8 += wav_hdr.data_size + (sizeof(wav_hdr) - 8);
    fseek(fp, 4, SEEK_SET);                                  // offset of size_8 (SEEK_SET, not magic 0)
    fwrite(&wav_hdr.size_8, sizeof(wav_hdr.size_8), 1, fp);
    fseek(fp, 40, SEEK_SET);                                 // offset of data_size
    fwrite(&wav_hdr.data_size, sizeof(wav_hdr.data_size), 1, fp);
    fclose(fp);
    fp = NULL;
    /* End the session */
    ret = QTTSSessionEnd(sessionID, "Normal");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionEnd failed, error code: %d.\n",ret);
    }
    return ret;
}
/**
 * "voiceWords" topic callback: synthesize the received text into
 * tts_sample.wav and play it with mplayer.
 */
void voiceWordsCallback(const std_msgs::String::ConstPtr& msg)
{
    const char* text;
    int ret = MSP_SUCCESS;
    const char* session_begin_params = "voice_name = xiaoyan, text_encoding = utf8, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";
    const char* filename = "tts_sample.wav";  // output WAV file name

    std::cout<<"I heard :"<<msg->data.c_str()<<std::endl;
    text = msg->data.c_str();

    /* Synthesize the text */
    printf("開始合成 ...\n");
    ret = text_to_speech(text, filename, session_begin_params);
    if (MSP_SUCCESS != ret)
    {
        printf("text_to_speech failed, error code: %d.\n", ret);
    }
    printf("合成完畢\n");

    /* Play the result through mplayer in slave mode via a control FIFO */
    unlink("/tmp/cmd");
    mkfifo("/tmp/cmd", 0777);
    // NOTE(review): the FILE* returned by popen is never pclose()d, so each
    // call leaks a stream and leaves a child process behind — consider
    // capturing it and pclose()ing after playback.
    popen("mplayer -quiet -slave -input file=/tmp/cmd 'tts_sample.wav'","r");
    sleep(3);  // give mplayer time to play before handling the next message
}
/* Prompt for a keypress, then log out of the MSC SDK. */
void toExit()
{
printf("按任意鍵退出 ...\n");
getchar();
MSPLogout(); // log out of iFlytek MSC
}
/**
 * Entry point: log in to the iFlytek MSC SDK, then subscribe to the
 * "voiceWords" topic and synthesize every received string to speech.
 */
int main(int argc, char* argv[])
{
    int ret = MSP_SUCCESS;
    // Login parameters: the appid is bound to the bundled libmsc.so; do not change it arbitrarily.
    const char* login_params = "appid = 594a7b46, work_dir = .";
    /*
     * rdn:           digit reading style of the synthesized audio
     * volume:        output volume
     * pitch:         output pitch
     * speed:         speaking rate
     * voice_name:    speaker voice
     * sample_rate:   audio sample rate
     * text_encoding: encoding of the input text
     *
     * See the iFlytek MSC API documentation for details.
     */
    /* User login: (username, password, params); accounts are registered at http://open.voicecloud.cn */
    ret = MSPLogin(NULL, NULL, login_params);
    if (MSP_SUCCESS != ret)
    {
        printf("MSPLogin failed, error code: %d.\n", ret);
        toExit();
        return ret;  // bug fix: previously fell through and kept running after a failed login
    }
    printf("\n###########################################################################\n");
    printf("## 語音合成(Text To Speech,TTS)技術能夠自動將任意文字實時轉換為連續的 ##\n");
    printf("## 自然語音,是一種能夠在任何時間、任何地點,向任何人提供語音資訊服務的 ##\n");
    printf("## 高效便捷手段,非常符合資訊時代海量資料、動態更新和個性化查詢的需求。 ##\n");
    printf("###########################################################################\n\n");

    ros::init(argc, argv, "TextToSpeech");
    ros::NodeHandle n;
    ros::Subscriber sub = n.subscribe("voiceWords", 1000, voiceWordsCallback);
    ros::spin();

    /* Reached after ROS shuts down */
    printf("按任意鍵退出 ...\n");
    getchar();
    MSPLogout();  // log out of iFlytek MSC
    return 0;
}
- main函式中聲明瞭一個訂閱voiceWords話題的subscriber,接收輸入的語音字串。
- 回撥函式voiceWordsCallback中使用SDK介面將字串轉換成中文語音。
第一步,在CMakeLists.txt中加入編譯規則
add_executable(tts_subscribe src/tts_subscribe.cpp)
target_link_libraries(
tts_subscribe
${catkin_LIBRARIES}
libmsc.so -ldl -pthread
)
第二步,語音合成示例
$ roscore
$ rosrun robot_voice tts_subscribe
$ rostopic pub /voiceWords std_msgs/String "data: '你好,我是機器人'"
語音合成啟動後可能產生的錯誤
原因:mplayer配置導致
解決方法:在$HOME/.mplayer/config檔案中新增如下設定:lirc=no
3.4 智慧語音助手
voice_assistant.cpp
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include "robot_voice/qtts.h"
#include "robot_voice/msp_cmn.h"
#include "robot_voice/msp_errors.h"
#include "ros/ros.h"
#include "std_msgs/String.h"
#include <sstream>
#include <sys/types.h>
#include <sys/stat.h>
/* wav音訊頭部格式 */
/* Canonical 44-byte RIFF/WAVE header layout for raw PCM audio */
typedef struct _wave_pcm_hdr
{
char riff[4]; // = "RIFF"
int size_8; // = FileSize - 8
char wave[4]; // = "WAVE"
char fmt[4]; // = "fmt "
int fmt_size; // = size of the following fmt chunk: 16
short int format_tag; // = PCM: 1
short int channels; // = number of channels: 1
int samples_per_sec; // = sample rate: 8000 | 6000 | 11025 | 16000
int avg_bytes_per_sec; // = bytes per second: samples_per_sec * bits_per_sample / 8
short int block_align; // = bytes per sample frame: wBitsPerSample / 8
short int bits_per_sample; // = bits per sample: 8 | 16
char data[4]; // = "data";
int data_size; // = raw payload length: FileSize - 44
} wave_pcm_hdr;
/* 預設wav音訊頭部資料 */
/* Default WAV header: mono 16-bit PCM at 16 kHz; the two size fields start
 * at 0 and are patched by text_to_speech() once synthesis completes. */
wave_pcm_hdr default_wav_hdr =
{
{ 'R', 'I', 'F', 'F' },
0, // size_8: patched later
{'W', 'A', 'V', 'E'},
{'f', 'm', 't', ' '},
16, // fmt_size
1, // format_tag: PCM
1, // channels: mono
16000, // samples_per_sec
32000, // avg_bytes_per_sec = 16000 * 16 / 8
2, // block_align
16, // bits_per_sample
{'d', 'a', 't', 'a'},
0 // data_size: patched later
};
/* 文字合成 */
/**
 * Synthesize src_text into a 16 kHz mono 16-bit PCM WAV file.
 *
 * @param src_text  UTF-8 text to synthesize (must be non-NULL)
 * @param des_path  output WAV file path (must be non-NULL)
 * @param params    QTTSSessionBegin session parameter string
 * @return MSP_SUCCESS on success; an MSC error code on SDK failure;
 *         -1 for bad arguments or file-open failure.
 */
int text_to_speech(const char* src_text, const char* des_path, const char* params)
{
    int ret = -1;
    FILE* fp = NULL;
    const char* sessionID = NULL;
    unsigned int audio_len = 0;
    wave_pcm_hdr wav_hdr = default_wav_hdr;
    int synth_status = MSP_TTS_FLAG_STILL_HAVE_DATA;

    if (NULL == src_text || NULL == des_path)
    {
        printf("params is error!\n");
        return ret;
    }
    fp = fopen(des_path, "wb");
    if (NULL == fp)
    {
        printf("open %s error.\n", des_path);
        return ret;
    }
    /* Begin the synthesis session */
    sessionID = QTTSSessionBegin(params, &ret);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionBegin failed, error code: %d.\n", ret);
        fclose(fp);
        return ret;
    }
    ret = QTTSTextPut(sessionID, src_text, (unsigned int)strlen(src_text), NULL);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSTextPut failed, error code: %d.\n",ret);
        QTTSSessionEnd(sessionID, "TextPutError");
        fclose(fp);
        return ret;
    }
    printf("正在合成 ...\n");
    /* Write a placeholder WAV header (16 kHz); size fields are patched below */
    fwrite(&wav_hdr, sizeof(wav_hdr), 1, fp);
    while (1)
    {
        /* Fetch the next chunk of synthesized audio */
        const void* data = QTTSAudioGet(sessionID, &audio_len, &synth_status, &ret);
        if (MSP_SUCCESS != ret)
            break;
        if (NULL != data)
        {
            fwrite(data, audio_len, 1, fp);
            wav_hdr.data_size += audio_len;  // accumulate payload size
        }
        if (MSP_TTS_FLAG_DATA_END == synth_status)
            break;
        printf(">");
        usleep(150 * 1000);  // avoid busy-polling the SDK
    } /* see the iFlytek MSC API docs for synth_status values */
    printf("\n");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSAudioGet failed, error code: %d.\n",ret);
        QTTSSessionEnd(sessionID, "AudioGetError");
        fclose(fp);
        return ret;
    }
    /* Patch the header size fields now that the payload length is known */
    wav_hdr.size_8 += wav_hdr.data_size + (sizeof(wav_hdr) - 8);
    fseek(fp, 4, SEEK_SET);                                  // offset of size_8 (SEEK_SET, not magic 0)
    fwrite(&wav_hdr.size_8, sizeof(wav_hdr.size_8), 1, fp);
    fseek(fp, 40, SEEK_SET);                                 // offset of data_size
    fwrite(&wav_hdr.data_size, sizeof(wav_hdr.data_size), 1, fp);
    fclose(fp);
    fp = NULL;
    /* End the session */
    ret = QTTSSessionEnd(sessionID, "Normal");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionEnd failed, error code: %d.\n",ret);
    }
    return ret;
}
/// Convert an int to its decimal string representation.
/// (Local helper kept because this file is built without C++11 std::to_string;
/// uses snprintf instead of sprintf so the buffer can never overflow.)
std::string to_string(int val)
{
    char buf[20];
    snprintf(buf, sizeof(buf), "%d", val);
    return std::string(buf);
}
/**
 * "voiceWords" topic callback: answer a few canned questions (identity,
 * capabilities, current time), otherwise echo the input; then synthesize
 * the reply into tts_sample.wav and play it with mplayer.
 */
void voiceWordsCallback(const std_msgs::String::ConstPtr& msg)
{
    const char* text;
    int ret = MSP_SUCCESS;
    const char* session_begin_params = "voice_name = xiaoyan, text_encoding = utf8, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";
    const char* filename = "tts_sample.wav";  // output WAV file name

    std::cout<<"I heard :"<<msg->data.c_str()<<std::endl;
    std::string dataString = msg->data;

    // Keep the reply alive for the whole call: the original pointed `text`
    // at block-scoped char arrays, leaving it dangling after each branch.
    std::string reply;
    if (dataString.compare("你是誰?") == 0)
    {
        reply = "我是你的語音小助手";
        std::cout<<reply<<std::endl;
        text = reply.c_str();
    }
    else if (dataString.compare("你可以做什麼?") == 0)
    {
        reply = "你可以問我現在時間";
        std::cout<<reply<<std::endl;
        text = reply.c_str();
    }
    else if (dataString.compare("現在時間。") == 0)
    {
        // Build "現在時間H點M分" from the local wall-clock time
        time_t ts = time(NULL);  // time_t, not long: matches localtime's signature
        struct tm *ptm = localtime(&ts);
        reply = "現在時間" + to_string(ptm->tm_hour) + "點" + to_string(ptm->tm_min) + "分";
        // (also fixes the original's string.copy(buf, sizeof(std::string), 0),
        // which copied the wrong length and never NUL-terminated the buffer)
        text = reply.c_str();
        std::cout<<text<<std::endl;
    }
    else
    {
        text = msg->data.c_str();
    }

    /* Synthesize the reply */
    printf("開始合成 ...\n");
    ret = text_to_speech(text, filename, session_begin_params);
    if (MSP_SUCCESS != ret)
    {
        printf("text_to_speech failed, error code: %d.\n", ret);
    }
    printf("合成完畢\n");

    /* Play the result through mplayer in slave mode via a control FIFO */
    unlink("/tmp/cmd");
    mkfifo("/tmp/cmd", 0777);
    popen("mplayer -quiet -slave -input file=/tmp/cmd 'tts_sample.wav'","r");
    sleep(3);  // give mplayer time to play before handling the next message
}
/* Prompt for a keypress, then log out of the MSC SDK. */
void toExit()
{
printf("按任意鍵退出 ...\n");
getchar();
MSPLogout(); // log out of iFlytek MSC
}
/**
 * Entry point: log in to the iFlytek MSC SDK, then subscribe to the
 * "voiceWords" topic and route every received string through the
 * assistant logic in voiceWordsCallback.
 */
int main(int argc, char* argv[])
{
    int ret = MSP_SUCCESS;
    // Login parameters: the appid is bound to the bundled libmsc.so; do not change it arbitrarily.
    const char* login_params = "appid = 594a7b46, work_dir = .";
    /*
     * rdn:           digit reading style of the synthesized audio
     * volume:        output volume
     * pitch:         output pitch
     * speed:         speaking rate
     * voice_name:    speaker voice
     * sample_rate:   audio sample rate
     * text_encoding: encoding of the input text
     *
     * See the iFlytek MSC API documentation for details.
     */
    /* User login: (username, password, params); accounts are registered at http://open.voicecloud.cn */
    ret = MSPLogin(NULL, NULL, login_params);
    if (MSP_SUCCESS != ret)
    {
        printf("MSPLogin failed, error code: %d.\n", ret);
        toExit();
        return ret;  // bug fix: previously fell through and kept running after a failed login
    }
    printf("\n###########################################################################\n");
    printf("## 語音合成(Text To Speech,TTS)技術能夠自動將任意文字實時轉換為連續的 ##\n");
    printf("## 自然語音,是一種能夠在任何時間、任何地點,向任何人提供語音資訊服務的 ##\n");
    printf("## 高效便捷手段,非常符合資訊時代海量資料、動態更新和個性化查詢的需求。 ##\n");
    printf("###########################################################################\n\n");

    ros::init(argc, argv, "TextToSpeech");
    ros::NodeHandle n;
    ros::Subscriber sub = n.subscribe("voiceWords", 1000, voiceWordsCallback);
    ros::spin();

    /* Reached after ROS shuts down */
    printf("按任意鍵退出 ...\n");
    getchar();
    MSPLogout();  // log out of iFlytek MSC
    return 0;
}
第一步,在CMakeLists.txt中加入編譯規則
add_executable(voice_assistant src/voice_assistant.cpp)
target_link_libraries(
voice_assistant
${catkin_LIBRARIES}
libmsc.so -ldl -pthread
)
第二步,智慧語音助手示例
$ roscore
$ rosrun robot_voice iat_publish # 啟動語音識別節點
$ rosrun robot_voice voice_assistant # 啟動語音助手作為語音判斷
$ rostopic pub /voiceWakeup std_msgs/String "data: 'any string'" # 喚醒詞
4 參考資料
1.ROS探索總結(二十八)——機器聽覺
2.ROS Kinetic使用PocketSphinx進行語音識別
https://blog.csdn.net/seeseeatre/article/details/79228816
3.訊飛開放平臺-以語音互動為核心的人工智慧開放平臺
微信公眾號:喵哥解說
公眾號介紹:主要研究機器學習、計算機視覺、深度學習、ROS等相關內容,分享學習過程中的學習筆記和心得!期待您的關注,歡迎一起學習交流進步!同時還有1200G的Python視訊和書籍資料等你領取!!!