關於語音合成和識別
阿新 • • 發佈:2018-06-27
沒有 asr 進制 lower 編碼 IV 業務 key -i
最近研究了下語音合成和語音識別。分別看了一些文章,也下載了 SDK,寫了些代碼測試了下。
發現對於語音合成,就中文來說,百度語音和科大訊飛基本都差不多。
英文的話,百度合成出來的效果不佳,科大訊飛稍好點,但是總體都沒有國外的語音合成好,比如 iSpeech、FreeTTS。可能是因為國外的母語都是英語的緣故吧。
百度日調用額度比較多,據說有2萬額度。訊飛每天就500,有點少。iSpeech 是要收費的。FreeTTS 可以離線使用。
百度識別和合成代碼:
public class SoundAPI { private static final Logger logger = LoggerFactory.getLogger(SoundAPI.class); final static String FILE_PATH = Config.getString("download.folder"); // 設置APPID/AK/SK private static final String APP_ID = "你的APP ID"; private static final String API_KEY = "你的key"; private static final String SECRET_KEY = "你的秘鑰"; // 初始化一個AipSpeech private static AipSpeech client = null; private static long iniTime = 0L; /** 30 天 24 小時 **/ private static final long MONTH_TIME = 30 * 24 * 60 * 60 * 1000; private static final Base64 base64 = new Base64(); private static void iniAPI() { boolean needToReset = false; // 判斷是否一個月了,如果一個月後,需要重新初始話 long currentTime = System.currentTimeMillis(); if (currentTime - iniTime > MONTH_TIME) { needToReset = true; } if (client == null || needToReset) { client = new AipSpeech(APP_ID, API_KEY, SECRET_KEY); /** 2秒超時時間 **/ client.setConnectionTimeoutInMillis(2000); iniTime = System.currentTimeMillis(); } } public static String getSoundMp3(String text, String fileName, QuestionTypeEnum questionType) { String rtnfileName = ""; String type = "zh"; if (StringUtils.isEmpty(text)) return ""; try { iniAPI(); if (QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType())) { type = "en"; } TtsResponse res = client.synthesis(text, type, 1, null); byte[] data = res.getData(); if (data != null) { // String uuid = UUID.randomUUID().toString().replace("-", // "").toLowerCase(); String uuid = base64.encodeToString(fileName.getBytes()); rtnfileName = type + "/" + uuid.replaceAll("=", "") + ".mp3"; String path = FILE_PATH + rtnfileName; File file = new File(path); if (!file.exists()) { Util.writeBytesToFileSystem(data, path); } } else { JSONObject jsonObj = res.getResult(); logger.info("invoke baidu synthesis API error:", jsonObj); } } catch (Exception e) { rtnfileName = ""; logger.error("invoke baidu synthesis API error:", e); } return rtnfileName; } public static String recognizeSound(String filePath, QuestionTypeEnum questionType) { String 
result = ""; JSONObject asrRes = null; if (StringUtils.isEmpty(filePath)) return ""; try { iniAPI(); if (QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType())) { HashMap<String, Object> options = new HashMap<>(); options.put("dev_pid", 1737); asrRes = client.asr(filePath, "pcm", 16000, options); } else { asrRes = client.asr(filePath, "pcm", 16000, null); } result = getResult(asrRes); } catch (Exception e) { logger.error("invoke baidu asr API error:", e); } return result; } private static String getResult(JSONObject asrRes) { String result = ""; if (asrRes.getInt("err_no") == 0) { JSONArray arrayResult = asrRes.getJSONArray("result"); StringBuilder sbResult = new StringBuilder(); for (int i = 0; i < arrayResult.length(); i++) { if (i == 0) { sbResult.append(arrayResult.get(i).toString()); } else { if (!StringUtils.isEmpty(arrayResult.get(i).toString())) sbResult.append(";" + arrayResult.get(i).toString()); } } result = sbResult.toString().replaceAll(",", ""); } else { logger.error("invoke baidu asr API error:", asrRes); } return result; }
科大訊飛的語音識別及合成
public class IatAPI { private static final Logger logger = LoggerFactory.getLogger(IatAPI.class); /** * 科大訊飛語音識別寫入參考 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java */ final static String APPID = "你的APPID"; final static String APPKEY_IAT = "你的秘鑰"; final static String URL_IAT = "http://api.xfyun.cn/v1/service/v1/iat"; final static String IP = "服務器IP地址"; /** * * 發送語音,獲取文字 * * @param audioByteArray * @return * @throws Exception */ public static String process(String filePath) throws Exception { Map<String, String> header = getHeader("raw", "sms16k"); // 讀取音頻文件,轉二進制數組,然後Base64編碼 byte[] audioByteArray = FileUtil.read2ByteArray(filePath); String audioBase64 = new String(Base64.encodeBase64(audioByteArray), "UTF-8"); String bodyParam = "audio=" + audioBase64; // logger.info(bodyParam); String result = HttpUtil.doPost(URL_IAT, header, bodyParam); return result; } /** * 組裝http請求頭 * * @param aue * @param resultLevel * @param language * @param category * @return * @throws UnsupportedEncodingException */ private static Map<String, String> getHeader(String aue, String engineType) throws UnsupportedEncodingException { // 系統當前時間戳 String X_CurTime = System.currentTimeMillis() / 1000L + ""; // 業務參數 String param = "{\"aue\":\"" + aue + "\"" + ",\"engine_type\":\"" + engineType + "\"}"; String X_Param = new String(Base64.encodeBase64(param.getBytes("UTF-8"))); // 接口密鑰 String apiKey = APPKEY_IAT; // 訊飛開放平臺應用ID String X_Appid = APPID; // 生成令牌 String X_CheckSum = DigestUtils.md5Hex(apiKey + X_CurTime + X_Param); // 組裝請求頭 Map<String, String> header = new HashMap<String, String>(); header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8"); header.put("X-Param", X_Param); header.put("X-CurTime", X_CurTime); header.put("X-CheckSum", X_CheckSum); header.put("X-Appid", X_Appid); header.put("X-Real-Ip", IP); return header; }
public class TtsAPI { private static final Logger logger = LoggerFactory.getLogger(TtsAPI.class); /** * 科大訊飛語音識別寫入參考 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java */ final static String APPID = "你的APP id"; final static String APPKEY_TTS = "你的秘鑰"; final static String URL_TTS = "http://api.xfyun.cn/v1/service/v1/tts"; final static String IP = "服務器地址"; final static String FILE_PATH = Config.getString("download.folder"); /** * * 發送文字,獲取語音 * * @param text * @throws Exception */ public static String process(String text) throws Exception { String result = null; Long startTime = System.currentTimeMillis(); try { Map<String, String> header = getHeader("audio/L16;rate=16000", "lame", "xiaoyan", "50", "50", "", "text", "50"); Map<String, Object> resultMap = HttpUtil.doMultiPost(URL_TTS, header, "text=" + text); // 合成成功 if ("audio/mpeg".equals(resultMap.get("Content-Type"))) { FileUtil.save(FILE_PATH, resultMap.get("sid") + ".mp3", (byte[]) resultMap.get("body")); result = resultMap.get("sid") + ".mp3"; } else { // 合成失敗 logger.error(resultMap.get("body").toString()); } } catch (Exception e) { logger.error("there is error:", e); } Long endTime = System.currentTimeMillis(); logger.info("finish get voice:" + (endTime - startTime)); return result; } /** * 組裝http請求頭 * * @param aue * @param resultLevel * @param language * @param category * @return * @throws UnsupportedEncodingException */ private static Map<String, String> getHeader(String auf, String aue, String voiceName, String speed, String volume, String engineType, String textType, String pitch) throws UnsupportedEncodingException { String curTime = System.currentTimeMillis() / 1000L + ""; StringBuilder param = new StringBuilder("{\"auf\":\"" + auf + "\""); if (!StringUtil.isNullOrEmpty(aue)) { param.append(",\"aue\":\"" + aue + "\""); } if (!StringUtil.isNullOrEmpty(voiceName)) { param.append(",\"voice_name\":\"" + voiceName + "\""); } if (!StringUtil.isNullOrEmpty(speed)) { 
param.append(",\"speed\":\"" + speed + "\""); } if (!StringUtil.isNullOrEmpty(volume)) { param.append(",\"volume\":\"" + volume + "\""); } if (!StringUtil.isNullOrEmpty(pitch)) { param.append(",\"pitch\":\"" + pitch + "\""); } if (!StringUtil.isNullOrEmpty(engineType)) { param.append(",\"engine_type\":\"" + engineType + "\""); } if (!StringUtil.isNullOrEmpty(textType)) { param.append(",\"text_type\":\"" + textType + "\""); } param.append("}"); String paramBase64 = new String(Base64.encodeBase64(param.toString().getBytes("UTF-8"))); String checkSum = DigestUtils.md5Hex(APPKEY_TTS + curTime + paramBase64); Map<String, String> header = new HashMap<String, String>(); header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8"); header.put("X-Param", paramBase64); header.put("X-CurTime", curTime); header.put("X-CheckSum", checkSum); header.put("X-Real-Ip", IP); header.put("X-Appid", APPID); // logger.info(JSON.toJSONString(header)); return header; }
關於語音合成和識別