最近研究了一下语音合成和语音识别:看了一些文章,也下载了 SDK,写了些代码测试了一下。
我发现,就语音合成而言,中文方面百度语音和科大讯飞的效果基本差不多。
英文的话,百度合成出来的效果不佳,科大讯飞稍好一点,但总体都不如国外的语音合成(比如 iSpeech、FreeTTS),可能是因为国外的母语是英语的缘故吧。
百度日调用额度比较多,据说有2万额度。讯飞每天就500,有点少。iSpeech 是要收费的。FreeTTS 可以离线使用。
百度识别和合成代码:
public class SoundAPI { private static final Logger logger = LoggerFactory.getLogger(SoundAPI.class); final static String FILE_PATH = Config.getString("download.folder"); // 设置APPID/AK/SK private static final String APP_ID = "你的APP ID"; private static final String API_KEY = "你的key"; private static final String SECRET_KEY = "你的秘钥"; // 初始化一个AipSpeech private static AipSpeech client = null; private static long iniTime = 0L; /** 30 天 24 小时 **/ private static final long MONTH_TIME = 30 * 24 * 60 * 60 * 1000; private static final Base64 base64 = new Base64(); private static void iniAPI() { boolean needToReset = false; // 判断是否一个月了,如果一个月后,需要重新初始话 long currentTime = System.currentTimeMillis(); if (currentTime - iniTime > MONTH_TIME) { needToReset = true; } if (client == null || needToReset) { client = new AipSpeech(APP_ID, API_KEY, SECRET_KEY); /** 2秒超时时间 **/ client.setConnectionTimeoutInMillis(2000); iniTime = System.currentTimeMillis(); } } public static String getSoundMp3(String text, String fileName, QuestionTypeEnum questionType) { String rtnfileName = ""; String type = "zh"; if (StringUtils.isEmpty(text)) return ""; try { iniAPI(); if (QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType())) { type = "en"; } TtsResponse res = client.synthesis(text, type, 1, null); byte[] data = res.getData(); if (data != null) { // String uuid = UUID.randomUUID().toString().replace("-", // "").toLowerCase(); String uuid = base64.encodeToString(fileName.getBytes()); rtnfileName = type + "/" + uuid.replaceAll("=", "") + ".mp3"; String path = FILE_PATH + rtnfileName; File file = new File(path); if (!file.exists()) { Util.writeBytesToFileSystem(data, path); } } else { JSONObject jsonObj = res.getResult(); logger.info("invoke baidu synthesis API error:", jsonObj); } } catch (Exception e) { rtnfileName = ""; logger.error("invoke baidu synthesis API error:", e); } return rtnfileName; } public static String recognizeSound(String filePath, QuestionTypeEnum questionType) { String 
result = ""; JSONObject asrRes = null; if (StringUtils.isEmpty(filePath)) return ""; try { iniAPI(); if (QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType())) { HashMap<String, Object> options = new HashMap<>(); options.put("dev_pid", 1737); asrRes = client.asr(filePath, "pcm", 16000, options); } else { asrRes = client.asr(filePath, "pcm", 16000, null); } result = getResult(asrRes); } catch (Exception e) { logger.error("invoke baidu asr API error:", e); } return result; } private static String getResult(JSONObject asrRes) { String result = ""; if (asrRes.getInt("err_no") == 0) { JSONArray arrayResult = asrRes.getJSONArray("result"); StringBuilder sbResult = new StringBuilder(); for (int i = 0; i < arrayResult.length(); i++) { if (i == 0) { sbResult.append(arrayResult.get(i).toString()); } else { if (!StringUtils.isEmpty(arrayResult.get(i).toString())) sbResult.append(";" + arrayResult.get(i).toString()); } } result = sbResult.toString().replaceAll(",", ""); } else { logger.error("invoke baidu asr API error:", asrRes); } return result; }
科大讯飞的语音识别及合成
public class IatAPI { private static final Logger logger = LoggerFactory.getLogger(IatAPI.class); /** * 科大讯飞语音识别写入参考 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java */ final static String APPID = "你的APPID"; final static String APPKEY_IAT = "你的秘钥"; final static String URL_IAT = "http://api.xfyun.cn/v1/service/v1/iat"; final static String IP = "服务器IP地址"; /** * * 发送语音,获取文字 * * @param audioByteArray * @return * @throws Exception */ public static String process(String filePath) throws Exception { Map<String, String> header = getHeader("raw", "sms16k"); // 读取音频文件,转二进制数组,然后Base64编码 byte[] audioByteArray = FileUtil.read2ByteArray(filePath); String audioBase64 = new String(Base64.encodeBase64(audioByteArray), "UTF-8"); String bodyParam = "audio=" + audioBase64; // logger.info(bodyParam); String result = HttpUtil.doPost(URL_IAT, header, bodyParam); return result; } /** * 组装http请求头 * * @param aue * @param resultLevel * @param language * @param category * @return * @throws UnsupportedEncodingException */ private static Map<String, String> getHeader(String aue, String engineType) throws UnsupportedEncodingException { // 系统当前时间戳 String X_CurTime = System.currentTimeMillis() / 1000L + ""; // 业务参数 String param = "{"aue":"" + aue + """ + ","engine_type":"" + engineType + ""}"; String X_Param = new String(Base64.encodeBase64(param.getBytes("UTF-8"))); // 接口密钥 String apiKey = APPKEY_IAT; // 讯飞开放平台应用ID String X_Appid = APPID; // 生成令牌 String X_CheckSum = DigestUtils.md5Hex(apiKey + X_CurTime + X_Param); // 组装请求头 Map<String, String> header = new HashMap<String, String>(); header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8"); header.put("X-Param", X_Param); header.put("X-CurTime", X_CurTime); header.put("X-CheckSum", X_CheckSum); header.put("X-Appid", X_Appid); header.put("X-Real-Ip", IP); return header; }
public class TtsAPI { private static final Logger logger = LoggerFactory.getLogger(TtsAPI.class); /** * 科大讯飞语音识别写入参考 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java */ final static String APPID = "你的APP id"; final static String APPKEY_TTS = "你的秘钥"; final static String URL_TTS = "http://api.xfyun.cn/v1/service/v1/tts"; final static String IP = "服务器地址"; final static String FILE_PATH = Config.getString("download.folder"); /** * * 发送文字,获取语音 * * @param text * @throws Exception */ public static String process(String text) throws Exception { String result = null; Long startTime = System.currentTimeMillis(); try { Map<String, String> header = getHeader("audio/L16;rate=16000", "lame", "xiaoyan", "50", "50", "", "text", "50"); Map<String, Object> resultMap = HttpUtil.doMultiPost(URL_TTS, header, "text=" + text); // 合成成功 if ("audio/mpeg".equals(resultMap.get("Content-Type"))) { FileUtil.save(FILE_PATH, resultMap.get("sid") + ".mp3", (byte[]) resultMap.get("body")); result = resultMap.get("sid") + ".mp3"; } else { // 合成失败 logger.error(resultMap.get("body").toString()); } } catch (Exception e) { logger.error("there is error:", e); } Long endTime = System.currentTimeMillis(); logger.info("finish get voice:" + (endTime - startTime)); return result; } /** * 组装http请求头 * * @param aue * @param resultLevel * @param language * @param category * @return * @throws UnsupportedEncodingException */ private static Map<String, String> getHeader(String auf, String aue, String voiceName, String speed, String volume, String engineType, String textType, String pitch) throws UnsupportedEncodingException { String curTime = System.currentTimeMillis() / 1000L + ""; StringBuilder param = new StringBuilder("{"auf":"" + auf + """); if (!StringUtil.isNullOrEmpty(aue)) { param.append(","aue":"" + aue + """); } if (!StringUtil.isNullOrEmpty(voiceName)) { param.append(","voice_name":"" + voiceName + """); } if (!StringUtil.isNullOrEmpty(speed)) { param.append(","speed":"" + speed 
+ """); } if (!StringUtil.isNullOrEmpty(volume)) { param.append(","volume":"" + volume + """); } if (!StringUtil.isNullOrEmpty(pitch)) { param.append(","pitch":"" + pitch + """); } if (!StringUtil.isNullOrEmpty(engineType)) { param.append(","engine_type":"" + engineType + """); } if (!StringUtil.isNullOrEmpty(textType)) { param.append(","text_type":"" + textType + """); } param.append("}"); String paramBase64 = new String(Base64.encodeBase64(param.toString().getBytes("UTF-8"))); String checkSum = DigestUtils.md5Hex(APPKEY_TTS + curTime + paramBase64); Map<String, String> header = new HashMap<String, String>(); header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8"); header.put("X-Param", paramBase64); header.put("X-CurTime", curTime); header.put("X-CheckSum", checkSum); header.put("X-Real-Ip", IP); header.put("X-Appid", APPID); // logger.info(JSON.toJSONString(header)); return header; }