java实现微软文本转语音（TTS）经验总结

一、使用背景

公司项目之前一直是采用人工录音，然而上线一段时间之后发现，人工录音成本太高，而且每周上线的音频不多，老板发现问题后，甚至把音频功能裸停了一段时间。直到最近项目要向海外扩展，需要内容做国际化，就想到了用机器合成语音（TTS，文本转语音）。目前语音合成已经相对成熟，做得好的国内有科大讯飞，国外有微软。既然项目主要面对海外用户，就决定采用微软的TTS。（PS：这里不是打广告，微软的TTS是真的不错，自己可以去官网试听下，虽然无法像人一样很有感情地朗读诗歌什么的，但是朗读新闻资讯类文章还是抑扬顿挫的。）

二、上代码

使用背景已经啰嗦了一大堆，我觉得读者还是会关注的，但是我想作为资深CV码农，我想你们更关注还是如何应用，所以还是老规矩，简简单单的上代码。（申请账号这些就不介绍了）

1.依赖

<!--
  Microsoft Cognitive Services Speech client SDK.
  NOTE(review): the Java snippets in this article call the REST endpoints through
  HttpsURLConnection and do not reference any SDK class; confirm whether this
  dependency is actually required.
-->
<dependency>
    <groupId>com.microsoft.cognitiveservices.speech</groupId>
    <artifactId>client-sdk</artifactId>
    <version>1.12.1</version>
</dependency>

2.配置常量

/**
 * Constants shared by the TTS (text-to-speech) helpers: endpoint URLs, the
 * subscription key, the requested audio format and the token cache lifetime.
 *
 * <p>This is a pure constants holder; it is final and cannot be instantiated.
 */
public final class TtsConst {
    /**
     * Audio output format requested from the service (24 kHz, 48 kbit/s, mono MP3).
     * The original author reports this format gave the best results in their tests.
     */
    public static final String AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3";
    /**
     * URL used to request an access token.
     */
    public static final String ACCESS_TOKEN_URI = "https://eastasia.api.cognitive.microsoft.com/sts/v1.0/issuetoken";
    /**
     * Subscription (API) key. The value below is a placeholder and must be replaced
     * with a real key.
     */
    public static final String API_KEY = "你自己的 api key";
    /**
     * Lifetime given to the cached access token: 9 minutes, expressed in seconds.
     * NOTE(review): presumably chosen to stay just below the service-side token
     * validity; confirm against the service documentation.
     */
    public static final Integer ACCESS_TOKEN_EXPIRE_TIME = 9 * 60;
    /**
     * Gender value "Male".
     * NOTE(review): presumably passed as the gender argument when building the SSML
     * request; it is not referenced by the code shown alongside this class.
     */
    public static final String MALE = "Male";
    /**
     * URL of the speech synthesis (TTS) service.
     */
    public static final String TTS_SERVICE_URI = "https://eastasia.tts.speech.microsoft.com/cognitiveservices/v1";

    /** Not instantiable: this class only exposes static constants. */
    private TtsConst() {
    }
}

3.https连接

/**
 * Factory for the HTTPS connections used by the TTS helpers.
 */
public class HttpsConnection {

    /** Not instantiable: this class only exposes a static factory method. */
    private HttpsConnection() {
    }

    /**
     * Creates (but does not yet connect) an HTTPS connection to the given URL.
     *
     * @param connectingUrl absolute URL to open; must use the https scheme
     * @return an unconnected {@link HttpsURLConnection} for {@code connectingUrl}
     * @throws IllegalArgumentException if the URL does not yield an HTTPS connection
     *                                  (for example an {@code http://} URL)
     * @throws Exception if the URL is malformed or the connection object cannot be
     *                   created; kept as declared originally so existing callers
     *                   keep compiling
     */
    public static HttpsURLConnection getHttpsConnection(String connectingUrl) throws Exception {
        URL url = new URL(connectingUrl);
        java.net.URLConnection connection = url.openConnection();
        // Fail with a descriptive message instead of an opaque ClassCastException.
        if (!(connection instanceof HttpsURLConnection)) {
            throw new IllegalArgumentException("Not an https URL: " + connectingUrl);
        }
        return (HttpsURLConnection) connection;
    }
}

3.授权

@Component
@Slf4j
public class Authentication {
    @Resource
    private RedisCache redisCache;

    public String genAccessToken() {
        InputStream inSt;
        HttpsURLConnection webRequest;

        try {
            String accessToken = redisCache.get(RedisKey.KEY_TTS_ACCESS_TOKEN);
            if (StringUtils.isEmpty(accessToken)) {
                webRequest = HttpsConnection.getHttpsConnection(TtsConst.ACCESS_TOKEN_URI);
                webRequest.setDoInput(true);
                webRequest.setDoOutput(true);
                webRequest.setConnectTimeout(5000);
                webRequest.setReadTimeout(5000);
                webRequest.setRequestMethod("POST");

                byte[] bytes = new byte[0];
                webRequest.setRequestProperty("content-length", String.valueOf(bytes.length));
                webRequest.setRequestProperty("Ocp-Apim-Subscription-Key", TtsConst.API_KEY);
                webRequest.connect();

                DataOutputStream dop = new DataOutputStream(webRequest.getOutputStream());
                dop.write(bytes);
                dop.flush();
                dop.close();

                inSt = webRequest.getInputStream();
                InputStreamReader in = new InputStreamReader(inSt);
                BufferedReader bufferedReader = new BufferedReader(in);
                StringBuilder strBuffer = new StringBuilder();
                String line = null;
                while ((line = bufferedReader.readLine()) != null) {
                    strBuffer.append(line);
                }

                bufferedReader.close();
                in.close();
                inSt.close();
                webRequest.disconnect();

                accessToken = strBuffer.toString();
                //设置accessToken的过期时间为9分钟
                redisCache.set(RedisKey.KEY_TTS_ACCESS_TOKEN, accessToken, TtsConst.ACCESS_TOKEN_EXPIRE_TIME);
                log.info("New tts access token {}", accessToken);
            }
            return accessToken;
        } catch (Exception e) {
            log.error("Generate tts access token failed {}", e.getMessage());
        }
        return null;
    }
}

4.字节数组处理

public class ByteArray {
    private byte[] data;
    private int length;

    public ByteArray(){
        length = 0;
        data = new byte[length];
    }

    public ByteArray(byte[] ba){
        data = ba;
        length = ba.length;
    }

    /**
    合并数组
     */
    public  void cat(byte[] second, int offset, int length){

        if(this.length + length > data.length) {
            int allocatedLength = Math.max(data.length, length);
            byte[] allocated = new byte[allocatedLength << 1];
            System.arraycopy(data, 0, allocated, 0, this.length);
            System.arraycopy(second, offset, allocated, this.length, length);
            data = allocated;
        }else {
            System.arraycopy(second, offset, data, this.length, length);
        }

        this.length += length;
    }

    public  void cat(byte[] second){
        cat(second, 0, second.length);
    }

    public byte[] getArray(){
        if(length == data.length){
            return data;
        }

        byte[] ba = new byte[length];
        System.arraycopy(data, 0, ba, 0, this.length);
        data = ba;
        return ba;
    }

    public int getLength(){
        return length;
    }
}

5.创建SSML文件

@Slf4j
public class XmlDom {
    public static String createDom(String locale, String genderName, String voiceName, String textToSynthesize){
        Document doc = null;
        Element speak, voice;
        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = dbf.newDocumentBuilder();
            doc = builder.newDocument();
            if (doc != null){
                speak = doc.createElement("speak");
                speak.setAttribute("version", "1.0");
                speak.setAttribute("xml:lang", "en-us");
                voice = doc.createElement("voice");
                voice.setAttribute("xml:lang", locale);
                voice.setAttribute("xml:gender", genderName);
                voice.setAttribute("name", voiceName);

                voice.appendChild(doc.createTextNode(textToSynthesize));
                speak.appendChild(voice);
                doc.appendChild(speak);
            }
        } catch (ParserConfigurationException e) {
            log.error("Create ssml document failed: {}",e.getMessage());
            return null;
        }
        return transformDom(doc);
    }

    private static String transformDom(Document doc){
        StringWriter writer = new StringWriter();
        try {
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer transformer;
            transformer = tf.newTransformer();
            transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
            transformer.transform(new DOMSource(doc), new StreamResult(writer));
        } catch (TransformerException e) {
            log.error("Transform ssml document failed: {}",e.getMessage());
            return null;
        }
        return writer.getBuffer().toString().replaceAll("
|
", "");
    }
}

6.正主来了！TTS服务

@Slf4j
@Component
public class TtsService {

    @Resource
    private Authentication authentication;

    /**
     * 合成音频
     */
    public byte[] genAudioBytes(String textToSynthesize, String locale, String gender, String voiceName) {

        String accessToken = authentication.genAccessToken();
        if (StringUtils.isEmpty(accessToken)) {
            return new byte[0];
        }
        try {
            HttpsURLConnection webRequest = HttpsConnection.getHttpsConnection(TtsConst.TTS_SERVICE_URI);
            webRequest.setDoInput(true);
            webRequest.setDoOutput(true);
            webRequest.setConnectTimeout(5000);
            webRequest.setReadTimeout(300000);
            webRequest.setRequestMethod("POST");

            webRequest.setRequestProperty("Content-Type", "application/ssml+xml");
            webRequest.setRequestProperty("X-Microsoft-OutputFormat", TtsConst.AUDIO_24KHZ_48KBITRATE_MONO_MP3);
            webRequest.setRequestProperty("Authorization", "Bearer " + accessToken);
            webRequest.setRequestProperty("X-Search-AppId", "07D3234E49CE426DAA29772419F436CC");
            webRequest.setRequestProperty("X-Search-ClientID", "1ECFAE91408841A480F00935DC390962");
            webRequest.setRequestProperty("User-Agent", "TTSAndroid");
            webRequest.setRequestProperty("Accept", "*/*");

            String body = XmlDom.createDom(locale, gender, voiceName, textToSynthesize);
            if (StringUtils.isEmpty(body)) {
                return new byte[0];
            }
            byte[] bytes = body.getBytes();
            webRequest.setRequestProperty("content-length", String.valueOf(bytes.length));
            webRequest.connect();
            DataOutputStream dop = new DataOutputStream(webRequest.getOutputStream());
            dop.write(bytes);
            dop.flush();
            dop.close();

            InputStream inSt = webRequest.getInputStream();
            ByteArray ba = new ByteArray();

            int rn2 = 0;
            int bufferLength = 4096;
            byte[] buf2 = new byte[bufferLength];
            while ((rn2 = inSt.read(buf2, 0, bufferLength)) > 0) {
                ba.cat(buf2, 0, rn2);
            }

            inSt.close();
            webRequest.disconnect();

            return ba.getArray();
        } catch (Exception e) {
            log.error("Synthesis tts speech failed {}", e.getMessage());
        }
        return null;
    }
}

由于项目中需要将音频上传到OSS，所以这里生成的是字节数组（byte[]），你也可以选择下载或保存为音频文件。

三、问题及总结

1.问题

由于项目中需要生成超过10分钟的音频，我在调试中发现tts不能生成超过10分钟的音频，尴尬了呀，在微软官网中摸索了半天也没找到生成超过10分钟音频的办法，放弃了吗？不可能的。在我感觉到无计可施的时候，我的脑海中蹦出了四个字，那就是“断点续传”。我就想能不能把内容分成两段，通过tts分别生成字节数组，拼接之后再上传到OSS。说干就干，没想到最后真的可行。成功那一瞬间的感觉无法言喻呀。不废话了，嗯，上大妈，哦不是，上代码。太激动了。

    /**
     * Generates audio for Chinese text, synthesizing it in several requests when
     * it is longer than 2600 characters.
     *
     * <p>Text of up to 2600 characters is sent in one request. Longer text is cut
     * into the smallest number of nearly equal parts of at most 2600 characters
     * each (two halves for up to 5200 characters), and the resulting audio is
     * concatenated in order. A cut never separates the two halves of a surrogate
     * pair.
     *
     * @param gender         gender of the voice
     * @param chapterContent text to synthesize; must not be {@code null}
     * @param locale         locale of the voice
     * @param voiceName      name of the voice
     * @return the concatenated audio; if synthesis of any part yields {@code null}
     *         or an empty array, that value is returned as-is instead of partial audio
     */
    public byte[] getZHAudioBuffer(String gender, String chapterContent, String locale, String voiceName) {
        final int maxChars = 2600;
        final int total = chapterContent.length();
        if (total <= maxChars) {
            return ttsService.genAudioBytes(chapterContent, locale, gender, voiceName);
        }

        // ceil(total / maxChars): the fewest parts that each fit within the limit.
        final int partCount = (total + maxChars - 1) / maxChars;
        ByteArray merged = new ByteArray();
        int start = 0;
        for (int i = 1; i <= partCount; i++) {
            // Evenly spaced cut points; computed in long to avoid int overflow.
            int end = (int) ((long) total * i / partCount);
            // Move the cut back by one char if it would split a surrogate pair.
            if (end < total
                    && Character.isLowSurrogate(chapterContent.charAt(end))
                    && Character.isHighSurrogate(chapterContent.charAt(end - 1))) {
                end--;
            }
            byte[] part = ttsService.genAudioBytes(chapterContent.substring(start, end), locale, gender, voiceName);
            if (part == null || part.length == 0) {
                // Propagate the failure rather than returning truncated audio.
                return part;
            }
            merged.cat(part);
            start = end;
        }
        return merged.getArray();
    }

    /**
     * Generates audio for English text, synthesizing it in several requests when
     * it contains more than 1500 space-separated words.
     *
     * <p>Text of up to 1500 words is sent unchanged in one request. Longer text is
     * cut into consecutive parts of at most 1500 words, each re-joined with single
     * spaces, and the resulting audio is concatenated in order.
     *
     * @param gender         gender of the voice
     * @param chapterContent text to synthesize; must not be {@code null}
     * @param locale         locale of the voice
     * @param voiceName      name of the voice
     * @return the concatenated audio; if synthesis of any part yields {@code null}
     *         or an empty array, that value is returned as-is instead of partial audio
     */
    public byte[] getUSAudioBuffer(String gender, String chapterContent, String locale, String voiceName) {
        final int maxWords = 1500;
        String[] words = chapterContent.split(" ");
        if (words.length <= maxWords) {
            return ttsService.genAudioBytes(chapterContent, locale, gender, voiceName);
        }

        ByteArray merged = new ByteArray();
        for (int start = 0; start < words.length; start += maxWords) {
            int end = Math.min(start + maxWords, words.length);
            String chunk = String.join(" ", java.util.Arrays.copyOfRange(words, start, end));
            byte[] part = ttsService.genAudioBytes(chunk, locale, gender, voiceName);
            if (part == null || part.length == 0) {
                // Propagate the failure rather than returning truncated audio.
                return part;
            }
            merged.cat(part);
        }
        return merged.getArray();
    }

我要说的都在代码里了，你细品。（PS：中文的2600个字符和英文的1500个单词（按空格切分），是我调试出来的，按这个长度生成的音频肯定是在10分钟以内的）

2.总结

微软TTS还是挺香的，嗯，总结很到位，我继续摸索其他功能去了。

相关阅读:
五分钟完成 ABP vNext 通讯录 App 开发
 .NET Conf： Xamarin专场会议3.23 开幕
 2020 年中国.NET开发者调查报告
 推荐一个很棒的开源工作流elsa-core
尝试使用 Visual Studio Online (Cloud IDE)
Mongo2Go 介绍
 DevExpress作为企业赞助商加入.NET基金会
 【新书推荐】《ASP.NET Core微服务实战：在云环境中开发、测试和部署跨平台服务》带你走近微服务开发
 云原生时代来看看十年前李彦宏、马化腾和马云对云计算的评价
 .NET 在云原生时代的蜕变，让我在云时代脱颖而出
原文地址：https://www.cnblogs.com/aohongzhu/p/15174381.html