wukong-robot--TTS和ASR

前言

这里实际上就是调用百度提供的 AipSpeech 接口库。
语音合成:
就是将文字上传，接口返回的是语音文件，将语音文件保存在本地
语音识别：
上传一段 60 秒以内的语音文件，接口返回识别出的文本内容
下面是示例代码
这里会有一个问题：
语音识别：报错
{'err_msg': 'request pv too much', 'err_no': 3305, 'sn': '399602018291622379966'}
目前查看有两种情况：1、需要将 dev_pid 修改为 1537；2、账号没有相关的额度
from aip import AipSpeech
from abc import ABCMeta, abstractmethod
import wave
from pydub import AudioSegment
# Baidu speech credentials used by the ASR demo in the __main__ section.
# NOTE(review): secrets are committed in source here; prefer loading them from
# configuration or the environment, and rotate these keys if they are real.
APP_ID = '15422825'
APP_KEY = 'DhXGtWHYMujMVZZGRI3a7rzb'
SECRET_KEY = 'PbyUvTL31fImGthOOIP5ZbbtEOGwGOoT'

class AbstractTTS(object, metaclass=ABCMeta):
    """
    Generic parent class for all TTS engines.

    Subclasses must implement ``get_speech``. They may override
    ``get_config`` to supply the keyword arguments that ``get_instance``
    passes to the constructor.

    The metaclass is declared with the Python 3 ``metaclass=`` keyword; a
    ``__metaclass__`` class attribute is ignored by Python 3, which would
    leave ``@abstractmethod`` unenforced.
    """

    @classmethod
    def get_config(cls):
        """Return the constructor keyword arguments for this engine.

        The base implementation returns an empty dict.
        """
        return {}

    @classmethod
    def get_instance(cls):
        """Build an instance of ``cls`` from the mapping returned by ``get_config``."""
        profile = cls.get_config()
        instance = cls(**profile)
        return instance

    @abstractmethod
    def get_speech(self, phrase):
        """Synthesize ``phrase`` to speech; must be implemented by subclasses."""
        pass

class BaiduTTS(AbstractTTS):
    """
    Text-to-speech engine backed by Baidu's speech synthesis service.

    To use this module, register a developer account at yuyin.baidu.com,
    create a new application, then copy the API Key and Secret Key shown
    under "查看key" in the application management page into config.yml:

        ...
        baidu_yuyin:
            appid: '9670645'
            api_key: 'qg4haN8b2bGvFtCbBGqhrmZy'
            secret_key: '585d4eccb50d306c401d7df138bb02e7'
            dev_pid: 1936
            per: 1
            lan: 'zh'
        ...
    """

    SLUG = "baidu-tts"

    def __init__(self, appid, api_key, secret_key, per=1, lan='zh', **args):
        """Create the Baidu client.

        Args:
            appid: Baidu application id.
            api_key: Baidu API key.
            secret_key: Baidu secret key.
            per: Voice selector; stored as a string and sent as the ``per``
                synthesis option.
            lan: Language code passed to ``synthesis``.
            **args: Extra configuration keys; accepted and ignored.
        """
        # Zero-argument super(): super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super().__init__()
        self.client = AipSpeech(appid, api_key, secret_key)
        self.per, self.lan = str(per), lan

    @classmethod
    def get_config(cls):
        """Return constructor keyword arguments for this engine.

        No configuration source is wired in yet, so this returns an empty
        dict (a mapping, so that ``get_instance`` can unpack it with ``**``).
        """
        # TODO: return config.get('baidu_yuyin', {}) once a config module exists.
        return {}

    def get_speech(self, phrase, output_path='audio.mp3'):
        """Synthesize ``phrase`` and save the audio to ``output_path``.

        Args:
            phrase: Text to synthesize.
            output_path: Destination file for the returned audio bytes.

        Returns:
            ``output_path`` on success, or ``None`` when the service
            returned an error dict instead of audio data.
        """
        result = self.client.synthesis(phrase, self.lan, 1, {'per': self.per})
        # On success the client returns the audio as bytes; on failure it
        # returns a dict describing the error.
        if isinstance(result, dict):
            print('{} 合成失败！'.format(self.SLUG), result)
            return None
        with open(output_path, 'wb') as f:
            f.write(result)
        print('{} 语音合成成功，合成路径：{}'.format(self.SLUG, output_path))
        return output_path


class BaiduASR():
    """
    Speech recognition through Baidu's ASR API.

    dev_pid selects the recognition model:
        - 1936: Mandarin, far-field
        - 1536: Mandarin (with simple English recognition)
        - 1537: Mandarin (Chinese only)
        - 1737: English
        - 1637: Cantonese
        - 1837: Sichuan dialect

    To use this module, register a developer account at yuyin.baidu.com,
    create a new application, then copy the API Key and Secret Key shown
    under "查看key" in the application management page into config.yml:

        ...
        baidu_yuyin:
            appid: '9670645'
            api_key: 'qg4haN8b2bGvFtCbBGqhrmZy'
            secret_key: '585d4eccb50d306c401d7df138bb02e7'
        ...
    """

    SLUG = "baidu-asr"

    def __init__(self, appid, api_key, secret_key, dev_pid=1537, **args):
        """Create the Baidu client.

        Args:
            appid: Baidu application id.
            api_key: Baidu API key.
            secret_key: Baidu secret key.
            dev_pid: Recognition model id (see the class docstring). The
                default is 1537, the value this class previously forced
                regardless of the argument, so default callers are unchanged.
            **args: Extra configuration keys; accepted and ignored.
        """
        super().__init__()
        self.client = AipSpeech(appid, api_key, secret_key)
        self.dev_pid = dev_pid

    @classmethod
    def get_config(cls):
        """Return constructor keyword arguments for this engine.

        No configuration source is wired in yet, so this returns an empty dict.
        """
        return {}

    def transcribe(self, wav_path):
        """Recognize the speech in a local WAV file.

        Args:
            wav_path: Path to the WAV file. Its frames are sent as PCM with
                a declared sample rate of 16000.

        Returns:
            The recognized text (all result entries joined), or an empty
            string when the service reports an error.
        """
        print('dev_pid：{}'.format(self.dev_pid))
        # Context manager so the file handle is closed even if reading fails.
        with wave.open(wav_path, 'rb') as wav:
            pcm = wav.readframes(wav.getnframes())
        res = self.client.asr(pcm, 'pcm', 16000, {
            'dev_pid': self.dev_pid,
        })
        print(res)
        if res.get('err_no') == 0:
            print('{} 语音识别到了：{}'.format(self.SLUG, res['result']))
            return ''.join(res['result'])
        print('{} 语音识别出错了: {}'.format(self.SLUG, res.get('err_msg')))
        return ''

if __name__ == '__main__':
    # Demo round trip: synthesize a phrase to MP3, convert it to WAV, then
    # send the WAV back through speech recognition.
    # Instances get their own names so the BaiduTTS / BaiduASR classes are
    # not shadowed by their instances.
    tts = BaiduTTS(appid="24226351", api_key='V0rrRk2k95ytnsOTV8ZCqtX7', secret_key='bxuKH3eFu50akaKgYYuEYvR1aOhQgqUK')
    tts.get_speech(phrase="今天是个好额日子")
    song = AudioSegment.from_mp3("audio.mp3")
    # transcribe() sends the frames as PCM declared at 16000 Hz, so
    # normalise the audio to 16 kHz / mono / 16-bit before exporting.
    song = song.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    song.export("audio.wav", format="wav")
    asr = BaiduASR(appid=APP_ID, api_key=APP_KEY, secret_key=SECRET_KEY)
    asr.transcribe(wav_path="audio.wav")
'''

# 与百度进行一次加密校验,认证你是合法用户合法的应用
# AipSpeech是百度语音的客户端,认证成功之后,客户端将被开启,这里的client就是已经开启的百度语音的客户端了
client = AipSpeech(APP_ID,APP_KEY,SECRET_KEY)
str = '今天天气怎么样？'
result = client.synthesis(
    str,  # text:合成的文本,使用UTF-8编码,请注意文本长度必须小于1024字节
    'zh',           # lang:语言,中文:zh,英文:en
    1,              # ctp:客户端信息这里就写1,写别的不好使,至于为什么咱们以后再解释
    {
        'vol':5,    # 合成音频文件的准音量
        'spd':4,    # 语速取值0-9,默认为5中语速
        'pit':8,    # 语调音量,取值0-9,默认为5中语调
        'per':4     # 发音人选择,0为女声,1为男生,3为情感合成-度逍遥,4为情感合成-度丫丫,默认为普通女
    } # options:这是一个dict类型的参数,里面的键值对才是关键.
)
# 如果上面的三个参数APP_ID,APP_KEY,SECRET_KEY填写正确的话
# result就是音频文件的二进制文件流,如果返回失败的话,result就会是个字典
print(result)

if not isinstance(result,dict):
    with open('audio.mp3','wb') as f:
        f.write(result)


# 识别正确返回语音二进制文件流,错误则返回dict,参照下面错误代码.
"""
result = {
            'err_detail': 'Tex length exceeds limit.',
            'err_msg': 'parameter error.',
            'err_no': 501,
            'err_subcode': 10,
            'tts_logid': 3257246120
        }
"""
'''