使用 Watson for Python 连续实时语音转文本答案

【问题标题】：Continuous Real Time Speech to Text with Watson for Python使用 Watson for Python 连续实时语音转文本
【发布时间】：2018-04-08 13:43:42
【问题描述】：

我正在尝试创建一个小型 Python 程序，它可以让我使用来自 Watson 服务器的麦克风实时获取文本，类似于 here 的工作方式。

这是我想出的代码，但在我完成录制后它会得到文本：

import pyaudio
import json
from watson_developer_cloud import SpeechToTextV1

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 10

p = pyaudio.PyAudio()

stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
print("* recording")

frames = []

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)

print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()

data_feed = b''.join(frames)

speech_to_text = SpeechToTextV1(
    username='secret',
    password='secret too',
    x_watson_learning_opt_out=False
)

result = speech_to_text.recognize(data_feed,
                                  content_type="audio/l16;rate=44100;channels=2",
                                  word_confidence=True,
                                  max_alternatives=4,
                                  word_alternatives_threshold=0.5,
                                  model="en-US_BroadbandModel",
                                  continuous=True)

j = json.dumps(result, indent=2)
print(j)

【问题讨论】：

您是否正在尝试解决某个特定问题？您是否尝试过逐行浏览代码？
嗨@alex，目前python的SDK仅限于使用音频文件，而不是通过麦克风获取直接音频。我目前正在开展一个项目，该项目可以在使用麦克风时获取实时文本。
我现在正在使用 websockets 来做这件事，我想我明天可能会有一些可靠的东西。

标签： python machine-learning websocket speech-to-text ibm-watson

【解决方案1】：

我继续从头开始创建了一个程序，以使用 websockets 连接到 Watson 服务器。它仍然没有完全符合我的预期，但非常接近。

音频正在实时发送到服务器，但我会在录制完成后获取脚本。

import asyncio
import websockets
import json
import requests
import pyaudio
import time

# Variables to use for recording audio
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 16000

p = pyaudio.PyAudio()

# This is the language model to use to transcribe the audio
model = "en-US_BroadbandModel"

# These are the urls we will be using to communicate with Watson
default_url = "https://stream.watsonplatform.net/speech-to-text/api"
token_url = "https://stream.watsonplatform.net/authorization/api/v1/token?" \
            "url=https://stream.watsonplatform.net/speech-to-text/api"
url = "wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize?model=en-US_BroadbandModel"

# BlueMix app credentials
username = ""   # Your Bluemix App username
password = ""   # Your Bluemix App password

# Send a request to get an authorization key
r = requests.get(token_url, auth=(username, password))
auth_token = r.text
token_header = {"X-Watson-Authorization-Token": auth_token}

# Params to use for Watson API
params = {
    "word_confidence": True,
    "content_type": "audio/l16;rate=16000;channels=2",
    "action": "start",
    "interim_results": True
}

# Opens the stream to start recording from the default microphone
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                output=True,
                frames_per_buffer=CHUNK)


async def send_audio(ws):
    # Starts recording of microphone
    print("* READY *")

    start = time.time()
    while True:
        try:
            print(".")
            data = stream.read(CHUNK)
            await ws.send(data)
            if time.time() - start > 20:    # Records for n seconds
                await ws.send(json.dumps({'action': 'stop'}))
                return False
        except Exception as e:
            print(e)
            return False

    # Stop the stream and terminate the recording
    stream.stop_stream()
    stream.close()
    p.terminate()


async def speech_to_text():
    async with websockets.connect(url, extra_headers=token_header) as conn:
        # Send request to watson and waits for the listening response
        send = await conn.send(json.dumps(params))
        rec = await conn.recv()
        print(rec)
        asyncio.ensure_future(send_audio(conn))

        # Keeps receiving transcript until we have the final transcript
        while True:
            try:
                rec = await conn.recv()
                parsed = json.loads(rec)
                transcript = parsed["results"][0]["alternatives"][0]["transcript"]
                print(transcript)
                #print(parsed)
                if "results" in parsed:
                    if len(parsed["results"]) > 0:
                        if "final" in parsed["results"][0]:
                            if parsed["results"][0]["final"]:
                                #conn.close()
                                #return False
                                pass
            except KeyError:
                conn.close()
                return False

# Starts the application loop
loop = asyncio.get_event_loop()
loop.run_until_complete(speech_to_text())
loop.close()

所以我现在只想在通过麦克风录音的同时获取成绩单。

【讨论】：

看这里：ibm.com/blogs/watson/2016/07/…它使用arecord录制麦克风并将音频实时发送到云端