使用 Python 和 Pocketsphinx 进行实时识别
声明:本页面是StackOverFlow热门问题的中英对照翻译,遵循CC BY-SA 4.0协议,如果您需要使用它,必须同样遵循CC BY-SA许可,注明原文地址和作者信息,同时你必须将它归于原作者(不是我):StackOverFlow
原文地址: http://stackoverflow.com/questions/14307816/
Warning: these are provided under cc-by-sa 4.0 license. You are free to use/share it, But you must attribute it to the original authors (not me):
StackOverFlow
Live recognition with Python and Pocketsphinx
提问by Tonderai Ratisai
I have recently been working with pocket sphinx in python. I have successfully got the example below to work recognising a recorded wav.
我最近一直在使用 python 中的口袋狮身人面像。我已经成功地得到了下面的例子来识别录制的 wav。
#!/usr/bin/env python
import sys,os
def decodeSpeech(hmmd,lmdir,dictp,wavfile):
"""
Decodes a speech file
"""
try:
import pocketsphinx as ps
import sphinxbase
except:
print """Pocket sphinx and sphixbase is not installed
in your system. Please install it with package manager.
"""
speechRec = ps.Decoder(hmm = hmmd, lm = lmdir, dict = dictp)
wavFile = file(wavfile,'rb')
wavFile.seek(44)
speechRec.decode_raw(wavFile)
result = speechRec.get_hyp()
return result[0]
if __name__ == "__main__":
hmdir = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/hmm/wsj1"
lmd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.3e-7.vp.tg.lm.DMP"
dictd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.dic"
wavfile = "/home/jaganadhg/Desktop/Docs_New/kgisl/sa1.wav"
recognised = decodeSpeech(hmdir,lmd,dictd,wavfile)
print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
print recognised
print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
The problem is how can I do real time speech recognition from a microphone? In a while loop with a if statement so that if a set word is recognised from the microphone a function can be called?
问题是如何通过麦克风进行实时语音识别?在带有 if 语句的 while 循环中,如果从麦克风中识别出设置的单词,则可以调用函数?
回答by Nikolay Shmyrev
The code for realtime recognition looks like this:
实时识别的代码如下所示:
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'en-us/en-us'))
config.set_string('-lm', path.join(MODELDIR, 'en-us/en-us.lm.bin'))
config.set_string('-dict', path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))
config.set_string('-logfn', '/dev/null')
decoder = Decoder(config)
import pyaudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
stream.start_stream()
in_speech_bf = False
decoder.start_utt()
while True:
buf = stream.read(1024)
if buf:
decoder.process_raw(buf, False, False)
if decoder.get_in_speech() != in_speech_bf:
in_speech_bf = decoder.get_in_speech()
if not in_speech_bf:
decoder.end_utt()
print 'Result:', decoder.hyp().hypstr
decoder.start_utt()
else:
break
decoder.end_utt()
You can also use gstreamer python bindings in pocketsphinx, check livedemo.py
您还可以在pocketsphinx 中使用gstreamer python 绑定,查看livedemo.py
回答by user1953384
回答by V? Hu?nh
This is the code I see on the internet and I've modified a few things to really listen to the words very bad and slow You can help me modify it for good. It is built on ubuntu 16.04 LTS I do not know much about programming Looking forward to help
这是我在网上看到的代码,我修改了一些东西来真正听听非常糟糕和缓慢的话你可以帮我修改一下。它建立在 ubuntu 16.04 LTS 上我对编程不太了解 期待帮助
# -*- encoding: utf-8 -*-
#!/usr/bin/env python
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math;import Mic
"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""
class SpeechDetector:
def __init__(self):
# Microphone stream config.
self.CHUNK = 1024 # CHUNKS of bytes to read each time from mic
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 1
self.RATE = 16000
self.SILENCE_LIMIT = 1 # Silence limit in seconds. The max ammount of seconds where
# only silence is recorded. When this time passes the
# recording finishes and the file is decoded
self.PREV_AUDIO = 0.5 # Previous audio (in seconds) to prepend. When noise
# is detected, how much of previously recorded audio is
# prepended. This helps to prevent chopping the beginning
# of the phrase.
self.THRESHOLD = 4500
self.num_phrases = -1
# These will need to be modified according to where the pocketsphinx folder is
MODELDIR = "/home/l/Desktop/pocketsphinx/model/en-us"
# Create a decoder with certain model
config = Decoder.default_config()
config.set_string('-hmm', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/en-us/'))
config.set_string('-lm', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/en-us.lm.bin'))
config.set_string('-dict', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/cmudict-en-us.dict'))
config.set_string('-keyphrase', 'no one')
config.set_float('-kws_threshold', 1e+20)
# Creaders decoder object for streaming data.
self.decoder = Decoder(config)
def setup_mic(self, num_samples=50):
""" Gets average audio intensity of your mic sound. You can use it to get
average intensities while you're talking and/or silent. The average
is the avg of the .2 of the largest intensities recorded.
"""
#print "Getting intensity values from mic."
p = pyaudio.PyAudio()
stream = p.open(format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK)
values = [math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
for x in range(num_samples)]
values = sorted(values, reverse=True)
r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
#print " Finished "
#print " Average audio intensity is ", r
stream.close()
p.terminate()
if r < 3000:
self.THRESHOLD = 3500
else:
self.THRESHOLD = r + 100
def save_speech(self, data, p):
"""
Saves mic data to temporary WAV file. Returns filename of saved
file
"""
filename = 'output_'+str(int(time.time()))
# writes data to WAV file
data = ''.join(data)
wf = wave.open(filename + '.wav', 'wb')
wf.setnchannels(1)
wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
wf.setframerate(16000) # TODO make this value a function parameter?
wf.writeframes(data)
wf.close()
return filename + '.wav'
def decode_phrase(self, wav_file):
self.decoder.start_utt()
stream = open(wav_file, "rb")
while True:
buf = stream.read(1024)
if buf:
self.decoder.process_raw(buf, False, False)
else:
break
self.decoder.end_utt()
words = []
[words.append(seg.word) for seg in self.decoder.seg()]
return words
def run(self):
"""
Listens to Microphone, extracts phrases from it and calls pocketsphinx
to decode the sound
"""
self.setup_mic()
#Open stream
p = pyaudio.PyAudio()
stream = p.open(format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK)
audio2send = []
cur_data = '' # current chunk of audio data
rel = self.RATE/self.CHUNK
slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
#Prepend audio from 0.5 seconds before noise was detected
prev_audio = deque(maxlen=self.PREV_AUDIO * rel)
started = False
while True:
cur_data = stream.read(self.CHUNK)
slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
if sum([x > self.THRESHOLD for x in slid_win]) > 0:
if started == False:
print "B?t ??u ghi am"
started = True
audio2send.append(cur_data)
elif started:
print "Hoàn thành ghi am"
filename = self.save_speech(list(prev_audio) + audio2send, p)
r = self.decode_phrase(filename)
print "RESULT: ", r
# hot word for me " no one" if r.count('one') and r.count("no") > 0 the end programs
if r.count("one") > 0 and r.count("no") > 0:
Mic.playaudiofromAudio().play("/home/l/Desktop/PROJECT/Audio/beep_hi.wav")
os.remove(filename)
return
# Removes temp audio file
os.remove(filename)
# Reset all
started = False
slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
prev_audio = deque(maxlen= 0.5 * rel)
audio2send = []
print "Ch? ?? nghe ..."
else:
prev_audio.append(cur_data)
print "* Hoàn thành nghe"
stream.close()
p.terminate()

