#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Classes and functions for transcribing speech in audio data to text.
"""
# Part of the PsychoPy library
# Copyright (C) 2002-2018 Jonathan Peirce (C) 2019-2025 Open Science Tools Ltd.
# Distributed under the terms of the GNU General Public License (GPL).
__all__ = [
'TranscriptionResult',
'transcribe',
'TRANSCR_LANG_DEFAULT',
'BaseTranscriber',
'recognizerEngineValues',
'recognizeSphinx',
'recognizeGoogle',
'getAllTranscriberInterfaces',
'getTranscriberInterface',
'setupTranscriber',
'getActiveTranscriber',
'getActiveTranscriberEngine',
'submit'
]
import importlib
import json
import sys
import os
import psychopy.logging as logging
from psychopy.alerts import alert
from pathlib import Path
from psychopy.preferences import prefs
from .audioclip import *
from .exceptions import *
import numpy as np
# ------------------------------------------------------------------------------
# Initialize the speech recognition system
#
# _hasSpeechRecognition = True
# try:
# import speech_recognition as sr
# except (ImportError, ModuleNotFoundError):
# logging.warning(
# "Speech-to-text recognition module for PocketSphinx is not available "
# "(use command `pip install SpeechRecognition` to get it). "
# "Transcription will be unavailable using that service this session.")
# _hasSpeechRecognition = False
# Google Cloud API
# _hasGoogleCloud = True
# _googleCloudClient = None # client for Google Cloud, instanced on first use
# try:
# import google.cloud.speech
# import google.auth.exceptions
# except (ImportError, ModuleNotFoundError):
# logging.warning(
# "Speech-to-text recognition using Google online services is not "
# "available (use command `pip install google-api-core google-auth "
# "google-cloud google-cloud-speech googleapis-common-protos` to get "
# "it). Transcription will be unavailable using that service this "
# "session.")
# _hasGoogleCloud = False
# Check if PocketSphinx is available and find its installed language packs.
# These names (`haveSphinx`, `sphinxLangs`) are referenced later in this
# module, so this block must run at import time.
try:
    import pocketsphinx
    sphinxLangs = [folder.stem for folder
                   in Path(pocketsphinx.get_model_path()).glob('??-??')]
    haveSphinx = True
except (ImportError, ModuleNotFoundError):
    haveSphinx = False
    sphinxLangs = None
# Constants related to the transcription system.
TRANSCR_LANG_DEFAULT = 'en-US'
# Values for specifying recognizer engines. This dictionary is used by Builder
# to populate the component property dropdown.
recognizerEngineValues = {
0: ('sphinx', "CMU Pocket Sphinx", "Offline"),
1: ('google', "Google Cloud Speech API", "Online, Key Required"),
2: ('whisper', "OpenAI Whisper", "Offline, Built-in")
}
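# For illustration, this mapping can be flattened into human-readable dropdown
# labels such as "CMU Pocket Sphinx (Offline)". A minimal sketch (the
# `engineLabels` name here is hypothetical):
#
#     engineLabels = {
#         key: "{} ({})".format(longName, status)
#         for key, longName, status in recognizerEngineValues.values()}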
# the active transcriber interface
_activeTranscriber = None
# ------------------------------------------------------------------------------
# Exceptions for the speech recognition interface
#
class TranscriberError(Exception):
"""Base class for transcriber exceptions.
"""
pass
class TranscriberNotSetupError(TranscriberError):
"""Exception raised when a transcriber interface has not been setup.
"""
pass
# ------------------------------------------------------------------------------
# Classes and functions for speech-to-text transcription
#
class TranscriptionResult:
"""Descriptor for returned transcription data.
Fields within this class can be used to access transcribed words and other
information related to the transcription request.
This is returned by functions and methods which perform speech-to-text
transcription from audio data within PsychoPy. The user usually does not
create instances of this class themselves.
Parameters
----------
words : list of str
Words extracted from the audio clip.
unknownValue : bool
        `True` if the transcription API failed to make sense of the audio and
        did not complete the transcription.
requestFailed : bool
`True` if there was an error with the transcriber itself. For instance,
network error or improper formatting of the audio data.
engine : str
Name of engine used to perform this transcription.
language : str
Identifier for the language used to perform the transcription.
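    Examples
    --------
    Check a result returned by `transcribe()` (a brief sketch)::

        result = transcribe(myRecording)
        if result.success:  # transcription completed without error
            print(result.words)      # e.g. ['hello', 'world']
            print(result.wordCount)  # e.g. 2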
"""
__slots__ = [
'_words',
'_wordData', # additional word data
'_text', # unused for now, will be used in future
'_confidence', # unused on Python for now
'_response',
'_engine',
'_language',
'_expectedWords',
'_requestFailed',
'_unknownValue']
    def __init__(self, words, unknownValue, requestFailed, engine, language):
        self.words = words
        self.unknownValue = unknownValue
        self.requestFailed = requestFailed
        self.engine = engine
        self.language = language
        # initialize other fields
        self._wordData = None
        self._text = ""
        self._confidence = 0.0
        self._response = None
        self._expectedWords = None
    def __repr__(self):
        return (f"TranscriptionResult(words={self._words}, "
                f"unknownValue={self._unknownValue}, "
                f"requestFailed={self._requestFailed}, "
                f"engine={self._engine}, "
                f"language={self._language})")
def __str__(self):
return " ".join(self._words)
def __json__(self):
return str(self)
@property
def wordCount(self):
"""Number of words found (`int`)."""
return len(self._words)
@property
def words(self):
"""Words extracted from the audio clip (`list` of `str`)."""
return self._words
@property
def text(self):
"""Text transcribed for the audio data (`str`).
"""
return self._text
@words.setter
def words(self, value):
self._words = list(value)
@property
def response(self):
"""Raw API response from the transcription engine (`str`).
"""
return self._response
@response.setter
def response(self, val):
self._response = val
    @property
    def responseData(self):
        """Values from `self.response`, parsed into a `dict`. Returns `None`
        if there is no response data to parse.
        """
        if self._response is None:
            return None
        return json.loads(self._response)

    @responseData.setter
    def responseData(self, val):
        # store as JSON so the getter can parse it back into a `dict`
        self._response = json.dumps(val)
@property
def wordData(self):
"""Additional data about each word (`list`).
Not all engines provide this data in the same format or at all.
"""
return self._wordData
@wordData.setter
def wordData(self, val):
self._wordData = val
def getSpeechInterval(self):
"""Get the start and stop times for the interval of speech in the audio
clip.
This feature is only supported by the Whisper transcriber. The start and
end times of the speech interval are returned in seconds.
Returns
-------
tuple
Start and end times of the speech interval in seconds. If the engine
does not support this feature, or if the data is missing,
`(None, None)` is returned. In cases where either the start or end
time is missing, the value will be `None` for that field.
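        Examples
        --------
        Compute how long the speaker talked for (a sketch, assuming `result`
        is a Whisper transcription result that reported segment data)::

            onset, offset = result.getSpeechInterval()
            if onset is not None and offset is not None:
                speechDuration = offset - onset  # seconds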
"""
nullData = (None, None) # default return value if no data
if self._engine in ('sphinx', 'google'):
logging.warning(
"Method `getSpeechInterval` is not supported for the "
"transcription engine `{}`.".format(self._engine))
return nullData
        elif self._engine == 'whisper':
            if self.responseData is None:
                return nullData
            # segment data is stored in the JSON-formatted response data
            segmentData = self.responseData.get('segments', None)
            if segmentData is None:
                return nullData
            # keys are segment indices
            segmentKeys = list(segmentData.keys())
            if len(segmentKeys) == 0:
                return nullData
            # sort segment keys to ensure monotonic ordering
            segmentKeys.sort()
            # get first and last segment
            firstSegment = segmentData.get(segmentKeys[0], None)
            lastSegment = segmentData.get(segmentKeys[-1], None)
            if firstSegment is None or lastSegment is None:
                return nullData
            # get speech onset/offset times
            speechOnset = firstSegment.get('start', None)
            speechOffset = lastSegment.get('end', None)
            # return start and end times
            return speechOnset, speechOffset

        return nullData  # unknown engine, no interval data available
@property
def success(self):
"""`True` if the transcriber returned a result successfully (`bool`)."""
return not (self._unknownValue or self._requestFailed)
@property
def error(self):
"""`True` if there was an error during transcription (`bool`). Value is
always the compliment of `.success`."""
return not self.success
@property
def unknownValue(self):
"""`True` if the transcription API failed make sense of the audio and
did not complete the transcription (`bool`).
"""
return self._unknownValue
@unknownValue.setter
def unknownValue(self, value):
self._unknownValue = bool(value)
@property
def requestFailed(self):
"""`True` if there was an error with the transcriber itself (`bool`).
For instance, network error or improper formatting of the audio data,
invalid key, or if there was network connection error.
"""
return self._requestFailed
@requestFailed.setter
def requestFailed(self, value):
self._requestFailed = bool(value)
@property
def engine(self):
"""Name of engine used to perform this transcription (`str`).
"""
return self._engine
@engine.setter
def engine(self, value):
if value == 'sphinx':
if not haveSphinx:
raise ModuleNotFoundError(
"To perform built-in (local) transcription you need to "
"have pocketsphinx installed (pip install pocketsphinx)")
self._engine = str(value)
@property
def language(self):
"""Identifier for the language used to perform the transcription
(`str`).
"""
return self._language
@language.setter
def language(self, value):
self._language = str(value)
# empty result returned when a transcriber is given no data
NULL_TRANSCRIPTION_RESULT = TranscriptionResult(
words=[''],
unknownValue=False,
requestFailed=False,
engine='null',
language=TRANSCR_LANG_DEFAULT
)
# ------------------------------------------------------------------------------
# Transcription interfaces
#
class BaseTranscriber:
"""Base class for text-to-speech transcribers.
This class defines the API for transcription, which is an interface to a
speech-to-text engine. All transcribers must be sub-classes of this class
and implement all members of this class.
Parameters
----------
initConfig : dict or None
Options to configure the speech-to-text engine during initialization.
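    Examples
    --------
    A minimal custom transcriber (a sketch; the `EchoTranscriber` class and
    its engine name are hypothetical)::

        class EchoTranscriber(BaseTranscriber):
            _isLocal = True
            _engine = u'echo'
            _longName = u"Echo (testing only)"

            def transcribe(self, audioClip, modelConfig=None,
                           decoderConfig=None):
                # a real implementation would pass `audioClip` to an engine
                self._lastResult = NULL_TRANSCRIPTION_RESULT
                return self._lastResult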
"""
_isLocal = True
_engine = u'Null'
_longName = u"Null"
_lastResult = None
def __init__(self, initConfig=None):
self._initConf = initConfig
@property
def longName(self):
"""Human-readable name of the transcriber (`str`).
"""
return self._longName
@property
def engine(self):
"""Identifier for the transcription engine which this object interfaces
with (`str`).
"""
return self._engine
@property
def isLocal(self):
"""`True` if the transcription engine works locally without sending data
to a remote server.
"""
return self._isLocal
@property
def isComplete(self):
"""`True` if the transcriber has completed its transcription. The result
can be accessed through `.lastResult`.
"""
return True
@property
def lastResult(self):
"""Result of the last transcription.
"""
return self._lastResult
@lastResult.setter
def lastResult(self, val):
self._lastResult = val
def transcribe(self, audioClip, modelConfig=None, decoderConfig=None):
"""Perform speech-to-text conversion on the provided audio samples.
Parameters
----------
audioClip : :class:`~psychopy.sound.AudioClip`
Audio clip containing speech to transcribe (e.g., recorded from a
microphone).
modelConfig : dict or None
Additional configuration options for the model used by the engine.
decoderConfig : dict or None
Additional configuration options for the decoder used by the engine.
Returns
-------
TranscriptionResult
Transcription result object.
"""
self._lastResult = NULL_TRANSCRIPTION_RESULT # dummy value
return self._lastResult
def unload(self):
"""Unload the transcriber interface.
This method is called when the transcriber interface is no longer
needed. This is useful for freeing up resources used by the transcriber
interface.
This might not be available on all transcriber interfaces.
"""
pass
class PocketSphinxTranscriber(BaseTranscriber):
"""Class to perform speech-to-text conversion on the provided audio samples
using CMU Pocket Sphinx.
Parameters
----------
initConfig : dict or None
Options to configure the speech-to-text engine during initialization.
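    Examples
    --------
    Transcribe a clip while biasing the recognizer toward a few keywords (a
    sketch; requires the `pocketsphinx` and `SpeechRecognition` packages and
    assumes `myRecording` is an `AudioClip`)::

        transcriber = PocketSphinxTranscriber()
        result = transcriber.transcribe(
            myRecording,
            modelConfig={'language': 'en-US',
                         'keyword_entries': ['yes:90', 'no:90']})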
"""
_isLocal = True
_engine = u'sphinx'
_longName = u"CMU PocketSphinx"
def __init__(self, initConfig=None):
super(PocketSphinxTranscriber, self).__init__(initConfig)
# import the library and get language models
import speech_recognition as sr
# create a recognizer interface
self._recognizer = sr.Recognizer()
@staticmethod
def getAllModels():
"""Get available language models for the PocketSphinx transcriber
(`list`).
Returns
-------
list
List of available models.
"""
import pocketsphinx
modelPath = pocketsphinx.get_model_path()
toReturn = [folder.stem for folder in Path(modelPath).glob('??-??')]
return toReturn
def transcribe(self, audioClip, modelConfig=None, decoderConfig=None):
"""Perform speech-to-text conversion on the provided audio samples using
CMU Pocket Sphinx.
Parameters
----------
audioClip : :class:`~psychopy.sound.AudioClip`
Audio clip containing speech to transcribe (e.g., recorded from a
microphone).
modelConfig : dict or None
Additional configuration options for the model used by the engine.
decoderConfig : dict or None
Additional configuration options for the decoder used by the engine.
Presently unused by this transcriber.
Returns
-------
TranscriptionResult
Transcription result object.
"""
import speech_recognition as sr
try:
import pocketsphinx
except (ImportError, ModuleNotFoundError):
raise RecognizerEngineNotFoundError()
        # passing `None` just warms up the engine; return a null result
        # without transcribing anything
        if audioClip is None:
            return NULL_TRANSCRIPTION_RESULT
if isinstance(audioClip, AudioClip):
pass
elif isinstance(audioClip, (tuple, list,)):
waveform, sampleRate = audioClip
audioClip = AudioClip(waveform, sampleRateHz=sampleRate)
else:
raise TypeError(
"Expected type for parameter `audioClip` to be either "
"`AudioClip`, `list` or `tuple`")
# engine configuration
modelConfig = {} if modelConfig is None else modelConfig
        if not isinstance(modelConfig, dict):
            raise TypeError(
                "Invalid type for parameter `modelConfig` specified, must be "
                "`dict` or `None`.")
language = modelConfig.get('language', TRANSCR_LANG_DEFAULT)
if not isinstance(language, str):
raise TypeError(
"Invalid type for parameter `language`, must be type `str`.")
language = language.lower()
if language not in sphinxLangs: # missing a language pack error
url = "https://sourceforge.net/projects/cmusphinx/files/" \
"Acoustic%20and%20Language%20Models/"
msg = (f"Language `{language}` is not installed for "
f"`pocketsphinx`. You can download languages here: {url}. "
f"Install them here: {pocketsphinx.get_model_path()}")
raise RecognizerLanguageNotSupportedError(msg)
# configure the recognizer
        modelConfig['language'] = language  # sphinx uses 'en-us', not 'en-US'
modelConfig['show_all'] = False
expectedWords = modelConfig.get('keyword_entries', None)
if expectedWords is not None:
words, sens = _parseExpectedWords(expectedWords)
modelConfig['keyword_entries'] = tuple(zip(words, sens))
# convert audio to format for transcription
sampleWidth = 2 # two bytes per sample for WAV
audioData = sr.AudioData(
audioClip.asMono().convertToWAV(),
sample_rate=audioClip.sampleRateHz,
sample_width=sampleWidth)
# submit audio samples to the API
respAPI = ''
unknownValueError = requestError = False
try:
respAPI = self._recognizer.recognize_sphinx(
audioData, **modelConfig)
except sr.UnknownValueError:
unknownValueError = True
except sr.RequestError:
requestError = True
# remove empty words
result = [word for word in respAPI.split(' ') if word != '']
# object to return containing transcription data
self.lastResult = toReturn = TranscriptionResult(
words=result,
unknownValue=unknownValueError,
requestFailed=requestError,
engine='sphinx',
language=language)
        return toReturn
class GoogleCloudTranscriber(BaseTranscriber):
"""Class for speech-to-text transcription using Google Cloud API services.
Parameters
----------
initConfig : dict or None
Options to configure the speech-to-text engine during initialization.
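    Examples
    --------
    Create the transcriber and run a transcription (a sketch; assumes valid
    Google Cloud credentials have been configured and `myRecording` is an
    `AudioClip`)::

        transcriber = GoogleCloudTranscriber()
        result = transcriber.transcribe(
            myRecording, modelConfig={'language': 'en-US'})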
"""
_isLocal = False
_engine = u'googleCloud'
_longName = u'Google Cloud'
def __init__(self, initConfig=None):
super(GoogleCloudTranscriber, self).__init__(initConfig)
        try:
            import google.cloud.speech
            import google.auth.exceptions
        except (ImportError, ModuleNotFoundError):
            raise RecognizerEngineNotFoundError(
                "Speech-to-text recognition using Google online services is "
                "not available. Install the required packages with `pip "
                "install google-api-core google-auth google-cloud "
                "google-cloud-speech googleapis-common-protos`.")
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
prefs.general['appKeyGoogleCloud']
# empty string indicates no key has been specified, raise error
if not os.environ["GOOGLE_APPLICATION_CREDENTIALS"]:
raise RecognizerAPICredentialsError(
'No application key specified for Google Cloud Services, '
'specify the path to the key file with either the system '
'environment variable `GOOGLE_APPLICATION_CREDENTIALS` or in '
'preferences (General -> appKeyGoogleCloud).')
# open new client, takes a while the first go
try:
client = google.cloud.speech.SpeechClient()
except google.auth.exceptions.DefaultCredentialsError:
raise RecognizerAPICredentialsError(
'Invalid key specified for Google Cloud Services, check if the '
'key file is valid and readable.')
self._googleCloudClient = client
def transcribe(self, audioClip, modelConfig=None, decoderConfig=None):
"""Transcribe text using Google Cloud.
Parameters
----------
audioClip : AudioClip, list or tuple
Audio clip containing speech to transcribe (e.g., recorded from a
microphone). Can be either an :class:`~psychopy.sound.AudioClip`
object or tuple where the first value is as a Nx1 or Nx2 array of
audio samples (`ndarray`) and the second the sample rate (`int`) in
Hertz (e.g., ``(samples, 48000)``).
Returns
-------
TranscriptionResult
Result of the transcription.
"""
        # if `None` was passed, just return a null result; the client was
        # already opened during initialization
        if audioClip is None:
            return NULL_TRANSCRIPTION_RESULT
if isinstance(audioClip, (list, tuple,)):
waveform, sr = audioClip
audioClip = AudioClip(waveform, sampleRateHz=sr)
# check if we have a valid audio clip
if not isinstance(audioClip, AudioClip):
raise TypeError(
"Expected parameter `audioClip` to have type "
"`psychopy.sound.AudioClip`.")
# import here the first time
import google.cloud.speech as speech
import google.auth.exceptions
        # engine configuration defaults
        modelConfig = {} if modelConfig is None else modelConfig
        languageCode = modelConfig.get('language', TRANSCR_LANG_DEFAULT)
        model = modelConfig.get('model', 'command_and_search')
        expectedWords = modelConfig.get('expectedWords', None)
# configure the recognizer
params = {
'encoding': speech.RecognitionConfig.AudioEncoding.LINEAR16,
'sample_rate_hertz': audioClip.sampleRateHz,
'language_code': languageCode,
'model': model,
'audio_channel_count': audioClip.channels,
'max_alternatives': 1}
        # overwrite defaults with any remaining user-specified options,
        # dropping keys already consumed above which the API does not accept
        extraParams = {key: val for key, val in modelConfig.items()
                       if key not in ('language', 'model', 'expectedWords')}
        params.update(extraParams)
        # speech context (i.e. expected phrases)
        if expectedWords is not None:
            expectedWords, _ = _parseExpectedWords(expectedWords)
            params['speech_contexts'] = [
                speech.SpeechContext(phrases=expectedWords)]
        # detect speech in the audio
        response = self._googleCloudClient.recognize(
            config=speech.RecognitionConfig(**params),
            audio=speech.RecognitionAudio(content=audioClip.convertToWAV()))
# package up response
result = [
result.alternatives[0].transcript for result in response.results]
toReturn = TranscriptionResult(
words=result,
unknownValue=False, # not handled yet
requestFailed=False, # not handled yet
engine='google',
language=languageCode)
toReturn.response = response
self._lastResult = toReturn
return toReturn
# ------------------------------------------------------------------------------
# Functions
#
def getAllTranscriberInterfaces(engineKeys=False):
"""Get all available transcriber interfaces.
    Transcriber interfaces can be implemented in plugins. When a plugin is
    loaded, this function will return its transcribers as well.
It is not recommended to work with transcriber interfaces directly. Instead,
    set up a transcriber interface using `setupTranscriber()` and use
`submit()` to perform transcriptions.
Parameters
----------
engineKeys : bool
Have the returned mapping use engine names for keys instead of class
names.
Returns
-------
dict
Mapping of transcriber class or engine names (`str`) and references to
classes (subclasses of `BaseTranscriber`.)
Examples
--------
Getting a transcriber interface, initializing it, and doing a
transcription::
        whisperInterface = getAllTranscriberInterfaces()['WhisperTranscriber']
        # create the instance, which initializes the transcriber service
transcriber = whisperInterface({'device': 'cuda'})
# you can now begin transcribing audio
micRecording = mic.getRecording()
result = transcriber.transcribe(micRecording)
"""
from psychopy.plugins import discoverModuleClasses
# build a dictionary with names
here = sys.modules[__name__]
foundTranscribers = discoverModuleClasses(here, BaseTranscriber)
del foundTranscribers['BaseTranscriber'] # remove base, not needed
if not engineKeys:
return foundTranscribers
# remap using engine names, more useful for builder
toReturn = {}
for className, interface in foundTranscribers.items():
        if hasattr(interface, '_engine'):  # has an engine identifier
            toReturn[interface._engine] = interface
return toReturn
def getTranscriberInterface(engine):
"""Get a transcriber interface by name.
It is not recommended to work with transcriber interfaces directly. Instead,
    set up a transcriber interface using `setupTranscriber()` and use
`submit()` to perform transcriptions.
Parameters
----------
engine : str
Name of the transcriber interface to get.
Returns
-------
Subclass of `BaseTranscriber`
Transcriber interface.
Examples
--------
    Get a transcriber interface and initialize it::
whisperInterface = getTranscriberInterface('whisper')
# initialize it
transcriber = whisperInterface({'device': 'cuda'})
"""
transcribers = getAllTranscriberInterfaces(engineKeys=True)
try:
transcriber = transcribers[engine]
except KeyError:
raise ValueError(
f"Transcriber with engine name `{engine}` not found.")
return transcriber
def setupTranscriber(engine, config=None):
"""Setup a transcriber interface.
Calling this function will instantiate a transcriber interface and perform
any necessary setup steps. This function is useful for performing the
initialization step without blocking the main thread during a time-sensitive
part of the experiment.
You can only instantiate a single transcriber interface at a time. Calling
this function will replace the existing transcriber interface if one is
already setup.
Parameters
----------
engine : str
        Name of the transcriber interface to set up, or a path to the backend
        class (e.g.
`psychopy_whisper.transcribe:WhisperTranscriber`).
config : dict or None
Options to configure the speech-to-text engine during initialization.
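    Examples
    --------
    Set up the Whisper transcriber ahead of time, then transcribe later (a
    sketch; `model_name` and `device` are options understood by the Whisper
    backend)::

        setupTranscriber(
            'whisper', config={'model_name': 'tiny.en', 'device': 'auto'})
        # ... later in the experiment ...
        result = submit(mic.getRecording())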
"""
global _activeTranscriber
if _activeTranscriber is not None:
oldInterface = _activeTranscriber.engine
logging.warning(
"Transcriber interface already setup, replacing existing "
"interface `{}` with `{}`".format(oldInterface, engine))
# unload the model if the interface supports it
if hasattr(_activeTranscriber, 'unload'):
_activeTranscriber.unload()
_activeTranscriber = None
# get all named transcribers
allTranscribers = getAllTranscriberInterfaces(engineKeys=True)
if engine in allTranscribers:
# if engine is included by name, get it
transcriber = allTranscribers[engine]
elif engine.lower() in allTranscribers:
# try lowercase
transcriber = allTranscribers[engine.lower()]
else:
        # try to import it from a module path such as
        # `psychopy_whisper.transcribe:WhisperTranscriber`
        try:
            if ":" in engine:
                group, name = engine.split(":")
            else:
                group, name = engine.rsplit(".", 1)
            mod = importlib.import_module(group)
            transcriber = getattr(mod, name)
        except (ModuleNotFoundError, ValueError, AttributeError):
            raise KeyError(
                f"Could not find transcriber engine from '{engine}'"
            )
logging.debug(f"Setting up transcriber `{engine}` with options `{config}`.")
_activeTranscriber = transcriber(config) # init the transcriber
def getActiveTranscriber():
"""Get the currently active transcriber interface instance.
    Returns an instance of a `BaseTranscriber` subclass after a successful
    call to `setupTranscriber()`, otherwise `None` is returned.
Returns
-------
Subclass of `BaseTranscriber` or None
Active transcriber interface instance, or `None` if none is active.
"""
global _activeTranscriber
return _activeTranscriber
def getActiveTranscriberEngine():
"""Get the name currently active transcriber interface.
Should return a string upon a successful call to `setupTranscriber()`,
otherwise `None` is returned.
Returns
-------
str or None
Name of the active transcriber interface, or `None` if none is active.
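    Examples
    --------
    Check that a transcriber is ready before submitting audio::

        if getActiveTranscriberEngine() is None:
            setupTranscriber('whisper')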
"""
activeTranscriber = getActiveTranscriber()
if activeTranscriber is None:
return None
return activeTranscriber.engine
def submit(audioClip, config=None):
"""Submit an audio clip for transcription.
    This will begin the transcription process using the currently loaded
    transcriber and return when completed. Unlike `transcribe`, this function
    raises an exception if `setupTranscriber` has not been called first.
Parameters
----------
audioClip : :class:`~psychopy.sound.AudioClip` or tuple
Audio clip containing speech to transcribe (e.g., recorded from a
microphone). Can be either an :class:`~psychopy.sound.AudioClip` object
or tuple where the first value is as a Nx1 or Nx2 array of audio
samples (`ndarray`) and the second the sample rate (`int`) in Hertz
(e.g., `(samples, 48000)`).
config : dict or None
Additional configuration options for the specified engine. These
are specified using a dictionary (ex. `config={'pfilter': 1}` will
enable the profanity filter when using the `'google'` engine).
Returns
-------
TranscriptionResult
Result of the transcription.
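    Examples
    --------
    Transcribe a microphone recording (a sketch, assuming `mic` is a
    configured `Microphone` instance)::

        setupTranscriber('whisper')
        # ... record some audio ...
        result = submit(mic.getRecording())
        if result.success:
            print(result.words)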
"""
global _activeTranscriber
if getActiveTranscriberEngine() is None:
raise TranscriberNotSetupError(
"No transcriber interface has been setup, call `setupTranscriber` "
"before calling `submit`.")
    return _activeTranscriber.transcribe(audioClip, modelConfig=config)
def transcribe(audioClip, engine='whisper', language='en-US', expectedWords=None,
config=None):
"""Convert speech in audio to text.
This function accepts an audio clip and returns a transcription of the
speech in the clip. The efficacy of the transcription depends on the engine
selected, audio quality, and language support.
Speech-to-text conversion blocks the main application thread when used on
Python. Don't transcribe audio during time-sensitive parts of your
experiment! Instead, initialize the transcriber before the experiment
begins by calling this function with `audioClip=None`.
Parameters
----------
audioClip : :class:`~psychopy.sound.AudioClip` or tuple
Audio clip containing speech to transcribe (e.g., recorded from a
microphone). Can be either an :class:`~psychopy.sound.AudioClip` object
or tuple where the first value is as a Nx1 or Nx2 array of audio
samples (`ndarray`) and the second the sample rate (`int`) in Hertz
        (e.g., `(samples, 48000)`). Passing `None` will initialize the
        transcriber without performing a transcription. This is useful for
        performing the initialization step without blocking the main thread
        during a time-sensitive part of the experiment.
engine : str
Speech-to-text engine to use.
language : str
BCP-47 language code (eg., 'en-US'). Note that supported languages
vary between transcription engines.
expectedWords : list or tuple
        List of strings representing expected words or phrases. This will
        constrain the possible output words to the ones specified, which tends
        to improve accuracy. Note that not all engines support
this feature (only Sphinx and Google Cloud do at this time). A warning
will be logged if the engine selected does not support this feature. CMU
PocketSphinx has an additional feature where the sensitivity can be
specified for each expected word. You can indicate the sensitivity level
to use by putting a ``:`` after each word in the list (see the Example
below). Sensitivity levels range between 0 and 100. A higher number
results in the engine being more conservative, resulting in a higher
likelihood of false rejections. The default sensitivity is 80% for
words/phrases without one specified.
config : dict or None
Additional configuration options for the specified engine. These
are specified using a dictionary (ex. `config={'pfilter': 1}` will
enable the profanity filter when using the `'google'` engine).
Returns
-------
:class:`~psychopy.sound.transcribe.TranscriptionResult`
Transcription result.
Notes
-----
* The recommended transcriber is OpenAI Whisper which can be used locally
without an internet connection once a model is downloaded to cache. It can
be selected by passing `engine='whisper'` to this function.
* Online transcription services (eg., Google) provide robust and accurate
speech recognition capabilities with broader language support than offline
solutions. However, these services may require a paid subscription to use,
reliable broadband internet connections, and may not respect the privacy
of your participants as their responses are being sent to a third-party.
      Also consider that a track of audio data being sent over the network can
      be large; users on metered connections may incur additional costs to run
      your experiment. Offline transcription services (eg., CMU PocketSphinx and
OpenAI Whisper) do not require an internet connection after the model has
been downloaded and installed.
* If the audio clip has multiple channels, they will be combined prior to
being passed to the transcription service if needed.
Examples
--------
Use a voice command as a response to a task::
# after doing microphone recording
resp = mic.getRecording()
transcribeResults = transcribe(resp)
if transcribeResults.success: # successful transcription
words = transcribeResults.words
if 'hello' in words:
print('You said hello.')
Initialize the transcriber without performing a transcription::
# initialize the transcriber
transcribe(None, config={
'model_name': 'tiny.en',
'device': 'auto'}
)
    Specifying expected words with sensitivity levels when using CMU Pocket
    Sphinx::

        # expected words, 90% sensitivity on the first two, default for the rest
        expectedWords = ['right:90', 'left:90', 'up', 'down']
        transcribeResults = transcribe(
            (resp.samples, resp.sampleRateHz),
            engine='sphinx',
            expectedWords=expectedWords)
if transcribeResults.success: # successful transcription
# process results ...
Specifying the API key to use Google's Cloud service for speech-to-text::
# set the environment variable
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
"C:\\path\\to\\my\\key.json"
# you can now call the transcriber ...
results = transcribe(
myRecording,
engine='google',
expectedWords=['left', 'right'])
if results.success:
print("You said: {}".format(results.words[0]))
"""
# check if the engine parameter is valid
engine = engine.lower() # make lower case
if config is None:
config = {}
    global _activeTranscriber
    if _activeTranscriber is None:
        logging.warning(
            "Called `transcribe` before calling `setupTranscriber`. The "
            "transcriber interface will be initialized now. If this is a "
            "time sensitive part of your experiment, consider calling "
            "`setupTranscriber` before any experiment routine begins.")
        setupTranscriber(engine, config=config)
    # passing `None` only initializes the transcriber, don't transcribe
    if audioClip is None:
        return NULL_TRANSCRIPTION_RESULT
# check if we have necessary keys
if engine in ('google',):
alert(4615, strFields={'engine': engine})
# if we got a tuple, convert to audio clip object
if isinstance(audioClip, (tuple, list,)):
samples, sampleRateHz = audioClip
audioClip = AudioClip(samples, sampleRateHz)
    # bit of a hack for the whisper transcriber
    if engine == 'whisper':
        # trim the region from the language specifier (e.g. 'en-US' -> 'en'),
        # this should be close enough for now
        language = language.split('-')[0]
    else:
        config['expectedWords'] = expectedWords
    config['language'] = language
    # do the actual transcription
    return _activeTranscriber.transcribe(audioClip, modelConfig=config)
def _parseExpectedWords(wordList, defaultSensitivity=80):
"""Parse expected words list.
This function is used internally by other functions and classes within the
`transcribe` module.
    Expected words or phrases are usually specified as a list of strings. CMU
    Pocket Sphinx allows for additional 'sensitivity' values for each phrase
    ranging from *0* to *100*. This function generates two lists, the first
    with just the words and the second with their sensitivity values. This
    allows the user to specify sensitivity levels which can be ignored if the
    recognizer engine does not support them.
Parameters
----------
    wordList : list of str
        List of words or phrases. Sensitivity levels for each can be specified
by putting a value at the end of each string separated with a colon `:`.
For example, ``'hello:80'`` for 80% sensitivity on 'hello'. Values are
normalized between *0.0* and *1.0* when returned.
defaultSensitivity : int or float
Default sensitivity to use if a word does not have one specified between
0 and 100%.
Returns
-------
tuple
Returns list of expected words and list of normalized sensitivities for
each.
Examples
--------
    Specifying expected words to CMU Pocket Sphinx::

        words = ['hello:95', 'bye:50']
        expectedWords = list(zip(*_parseExpectedWords(words)))
"""
defaultSensitivity = defaultSensitivity / 100. # normalized
sensitivities = []
if wordList is not None:
# sensitivity specified as `word:80`
wordListTemp = []
for word in wordList:
wordAndSense = word.split(':')
if len(wordAndSense) == 2: # specified as `word:80`
word, sensitivity = wordAndSense
sensitivity = int(sensitivity) / 100.
else:
word = wordAndSense[0]
sensitivity = defaultSensitivity # default is 80% confidence
wordListTemp.append(word)
sensitivities.append(sensitivity)
wordList = wordListTemp
return wordList, sensitivities
# ------------------------------------------------------------------------------
# Recognizers
#
# These functions are used to send off audio and configuration data to the
# indicated speech-to-text engine. Most of these functions are synchronous,
# meaning they block the application until they return. Don't run these in any
# time critical parts of your program.
#
_pocketSphinxTranscriber = None
def recognizeSphinx(audioClip=None, language='en-US', expectedWords=None,
config=None):
"""Perform speech-to-text conversion on the provided audio samples using
CMU Pocket Sphinx.
Parameters
----------
audioClip : :class:`~psychopy.sound.AudioClip` or None
Audio clip containing speech to transcribe (e.g., recorded from a
        microphone). Specify `None` to open a client without performing a
        transcription; this will reduce latency when the transcriber is
        invoked in successive calls.
language : str
BCP-47 language code (eg., 'en-US'). Should match the language which the
speaker is using. Pocket Sphinx requires language packs to be installed
locally.
expectedWords : list or None
List of strings representing expected words or phrases. This will
        attempt to bias the possible output words to the ones specified if the
engine is uncertain. Sensitivity can be specified for each expected
word. You can indicate the sensitivity level to use by putting a ``:``
after each word in the list (see the Example below). Sensitivity levels
range between 0 and 100. A higher number results in the engine being
more conservative, resulting in a higher likelihood of false rejections.
The default sensitivity is 80% for words/phrases without one specified.
config : dict or None
Additional configuration options for the specified engine.
Returns
-------
TranscriptionResult
Transcription result object.
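    Examples
    --------
    Constrain the recognizer to a few expected words with 90% sensitivity on
    each (a sketch, assuming `myRecording` is an `AudioClip`)::

        result = recognizeSphinx(
            myRecording, expectedWords=['yes:90', 'no:90'])
        if result.success:
            print(result.words)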
"""
if config is None:
config = {} # empty dict if `None`
onlyInitialize = audioClip is None
global _pocketSphinxTranscriber
if _pocketSphinxTranscriber is None:
allTranscribers = getAllTranscriberInterfaces(engineKeys=True)
try:
interface = allTranscribers['sphinx']
except KeyError:
raise RecognizerEngineNotFoundError(
"Cannot load transcriber interface for 'sphinx'.")
_pocketSphinxTranscriber = interface() # create instance
if onlyInitialize:
return NULL_TRANSCRIPTION_RESULT
# extract parameters which we used to support
config['expectedWords'] = expectedWords
config['language'] = language
# do transcription and return result
return _pocketSphinxTranscriber.transcribe(audioClip, modelConfig=config)
_googleCloudTranscriber = None # keep instance for legacy functions
def recognizeGoogle(audioClip=None, language='en-US', expectedWords=None,
config=None):
"""Perform speech-to-text conversion on the provided audio clip using
the Google Cloud API.
    This is an online speech-to-text engine provided by Google as a
    subscription service, providing greater accuracy than the built-in (local)
    engines. It requires an API key, which you must generate and specify
    prior to calling this function.
Parameters
----------
audioClip : :class:`~psychopy.sound.AudioClip` or None
Audio clip containing speech to transcribe (e.g., recorded from a
        microphone). Specify `None` to open a client without performing a
        transcription; this will reduce latency when the transcriber is
        invoked in successive calls.
language : str
BCP-47 language code (eg., 'en-US'). Should match the language which the
speaker is using.
expectedWords : list or None
List of strings representing expected words or phrases. These are passed
as speech context metadata which will make the recognizer prefer a
particular word in cases where there is ambiguity or uncertainty.
config : dict or None
Additional configuration options for the recognizer as a dictionary.
Notes
-----
    * The first invocation of this function will take considerably longer to
      run than successive calls as the client has not been started yet. Only
      one instance of a recognizer client can be created per-session.
Examples
--------
Specifying the API key to use Google's Cloud service for speech-to-text::
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
"C:\\path\\to\\my\\key.json"
# you can now call the transcriber
results = recognizeGoogle(myRecording, expectedWords=['left', 'right'])
if results.success:
print("You said: {}".format(results.words[0])) # first word
"""
if config is None:
config = {} # empty dict if `None`
onlyInitialize = audioClip is None
global _googleCloudTranscriber
if _googleCloudTranscriber is None:
allTranscribers = getAllTranscriberInterfaces(engineKeys=True)
try:
interface = allTranscribers['googleCloud']
except KeyError:
raise RecognizerEngineNotFoundError(
"Cannot load transcriber interface for 'googleCloud'.")
_googleCloudTranscriber = interface() # create instance
if onlyInitialize:
return NULL_TRANSCRIPTION_RESULT
# set parameters which we used to support
config['expectedWords'] = expectedWords
config['language'] = language
# do transcription and return result
return _googleCloudTranscriber.transcribe(audioClip, modelConfig=config)
if __name__ == "__main__":
pass