Source code for psychopy.sound.transcribe

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Classes and functions for transcribing speech in audio data to text.
"""

# Part of the PsychoPy library
# Copyright (C) 2002-2018 Jonathan Peirce (C) 2019-2025 Open Science Tools Ltd.
# Distributed under the terms of the GNU General Public License (GPL).

__all__ = [
    'TranscriptionResult',
    'transcribe',
    'TRANSCR_LANG_DEFAULT',
    'BaseTranscriber',
    'recognizerEngineValues',
    'recognizeSphinx',
    'recognizeGoogle',
    'getAllTranscriberInterfaces',
    'getTranscriberInterface',
    'setupTranscriber',
    'getActiveTranscriber',
    'getActiveTranscriberEngine',
    'submit'
]

import importlib
import json
import sys
import os
import psychopy.logging as logging
from psychopy.alerts import alert
from pathlib import Path
from psychopy.preferences import prefs
from .audioclip import *
from .exceptions import *
import numpy as np

# ------------------------------------------------------------------------------
# Initialize the speech recognition system
#

# _hasSpeechRecognition = True
# try:
#     import speech_recognition as sr
# except (ImportError, ModuleNotFoundError):
#     logging.warning(
#         "Speech-to-text recognition module for PocketSphinx is not available "
#         "(use command `pip install SpeechRecognition` to get it). "
#         "Transcription will be unavailable using that service this session.")
#     _hasSpeechRecognition = False

# Google Cloud API
# _hasGoogleCloud = True
# _googleCloudClient = None  # client for Google Cloud, instanced on first use
# try:
#     import google.cloud.speech
#     import google.auth.exceptions
# except (ImportError, ModuleNotFoundError):
#     logging.warning(
#         "Speech-to-text recognition using Google online services is not "
#         "available (use command `pip install google-api-core google-auth "
#         "google-cloud google-cloud-speech googleapis-common-protos` to get "
#         "it). Transcription will be unavailable using that service this "
#         "session.")
#     _hasGoogleCloud = False

# try:
#     import pocketsphinx
#     sphinxLangs = [folder.stem for folder
#                    in Path(pocketsphinx.get_model_path()).glob('??-??')]
#     haveSphinx = True
# except (ImportError, ModuleNotFoundError):
#     haveSphinx = False
#     sphinxLangs = None

# Constants related to the transcription system.
TRANSCR_LANG_DEFAULT = 'en-US'

# Values for specifying recognizer engines. This dictionary is used by Builder
# to populate the component property dropdown.
recognizerEngineValues = {
    0: ('sphinx', "CMU Pocket Sphinx", "Offline"),
    1: ('google', "Google Cloud Speech API", "Online, Key Required"),
    2: ('whisper', "OpenAI Whisper", "Offline, Built-in")
}

# the active transcriber interface
_activeTranscriber = None


# ------------------------------------------------------------------------------
# Exceptions for the speech recognition interface
#

class TranscriberError(Exception):
    """Base class for transcriber exceptions.
    """
    pass


class TranscriberNotSetupError(TranscriberError):
    """Exception raised when a transcriber interface has not been setup.
    """
    pass


# ------------------------------------------------------------------------------
# Classes and functions for speech-to-text transcription
#

[docs] class TranscriptionResult: """Descriptor for returned transcription data. Fields within this class can be used to access transcribed words and other information related to the transcription request. This is returned by functions and methods which perform speech-to-text transcription from audio data within PsychoPy. The user usually does not create instances of this class themselves. Parameters ---------- words : list of str Words extracted from the audio clip. unknownValue : bool `True` if the transcription API failed make sense of the audio and did not complete the transcription. requestFailed : bool `True` if there was an error with the transcriber itself. For instance, network error or improper formatting of the audio data. engine : str Name of engine used to perform this transcription. language : str Identifier for the language used to perform the transcription. """ __slots__ = [ '_words', '_wordData', # additional word data '_text', # unused for now, will be used in future '_confidence', # unused on Python for now '_response', '_engine', '_language', '_expectedWords', '_requestFailed', '_unknownValue'] def __init__(self, words, unknownValue, requestFailed, engine, language): self.words = words self.unknownValue = unknownValue self.requestFailed = requestFailed self.engine = engine self.language = language # initialize other fields self._wordData = None self._text = "" self._confidence = 0.0 self._response = None self._expectedWords = None self._requestFailed = True self._unknownValue = True def __repr__(self): return (f"TranscriptionResult(words={self._words}, " f"unknownValue={self._unknownValue}, ", f"requestFailed={self._requestFailed}, ", f"engine={self._engine}, ", f"language={self._language})") def __str__(self): return " ".join(self._words) def __json__(self): return str(self) @property def wordCount(self): """Number of words found (`int`).""" return len(self._words) @property def words(self): """Words extracted from the audio clip (`list` of `str`).""" return self._words @property def text(self): """Text transcribed for the audio data (`str`). """ return self._text @words.setter def words(self, value): self._words = list(value) @property def response(self): """Raw API response from the transcription engine (`str`). """ return self._response @response.setter def response(self, val): self._response = val @property def responseData(self): """ Values from self.response, parsed into a `dict`. """ return json.loads(self.response) @responseData.setter def responseData(self, val): self._response = str(val) @property def wordData(self): """Additional data about each word (`list`). Not all engines provide this data in the same format or at all. """ return self._wordData @wordData.setter def wordData(self, val): self._wordData = val
[docs] def getSpeechInterval(self): """Get the start and stop times for the interval of speech in the audio clip. This feature is only supported by the Whisper transcriber. The start and end times of the speech interval are returned in seconds. Returns ------- tuple Start and end times of the speech interval in seconds. If the engine does not support this feature, or if the data is missing, `(None, None)` is returned. In cases where either the start or end time is missing, the value will be `None` for that field. """ nullData = (None, None) # default return value if no data if self._engine in ('sphinx', 'google'): logging.warning( "Method `getSpeechInterval` is not supported for the " "transcription engine `{}`.".format(self._engine)) return nullData elif self._engine == 'whisper': if self.responseData is None: return nullData # this value is in the response data which is in JSON format segmentData = self.responseData.get('segments', None) if segmentData is None: return nullData # integers for keys segmentKeys = list(segmentData.keys()) if len(segmentKeys) == 0: return nullData # sort segment keys to ensure monotonic ordering segmentKeys.sort() # get first and last segment firstSegment = segmentData.get(segmentKeys[0], None) lastSegment = segmentData.get(segmentKeys[-1], None) # get speech onset/offset times speechOnset = firstSegment.get('start', None) speechOffset = lastSegment.get('end', None) # return start and end times return speechOnset, speechOffset
@property def success(self): """`True` if the transcriber returned a result successfully (`bool`).""" return not (self._unknownValue or self._requestFailed) @property def error(self): """`True` if there was an error during transcription (`bool`). Value is always the compliment of `.success`.""" return not self.success @property def unknownValue(self): """`True` if the transcription API failed make sense of the audio and did not complete the transcription (`bool`). """ return self._unknownValue @unknownValue.setter def unknownValue(self, value): self._unknownValue = bool(value) @property def requestFailed(self): """`True` if there was an error with the transcriber itself (`bool`). For instance, network error or improper formatting of the audio data, invalid key, or if there was network connection error. """ return self._requestFailed @requestFailed.setter def requestFailed(self, value): self._requestFailed = bool(value) @property def engine(self): """Name of engine used to perform this transcription (`str`). """ return self._engine @engine.setter def engine(self, value): if value == 'sphinx': if not haveSphinx: raise ModuleNotFoundError( "To perform built-in (local) transcription you need to " "have pocketsphinx installed (pip install pocketsphinx)") self._engine = str(value) @property def language(self): """Identifier for the language used to perform the transcription (`str`). """ return self._language @language.setter def language(self, value): self._language = str(value)
# empty result returned when a transcriber is given no data NULL_TRANSCRIPTION_RESULT = TranscriptionResult( words=[''], unknownValue=False, requestFailed=False, engine='null', language=TRANSCR_LANG_DEFAULT ) # ------------------------------------------------------------------------------ # Transcription interfaces # class BaseTranscriber: """Base class for text-to-speech transcribers. This class defines the API for transcription, which is an interface to a speech-to-text engine. All transcribers must be sub-classes of this class and implement all members of this class. Parameters ---------- initConfig : dict or None Options to configure the speech-to-text engine during initialization. """ _isLocal = True _engine = u'Null' _longName = u"Null" _lastResult = None def __init__(self, initConfig=None): self._initConf = initConfig @property def longName(self): """Human-readable name of the transcriber (`str`). """ return self._longName @property def engine(self): """Identifier for the transcription engine which this object interfaces with (`str`). """ return self._engine @property def isLocal(self): """`True` if the transcription engine works locally without sending data to a remote server. """ return self._isLocal @property def isComplete(self): """`True` if the transcriber has completed its transcription. The result can be accessed through `.lastResult`. """ return True @property def lastResult(self): """Result of the last transcription. """ return self._lastResult @lastResult.setter def lastResult(self, val): self._lastResult = val def transcribe(self, audioClip, modelConfig=None, decoderConfig=None): """Perform speech-to-text conversion on the provided audio samples. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` Audio clip containing speech to transcribe (e.g., recorded from a microphone). modelConfig : dict or None Additional configuration options for the model used by the engine. decoderConfig : dict or None Additional configuration options for the decoder used by the engine. Returns ------- TranscriptionResult Transcription result object. """ self._lastResult = NULL_TRANSCRIPTION_RESULT # dummy value return self._lastResult def unload(self): """Unload the transcriber interface. This method is called when the transcriber interface is no longer needed. This is useful for freeing up resources used by the transcriber interface. This might not be available on all transcriber interfaces. """ pass class PocketSphinxTranscriber(BaseTranscriber): """Class to perform speech-to-text conversion on the provided audio samples using CMU Pocket Sphinx. Parameters ---------- initConfig : dict or None Options to configure the speech-to-text engine during initialization. """ _isLocal = True _engine = u'sphinx' _longName = u"CMU PocketSphinx" def __init__(self, initConfig=None): super(PocketSphinxTranscriber, self).__init__(initConfig) # import the library and get language models import speech_recognition as sr # create a recognizer interface self._recognizer = sr.Recognizer() @staticmethod def getAllModels(): """Get available language models for the PocketSphinx transcriber (`list`). Returns ------- list List of available models. """ import pocketsphinx modelPath = pocketsphinx.get_model_path() toReturn = [folder.stem for folder in Path(modelPath).glob('??-??')] return toReturn def transcribe(self, audioClip, modelConfig=None, decoderConfig=None): """Perform speech-to-text conversion on the provided audio samples using CMU Pocket Sphinx. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` Audio clip containing speech to transcribe (e.g., recorded from a microphone). modelConfig : dict or None Additional configuration options for the model used by the engine. decoderConfig : dict or None Additional configuration options for the decoder used by the engine. Presently unused by this transcriber. Returns ------- TranscriptionResult Transcription result object. """ import speech_recognition as sr try: import pocketsphinx except (ImportError, ModuleNotFoundError): raise RecognizerEngineNotFoundError() # warmup the engine, not used here but needed for compatibility if audioClip is None: return NULL_TRANSCRIPTION_RESULT if isinstance(audioClip, AudioClip): pass elif isinstance(audioClip, (tuple, list,)): waveform, sampleRate = audioClip audioClip = AudioClip(waveform, sampleRateHz=sampleRate) else: raise TypeError( "Expected type for parameter `audioClip` to be either " "`AudioClip`, `list` or `tuple`") # engine configuration modelConfig = {} if modelConfig is None else modelConfig if not isinstance(modelConfig, dict): raise TypeError( "Invalid type for parameter `config` specified, must be `dict` " "or `None`.") language = modelConfig.get('language', TRANSCR_LANG_DEFAULT) if not isinstance(language, str): raise TypeError( "Invalid type for parameter `language`, must be type `str`.") language = language.lower() if language not in sphinxLangs: # missing a language pack error url = "https://sourceforge.net/projects/cmusphinx/files/" \ "Acoustic%20and%20Language%20Models/" msg = (f"Language `{language}` is not installed for " f"`pocketsphinx`. You can download languages here: {url}. " f"Install them here: {pocketsphinx.get_model_path()}") raise RecognizerLanguageNotSupportedError(msg) # configure the recognizer modelConfig['language'] = language # sphinx users en-us not en-US modelConfig['show_all'] = False expectedWords = modelConfig.get('keyword_entries', None) if expectedWords is not None: words, sens = _parseExpectedWords(expectedWords) modelConfig['keyword_entries'] = tuple(zip(words, sens)) # convert audio to format for transcription sampleWidth = 2 # two bytes per sample for WAV audioData = sr.AudioData( audioClip.asMono().convertToWAV(), sample_rate=audioClip.sampleRateHz, sample_width=sampleWidth) # submit audio samples to the API respAPI = '' unknownValueError = requestError = False try: respAPI = self._recognizer.recognize_sphinx( audioData, **modelConfig) except sr.UnknownValueError: unknownValueError = True except sr.RequestError: requestError = True # remove empty words result = [word for word in respAPI.split(' ') if word != ''] # object to return containing transcription data self.lastResult = toReturn = TranscriptionResult( words=result, unknownValue=unknownValueError, requestFailed=requestError, engine='sphinx', language=language) # split only if the user does not want the raw API data return toReturn class GoogleCloudTranscriber(BaseTranscriber): """Class for speech-to-text transcription using Google Cloud API services. Parameters ---------- initConfig : dict or None Options to configure the speech-to-text engine during initialization. """ _isLocal = False _engine = u'googleCloud' _longName = u'Google Cloud' def __init__(self, initConfig=None): super(GoogleCloudTranscriber, self).__init__(initConfig) try: import google.cloud.speech import google.auth.exceptions except (ImportError, ModuleNotFoundError): pass if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ: os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \ prefs.general['appKeyGoogleCloud'] # empty string indicates no key has been specified, raise error if not os.environ["GOOGLE_APPLICATION_CREDENTIALS"]: raise RecognizerAPICredentialsError( 'No application key specified for Google Cloud Services, ' 'specify the path to the key file with either the system ' 'environment variable `GOOGLE_APPLICATION_CREDENTIALS` or in ' 'preferences (General -> appKeyGoogleCloud).') # open new client, takes a while the first go try: client = google.cloud.speech.SpeechClient() except google.auth.exceptions.DefaultCredentialsError: raise RecognizerAPICredentialsError( 'Invalid key specified for Google Cloud Services, check if the ' 'key file is valid and readable.') self._googleCloudClient = client def transcribe(self, audioClip, modelConfig=None, decoderConfig=None): """Transcribe text using Google Cloud. Parameters ---------- audioClip : AudioClip, list or tuple Audio clip containing speech to transcribe (e.g., recorded from a microphone). Can be either an :class:`~psychopy.sound.AudioClip` object or tuple where the first value is as a Nx1 or Nx2 array of audio samples (`ndarray`) and the second the sample rate (`int`) in Hertz (e.g., ``(samples, 48000)``). Returns ------- TranscriptionResult Result of the transcription. """ # if None, return a null transcription result and just open a client if audioClip is None: return NULL_TRANSCRIPTION_RESULT if isinstance(audioClip, (list, tuple,)): waveform, sr = audioClip audioClip = AudioClip(waveform, sampleRateHz=sr) # check if we have a valid audio clip if not isinstance(audioClip, AudioClip): raise TypeError( "Expected parameter `audioClip` to have type " "`psychopy.sound.AudioClip`.") # import here the first time import google.cloud.speech as speech import google.auth.exceptions # defaults languageCode = modelConfig.get('language', 'language_code') model = modelConfig.get('model', 'command_and_search') expectedWords = modelConfig.get('expectedWords', None) # configure the recognizer params = { 'encoding': speech.RecognitionConfig.AudioEncoding.LINEAR16, 'sample_rate_hertz': audioClip.sampleRateHz, 'language_code': languageCode, 'model': model, 'audio_channel_count': audioClip.channels, 'max_alternatives': 1} if isinstance(modelConfig, dict): # overwrites defaults! params.update(modelConfig) # speech context (i.e. expected phrases) if expectedWords is not None: expectedWords, _ = _parseExpectedWords(expectedWords) params['speech_contexts'] = \ [google.cloud.speech.SpeechContext(phrases=expectedWords)] # Detects speech in the audio file response = self._googleCloudClient.recognize( config=google.cloud.speech.RecognitionConfig(**params), audio=google.cloud.speech.RecognitionAudio( content=audioClip.convertToWAV())) # package up response result = [ result.alternatives[0].transcript for result in response.results] toReturn = TranscriptionResult( words=result, unknownValue=False, # not handled yet requestFailed=False, # not handled yet engine='google', language=languageCode) toReturn.response = response self._lastResult = toReturn return toReturn # ------------------------------------------------------------------------------ # Functions # def getAllTranscriberInterfaces(engineKeys=False): """Get all available transcriber interfaces. Transcriber interface can be implemented in plugins. When loaded, this function will return them. It is not recommended to work with transcriber interfaces directly. Instead, setup a transcriber interface using `setupTranscriber()` and use `submit()` to perform transcriptions. Parameters ---------- engineKeys : bool Have the returned mapping use engine names for keys instead of class names. Returns ------- dict Mapping of transcriber class or engine names (`str`) and references to classes (subclasses of `BaseTranscriber`.) Examples -------- Getting a transcriber interface, initializing it, and doing a transcription:: whisperInterface = sound.getAllTranscribers()['WhisperTranscriber'] # create the instance which initialize the transcriber service transcriber = whisperInterface({'device': 'cuda'}) # you can now begin transcribing audio micRecording = mic.getRecording() result = transcriber.transcribe(micRecording) """ from psychopy.plugins import discoverModuleClasses # build a dictionary with names here = sys.modules[__name__] foundTranscribers = discoverModuleClasses(here, BaseTranscriber) del foundTranscribers['BaseTranscriber'] # remove base, not needed if not engineKeys: return foundTranscribers # remap using engine names, more useful for builder toReturn = {} for className, interface in foundTranscribers.items(): if hasattr(interface, '_engine'): # has interface toReturn[interface._engine] = interface return toReturn def getTranscriberInterface(engine): """Get a transcriber interface by name. It is not recommended to work with transcriber interfaces directly. Instead, setup a transcriber interface using `setupTranscriber()` and use `submit()` to perform transcriptions. Parameters ---------- engine : str Name of the transcriber interface to get. Returns ------- Subclass of `BaseTranscriber` Transcriber interface. Examples -------- Get a transcriber interface and initalize it:: whisperInterface = getTranscriberInterface('whisper') # initialize it transcriber = whisperInterface({'device': 'cuda'}) """ transcribers = getAllTranscriberInterfaces(engineKeys=True) try: transcriber = transcribers[engine] except KeyError: raise ValueError( f"Transcriber with engine name `{engine}` not found.") return transcriber def setupTranscriber(engine, config=None): """Setup a transcriber interface. Calling this function will instantiate a transcriber interface and perform any necessary setup steps. This function is useful for performing the initialization step without blocking the main thread during a time-sensitive part of the experiment. You can only instantiate a single transcriber interface at a time. Calling this function will replace the existing transcriber interface if one is already setup. Parameters ---------- engine : str Name of the transcriber interface to setup, or a path to the backend class (e.g. `psychopy_whisper.transcribe:WhisperTranscriber`). config : dict or None Options to configure the speech-to-text engine during initialization. """ global _activeTranscriber if _activeTranscriber is not None: oldInterface = _activeTranscriber.engine logging.warning( "Transcriber interface already setup, replacing existing " "interface `{}` with `{}`".format(oldInterface, engine)) # unload the model if the interface supports it if hasattr(_activeTranscriber, 'unload'): _activeTranscriber.unload() _activeTranscriber = None # get all named transcribers allTranscribers = getAllTranscriberInterfaces(engineKeys=True) if engine in allTranscribers: # if engine is included by name, get it transcriber = allTranscribers[engine] elif engine.lower() in allTranscribers: # try lowercase transcriber = allTranscribers[engine.lower()] else: # try to import it try: if ":" in engine: group, name = engine.split(":") else: group, name = str.rsplit(".") mod = importlib.import_module(group) transcriber = getattr(mod, name) except ModuleNotFoundError: raise KeyError( f"Could not find transcriber engine from '{engine}'" ) logging.debug(f"Setting up transcriber `{engine}` with options `{config}`.") _activeTranscriber = transcriber(config) # init the transcriber def getActiveTranscriber(): """Get the currently active transcriber interface instance. Should return a subclass of `BaseTranscriber` upon a successful call to `setupTranscriber()`, otherwise `None` is returned. Returns ------- Subclass of `BaseTranscriber` or None Active transcriber interface instance, or `None` if none is active. """ global _activeTranscriber return _activeTranscriber def getActiveTranscriberEngine(): """Get the name currently active transcriber interface. Should return a string upon a successful call to `setupTranscriber()`, otherwise `None` is returned. Returns ------- str or None Name of the active transcriber interface, or `None` if none is active. """ activeTranscriber = getActiveTranscriber() if activeTranscriber is None: return None return activeTranscriber.engine def submit(audioClip, config=None): """Submit an audio clip for transcription. This will begin the transcription process using the currently loaded transcriber and return when completed. Unlike `transcribe`, not calling `setupTranscriber` before calling this function will raise an exception. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` or tuple Audio clip containing speech to transcribe (e.g., recorded from a microphone). Can be either an :class:`~psychopy.sound.AudioClip` object or tuple where the first value is as a Nx1 or Nx2 array of audio samples (`ndarray`) and the second the sample rate (`int`) in Hertz (e.g., `(samples, 48000)`). config : dict or None Additional configuration options for the specified engine. These are specified using a dictionary (ex. `config={'pfilter': 1}` will enable the profanity filter when using the `'google'` engine). Returns ------- TranscriptionResult Result of the transcription. """ global _activeTranscriber if getActiveTranscriberEngine() is None: raise TranscriberNotSetupError( "No transcriber interface has been setup, call `setupTranscriber` " "before calling `submit`.") return _activeTranscriber.transcribe(audioClip, config=config) def transcribe(audioClip, engine='whisper', language='en-US', expectedWords=None, config=None): """Convert speech in audio to text. This function accepts an audio clip and returns a transcription of the speech in the clip. The efficacy of the transcription depends on the engine selected, audio quality, and language support. Speech-to-text conversion blocks the main application thread when used on Python. Don't transcribe audio during time-sensitive parts of your experiment! Instead, initialize the transcriber before the experiment begins by calling this function with `audioClip=None`. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` or tuple Audio clip containing speech to transcribe (e.g., recorded from a microphone). Can be either an :class:`~psychopy.sound.AudioClip` object or tuple where the first value is as a Nx1 or Nx2 array of audio samples (`ndarray`) and the second the sample rate (`int`) in Hertz (e.g., `(samples, 48000)`). Passing `None` will initialize the the transcriber without performing a transcription. This is useful for performing the initialization step without blocking the main thread during a time-sensitive part of the experiment. engine : str Speech-to-text engine to use. language : str BCP-47 language code (eg., 'en-US'). Note that supported languages vary between transcription engines. expectedWords : list or tuple List of strings representing expected words or phrases. This will constrain the possible output words to the ones specified which constrains the model for better accuracy. Note not all engines support this feature (only Sphinx and Google Cloud do at this time). A warning will be logged if the engine selected does not support this feature. CMU PocketSphinx has an additional feature where the sensitivity can be specified for each expected word. You can indicate the sensitivity level to use by putting a ``:`` after each word in the list (see the Example below). Sensitivity levels range between 0 and 100. A higher number results in the engine being more conservative, resulting in a higher likelihood of false rejections. The default sensitivity is 80% for words/phrases without one specified. config : dict or None Additional configuration options for the specified engine. These are specified using a dictionary (ex. `config={'pfilter': 1}` will enable the profanity filter when using the `'google'` engine). Returns ------- :class:`~psychopy.sound.transcribe.TranscriptionResult` Transcription result. Notes ----- * The recommended transcriber is OpenAI Whisper which can be used locally without an internet connection once a model is downloaded to cache. It can be selected by passing `engine='whisper'` to this function. * Online transcription services (eg., Google) provide robust and accurate speech recognition capabilities with broader language support than offline solutions. However, these services may require a paid subscription to use, reliable broadband internet connections, and may not respect the privacy of your participants as their responses are being sent to a third-party. Also consider that a track of audio data being sent over the network can be large, users on metered connections may incur additional costs to run your experiment. Offline transcription services (eg., CMU PocketSphinx and OpenAI Whisper) do not require an internet connection after the model has been downloaded and installed. * If the audio clip has multiple channels, they will be combined prior to being passed to the transcription service if needed. Examples -------- Use a voice command as a response to a task:: # after doing microphone recording resp = mic.getRecording() transcribeResults = transcribe(resp) if transcribeResults.success: # successful transcription words = transcribeResults.words if 'hello' in words: print('You said hello.') Initialize the transcriber without performing a transcription:: # initialize the transcriber transcribe(None, config={ 'model_name': 'tiny.en', 'device': 'auto'} ) Specifying expected words with sensitivity levels when using CMU Pocket Sphinx: # expected words 90% sensitivity on the first two, default for the rest expectedWords = ['right:90', 'left:90', 'up', 'down'] transcribeResults = transcribe( resp.samples, resp.sampleRateHz, expectedWords=expectedWords) if transcribeResults.success: # successful transcription # process results ... Specifying the API key to use Google's Cloud service for speech-to-text:: # set the environment variable import os os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \ "C:\\path\\to\\my\\key.json" # you can now call the transcriber ... results = transcribe( myRecording, engine='google', expectedWords=['left', 'right']) if results.success: print("You said: {}".format(results.words[0])) """ # check if the engine parameter is valid engine = engine.lower() # make lower case if config is None: config = {} global _activeTranscriber if _activeTranscriber is None: logging.warning( "Called `transcribe` before calling `setupTranscriber`. The " "transcriber interface will be initialized now. If this is a " "time sensitive part of your experiment, consider calling " "`setupTranscriber` before any experiment routine begins.") setupTranscriber(engine, config=config) return NULL_TRANSCRIPTION_RESULT # check if we have necessary keys if engine in ('google',): alert(4615, strFields={'engine': engine}) # if we got a tuple, convert to audio clip object if isinstance(audioClip, (tuple, list,)): samples, sampleRateHz = audioClip audioClip = AudioClip(samples, sampleRateHz) # bit of a hack for the wisper transcriber if engine == 'whisper': # trim the language specifier, this should be close enough for now langSplit = language.split('-') if len(langSplit) > 1: language = langSplit[0] else: language = language else: config['expectedWords'] = expectedWords config['language'] = language # do the actual transcription return _activeTranscriber.transcribe( audioClip, language=language, expectedWords=expectedWords, config=config) def _parseExpectedWords(wordList, defaultSensitivity=80): """Parse expected words list. This function is used internally by other functions and classes within the `transcribe` module. Expected words or phrases are usually specified as a list of strings. CMU Pocket Sphinx allows for additional 'sensitivity' values for each phrase ranging from *0* to *100*. This function will generate to lists, first with just words and another with specified sensitivity values. This allows the user to specify sensitivity levels which can be ignored if the recognizer engine does not support it. Parameters ---------- wordList : list of str List of words of phrases. Sensitivity levels for each can be specified by putting a value at the end of each string separated with a colon `:`. For example, ``'hello:80'`` for 80% sensitivity on 'hello'. Values are normalized between *0.0* and *1.0* when returned. defaultSensitivity : int or float Default sensitivity to use if a word does not have one specified between 0 and 100%. Returns ------- tuple Returns list of expected words and list of normalized sensitivities for each. Examples -------- Specifying expected words to CMU Pocket Sphinx:: words = [('hello:95', 'bye:50')] expectedWords = zip(_parseExpectedWords(words)) """ defaultSensitivity = defaultSensitivity / 100. # normalized sensitivities = [] if wordList is not None: # sensitivity specified as `word:80` wordListTemp = [] for word in wordList: wordAndSense = word.split(':') if len(wordAndSense) == 2: # specified as `word:80` word, sensitivity = wordAndSense sensitivity = int(sensitivity) / 100. else: word = wordAndSense[0] sensitivity = defaultSensitivity # default is 80% confidence wordListTemp.append(word) sensitivities.append(sensitivity) wordList = wordListTemp return wordList, sensitivities # ------------------------------------------------------------------------------ # Recognizers # # These functions are used to send off audio and configuration data to the # indicated speech-to-text engine. Most of these functions are synchronous, # meaning they block the application until they return. Don't run these in any # time critical parts of your program. # _pocketSphinxTranscriber = None def recognizeSphinx(audioClip=None, language='en-US', expectedWords=None, config=None): """Perform speech-to-text conversion on the provided audio samples using CMU Pocket Sphinx. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` or None Audio clip containing speech to transcribe (e.g., recorded from a microphone). Specify `None` to open a client without performing a transcription, this will reduce latency when the transcriber is invoked in successive calls. language : str BCP-47 language code (eg., 'en-US'). Should match the language which the speaker is using. Pocket Sphinx requires language packs to be installed locally. expectedWords : list or None List of strings representing expected words or phrases. This will attempt bias the possible output words to the ones specified if the engine is uncertain. Sensitivity can be specified for each expected word. You can indicate the sensitivity level to use by putting a ``:`` after each word in the list (see the Example below). Sensitivity levels range between 0 and 100. A higher number results in the engine being more conservative, resulting in a higher likelihood of false rejections. The default sensitivity is 80% for words/phrases without one specified. config : dict or None Additional configuration options for the specified engine. Returns ------- TranscriptionResult Transcription result object. """ if config is None: config = {} # empty dict if `None` onlyInitialize = audioClip is None global _pocketSphinxTranscriber if _pocketSphinxTranscriber is None: allTranscribers = getAllTranscriberInterfaces(engineKeys=True) try: interface = allTranscribers['sphinx'] except KeyError: raise RecognizerEngineNotFoundError( "Cannot load transcriber interface for 'sphinx'.") _pocketSphinxTranscriber = interface() # create instance if onlyInitialize: return NULL_TRANSCRIPTION_RESULT # extract parameters which we used to support config['expectedWords'] = expectedWords config['language'] = language # do transcription and return result return _pocketSphinxTranscriber.transcribe(audioClip, modelConfig=config) _googleCloudTranscriber = None # keep instance for legacy functions def recognizeGoogle(audioClip=None, language='en-US', expectedWords=None, config=None): """Perform speech-to-text conversion on the provided audio clip using the Google Cloud API. This is an online based speech-to-text engine provided by Google as a subscription service, providing exceptional accuracy compared to `built-in`. Requires an API key to use which you must generate and specify prior to calling this function. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` or None Audio clip containing speech to transcribe (e.g., recorded from a microphone). Specify `None` to open a client without performing a transcription, this will reduce latency when the transcriber is invoked in successive calls. language : str BCP-47 language code (eg., 'en-US'). Should match the language which the speaker is using. expectedWords : list or None List of strings representing expected words or phrases. These are passed as speech context metadata which will make the recognizer prefer a particular word in cases where there is ambiguity or uncertainty. config : dict or None Additional configuration options for the recognizer as a dictionary. Notes ----- * The first invocation of this function will take considerably longer to run that successive calls as the client has not been started yet. Only one instance of a recognizer client can be created per-session. Examples -------- Specifying the API key to use Google's Cloud service for speech-to-text:: import os os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \ "C:\\path\\to\\my\\key.json" # you can now call the transcriber results = recognizeGoogle(myRecording, expectedWords=['left', 'right']) if results.success: print("You said: {}".format(results.words[0])) # first word """ if config is None: config = {} # empty dict if `None` onlyInitialize = audioClip is None global _googleCloudTranscriber if _googleCloudTranscriber is None: allTranscribers = getAllTranscriberInterfaces(engineKeys=True) try: interface = allTranscribers['googleCloud'] except KeyError: raise RecognizerEngineNotFoundError( "Cannot load transcriber interface for 'googleCloud'.") _googleCloudTranscriber = interface() # create instance if onlyInitialize: return NULL_TRANSCRIPTION_RESULT # set parameters which we used to support config['expectedWords'] = expectedWords config['language'] = language # do transcription and return result return _googleCloudTranscriber.transcribe(audioClip, modelConfig=config) if __name__ == "__main__": pass

Back to top