mirror of https://github.com/scratchfoundation/scratch-vm.git
Initial version of the speech extension. There is certainly still some stuff to figure out but the core of it is here.
parent a85f641d25
commit e7987a6a4c
5 changed files with 760 additions and 0 deletions
package.json
@@ -37,6 +37,7 @@
   "canvas-toBlob": "1.0.0",
   "copy-webpack-plugin": "^4.5.1",
   "decode-html": "2.0.0",
+  "diff-match-patch": "^1.0.0",
   "escape-html": "1.0.3",
   "eslint": "^4.5.0",
   "eslint-config-scratch": "^5.0.0",
src/extensions/scratch3_speech/assets/speech-rec-end.mp3 (new file, binary file not shown)
src/extensions/scratch3_speech/assets/speech-rec-start.mp3 (new file, binary file not shown)
src/extensions/scratch3_speech/index.js (new file, 755 lines)
@@ -0,0 +1,755 @@
const ArgumentType = require('../../extension-support/argument-type');
const Cast = require('../../util/cast');
const BlockType = require('../../extension-support/block-type');
const log = require('../../util/log');
const DiffMatchPatch = require('diff-match-patch');

/**
 * Url of icon to be displayed at the left edge of each extension block.
 * TODO: Find the final icon. Replace it with the right format. data URI?
 * @type {string}
 */
const iconURI = 'https://www.gstatic.com/images/icons/material/system/1x/mic_white_24dp.png';

/**
 * Url of icon to be displayed in the toolbox menu for the extension category.
 * TODO: Find the final icon. Replace it with the right format. data URI?
 * @type {string}
 */
const menuIconURI = 'https://www.gstatic.com/images/icons/material/system/1x/mic_grey600_24dp.png';

/**
 * The url of the speech server.
 * @type {string}
 */
const serverURL = 'wss://speech.scratch.mit.edu';

/**
 * The amount of time to wait between when we stop sending speech data to the server and when
 * we expect the transcription result marked with isFinal: true to come back from the server.
 * @type {number}
 */
const finalResponseTimeoutDurationMs = 3000;

/**
 * The maximum amount of time the Listen and Wait block will listen before timing out.
 * Currently set to 10sec. This should not exceed the speech api limit (60sec) without redoing how
 * we stream the microphone data.
 * @type {number}
 */
const listenAndWaitBlockTimeoutMs = 10000;

/**
 * The start and stop sounds, loaded as static assets.
 * @type {object}
 */
let assetData = {};
try {
    assetData = require('./manifest');
} catch (e) {
    // Non-webpack environment, don't worry about assets.
}

class Scratch3SpeechBlocks {
    constructor (runtime) {
        /**
         * The runtime instantiating this block package.
         * @type {Runtime}
         */
        this.runtime = runtime;

        /**
         * An array of phrases from the [when I hear] hat blocks.
         * This list is sent to the speech api to seed the recognition engine
         * and is used to decide whether the transcription results match.
         * @type {Array}
         * @private
         */
        this._phraseList = [];

        /**
         * The most recent transcription result received from the speech API that we decided to keep.
         * This is the value returned by the reporter block.
         * @type {String}
         * @private
         */
        this._currentUtterance = null;

        /**
         * Similar to _currentUtterance, but set back to '' at the beginning of each listen block.
         * Used to get the hat blocks to edge trigger. In order to detect someone saying
         * the same thing twice in two subsequent listen and wait blocks
         * and still trigger the hat, we need this to go from
         * '' at the beginning of the listen block to '<transcription value>' at the end.
         * @type {string}
         * @private
         */
        this._utteranceForEdgeTrigger = null;

        /**
         * The list of queued `resolve` callbacks for 'Listen and Wait' blocks.
         * We only listen for one utterance at a time. We may encounter multiple
         * 'Listen and wait' blocks that tell us to start listening. If one starts
         * and hasn't received results back yet, any further ones we encounter
         * will all resolve when we get the next acceptable transcription result back.
         * @type {!Array}
         * @private
         */
        this._speechPromises = [];

        /**
         * The id of the timeout that will run if we start listening and don't get any
         * transcription results back, e.g. because we didn't hear anything.
         * @type {number}
         * @private
         */
        this._speechTimeoutId = null;

        /**
         * The id of the timeout that runs after we're done listening while we're
         * still waiting for a potential isFinal:true transcription result to come back.
         * @type {number}
         * @private
         */
        this._speechFinalResponseTimeout = null;

        /**
         * The ScriptProcessorNode hooked up to the audio context.
         * @type {ScriptProcessorNode}
         * @private
         */
        this._scriptNode = null;

        /**
         * The socket used to communicate with the speech server to send microphone data
         * and receive transcription results.
         * @type {WebSocket}
         * @private
         */
        this._socket = null;

        /**
         * The AudioContext used to manage the microphone.
         * @type {AudioContext}
         * @private
         */
        this._context = null;

        /**
         * MediaStreamAudioSourceNode to handle microphone data.
         * @type {MediaStreamAudioSourceNode}
         * @private
         */
        this._sourceNode = null;

        /**
         * A Promise whose fulfillment handler receives a MediaStream object when the microphone has been obtained.
         * @type {Promise}
         * @private
         */
        this._audioPromise = null;

        /**
         * Audio buffer for the sound to indicate that listening has started.
         * @type {AudioBuffer}
         * @private
         */
        this._startSoundBuffer = null;

        /**
         * Audio buffer for the sound to indicate that listening has ended.
         * @type {AudioBuffer}
         * @private
         */
        this._endSoundBuffer = null;

        /**
         * Diff Match Patch is used to do some fuzzy matching of the transcription results
         * with what is in the hat blocks.
         */
        this._dmp = new DiffMatchPatch();
        // Threshold for diff match patch to use: (0.0 = perfection, 1.0 = very loose).
        this._dmp.Match_Threshold = 0.3;
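        // With the threshold at 0.3, a transcript that differs from a hat phrase
        // by roughly a character or two out of every ten can still count as a match.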

        // Come back and figure out which of these I really need.
        this._newSocketCallback = this._newSocketCallback.bind(this);
        this._setupSocketCallback = this._setupSocketCallback.bind(this);
        this._socketMessageCallback = this._socketMessageCallback.bind(this);
        this._processAudioCallback = this._processAudioCallback.bind(this);
        this._onTranscriptionFromServer = this._onTranscriptionFromServer.bind(this);
        this._resetListening = this._resetListening.bind(this);
        this._stopTranscription = this._stopTranscription.bind(this);

        this.runtime.on('PROJECT_STOP_ALL', this._resetListening);

        // Load in the start and stop listening indicator sounds.
        this._loadUISounds();
    }

    /**
     * Load the UI sounds played when listening starts and stops.
     * @private
     */
    _loadUISounds () {
        const startSoundBuffer = assetData['speech-rec-start.mp3'].buffer;
        this._decodeSound(startSoundBuffer).then(buffer => {
            this._startSoundBuffer = buffer;
        });

        const endSoundBuffer = assetData['speech-rec-end.mp3'].buffer;
        this._decodeSound(endSoundBuffer).then(buffer => {
            this._endSoundBuffer = buffer;
        });
    }

    /**
     * Decode a sound and return a promise with the audio buffer.
     * @param {ArrayBuffer} soundBuffer - a buffer containing the encoded audio.
     * @return {Promise} - a promise which will resolve once the sound has decoded.
     * @private
     */
    _decodeSound (soundBuffer) {
        const context = this.runtime.audioEngine && this.runtime.audioEngine.audioContext;

        if (!context) {
            return Promise.reject(new Error('No Audio Context Detected'));
        }

        // Check for newer promise-based API
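        // (With the promise-based API the success/error callbacks are optional and
        // don't count toward decodeAudioData.length, so it reports 1; the older
        // callback-only API reports 3.)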
        if (context.decodeAudioData.length === 1) {
            return context.decodeAudioData(soundBuffer);
        } else { // eslint-disable-line no-else-return
            // Fall back to callback API
            return new Promise((resolve, reject) =>
                context.decodeAudioData(soundBuffer,
                    buffer => resolve(buffer),
                    error => reject(error)
                )
            );
        }
    }

    /**
     * Play the given sound.
     * @param {AudioBuffer} buffer The audio buffer to play.
     * @returns {Promise} A promise that resolves when the sound is done playing.
     * @private
     */
    _playSound (buffer) {
        if (this.runtime.audioEngine === null) return Promise.resolve();
        const context = this.runtime.audioEngine.audioContext;
        const bufferSource = context.createBufferSource();
        bufferSource.buffer = buffer;
        bufferSource.connect(this.runtime.audioEngine.input);
        bufferSource.start();
        return new Promise(resolve => {
            bufferSource.onended = () => {
                resolve();
            };
        });
    }

    /**
     * Scans all the 'When I hear' hat blocks for each sprite and pulls out the text. The list
     * is sent off to the speech recognition server as hints. This *only* reads the value out of
     * the hat block shadow. If a block is dropped on top of the shadow, it is skipped.
     * @returns {Array} list of strings from the hat blocks in the project.
     * @private
     */
    _scanBlocksForPhraseList () {
        const words = [];
        // For each target, walk through the top level blocks and check whether
        // they are speech hat/when I hear blocks.
        this.runtime.targets.forEach(target => {
            target.blocks._scripts.forEach(id => {
                const b = target.blocks.getBlock(id);
                if (b.opcode === 'speech.whenIHearHat') {
                    // Grab the text from the hat block's shadow.
                    const inputId = b.inputs.PHRASE.block;
                    const inputBlock = target.blocks.getBlock(inputId);
                    // Only grab the value from text blocks. This means we'll
                    // miss some, e.g. values in variables or other reporters.
                    if (inputBlock.opcode === 'text') {
                        const word = target.blocks.getBlock(inputId).fields.TEXT.value;
                        words.push(word);
                    }
                }
            });
        });
        return words;
    }

    /**
     * Resets all things related to listening. Called on the red stop sign button:
     * - suspends audio processing
     * - closes the socket with the speech server
     * - clears out any remaining speech blocks that are waiting.
     * @private
     */
    _resetListening () {
        this._stopListening();
        this._closeWebsocket();
        this._resolveSpeechPromises();
    }

    /**
     * Close the connection to the socket server if it is open.
     * @private
     */
    _closeWebsocket () {
        if (this._socket && this._socket.readyState === this._socket.OPEN) {
            this._socket.close();
        }
    }

    /**
     * Call to suspend getting data from the microphone.
     * @private
     */
    _stopListening () {
        // Note that this can be called before any Listen And Wait block did setup,
        // so check that things exist before disconnecting them.
        if (this._context) {
            this._context.suspend();
        }
        // This is called on green flag to reset things that may never have existed
        // in the first place. Do a bunch of checks.
        if (this._scriptNode) {
            this._scriptNode.disconnect();
        }
        if (this._sourceNode) {
            this._sourceNode.disconnect();
        }
    }

    /**
     * Resolves all the speech promises we've accumulated so far and empties out the list.
     * @private
     */
    _resolveSpeechPromises () {
        for (let i = 0; i < this._speechPromises.length; i++) {
            const resFn = this._speechPromises[i];
            resFn();
        }
        this._speechPromises = [];
    }

    /**
     * Called when we want to stop listening (e.g. when a listen block times out)
     * but we still want to wait a little to see if we get any transcription results
     * back before yielding the block execution.
     * @private
     */
    _stopTranscription () {
        this._stopListening();
        if (this._socket && this._socket.readyState === this._socket.OPEN) {
            this._socket.send('stopTranscription');
        }
        // Give it a couple of seconds to respond before giving up and assuming nothing else will come back.
        this._speechFinalResponseTimeout = setTimeout(this._resetListening, finalResponseTimeoutDurationMs);
    }

    /**
     * Decides whether to keep a given transcription result.
     * @param {number} fuzzyMatchIndex Index of the fuzzy match or -1 if there is no match.
     * @param {object} result The json object representing the transcription result.
     * @param {string} normalizedTranscript The transcription text used for matching (i.e. lowercased, no punctuation).
     * @returns {boolean} true if the result is good enough to be kept.
     * @private
     */
    _shouldKeepResult (fuzzyMatchIndex, result, normalizedTranscript) {
        // The threshold above which we decide transcription results are unlikely to change again.
        // See https://cloud.google.com/speech-to-text/docs/basics#streaming_responses.
        const stabilityThreshold = 0.85;
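        // Stability is the speech api's 0-1 estimate of how likely an interim
        // result is to change; e.g. a result at 0.9 is very unlikely to be revised.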

        // For responsiveness of the When I Hear hat blocks, sometimes we want to keep results that are not
        // yet marked 'isFinal' by the speech api. Here are some signals we use.

        // If the result from the speech api isn't very stable and we only had a fuzzy match, we don't want to use it.
        const shouldKeepFuzzyMatch = fuzzyMatchIndex !== -1 && result.stability > stabilityThreshold;

        // If the result is in the phraseList (i.e. it matches one of the 'When I Hear' blocks), we keep it.
        // This might be aggressive... but so far seems to be a good thing.
        const shouldKeepPhraseListMatch = this._phraseList.includes(normalizedTranscript);

        if (!result.isFinal && !shouldKeepPhraseListMatch && !shouldKeepFuzzyMatch) {
            return false;
        }
        return true;
    }

    /**
     * Normalizes text a bit to facilitate matching: lowercases, removes some punctuation and whitespace.
     * @param {string} text The text to normalize.
     * @returns {string} The normalized text.
     * @private
     */
    _normalizeText (text) {
        text = Cast.toString(text).toLowerCase();
        text = text.replace(/[.?!]/g, '');
        text = text.trim();
        return text;
    }

    /**
     * Call into the diff match patch library to compute whether there is a fuzzy match.
     * @param {string} text The text to search in.
     * @param {string} pattern The pattern to look for in text.
     * @returns {number} The index of the match or -1 if there isn't one.
     * @private
     */
    _computeFuzzyMatch (text, pattern) {
        // Don't bother matching if either is null.
        if (!pattern || !text) {
            return -1;
        }
        let match = -1;
        try {
            // Look for the pattern in the text starting at position 0.
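            // match_main returns the index in `text` where `pattern` best matches,
            // subject to Match_Threshold, or -1 when nothing is close enough.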
            match = this._dmp.match_main(text, pattern, 0);
        } catch (e) {
            // This can happen if the text or pattern gets too long. If so, just substring match.
            return text.indexOf(pattern);
        }
        return match;
    }

    /**
     * Processes the results we get back from the speech server. Decides whether the results
     * are good enough to keep. If they are, resolves the 'Listen and Wait' block promises and cleans up.
     * @param {object} result The transcription result.
     * @private
     */
    _processTranscriptionResult (result) {
        log.info(`Got result: ${JSON.stringify(result)}`);
        const transcriptionResult = this._normalizeText(result.alternatives[0].transcript);

        // Waiting for an exact match is not satisfying. It makes it hard to catch
        // things like homonyms or things that sound similar, e.g. "let us" vs "lettuce". Using the fuzzy
        // matching helps more aggressively match the phrases that are in the "When I hear" hat blocks.
        const phrases = this._phraseList.join(' ');
        const fuzzyMatchIndex = this._computeFuzzyMatch(phrases, transcriptionResult);

        let fuzzyMatchResult = null;
        if (fuzzyMatchIndex !== -1) {
            fuzzyMatchResult = transcriptionResult.substring(fuzzyMatchIndex, fuzzyMatchIndex + phrases.length);
        }

        // If the result isn't good enough yet, return without saving and resolving the promises.
        if (!this._shouldKeepResult(fuzzyMatchIndex, result, transcriptionResult)) {
            return;
        }

        // TODO: Decide whether this is the right thing.
        // This sets the currentUtterance (which is returned by the reporter) to the fuzzy match if we had one.
        // That means it'll often get set to a phrase from one of the 'when I hear' blocks instead of the
        // full phrase that the user said.
        if (fuzzyMatchResult) {
            this._currentUtterance = fuzzyMatchResult;
        } else {
            this._currentUtterance = transcriptionResult;
        }
        log.info(`Keeping result: ${this._currentUtterance}`);
        this._utteranceForEdgeTrigger = transcriptionResult;

        // We're done listening, so resolve all the promises and reset everything so we're ready for next time.
        this._resetListening();

        // We got results, so clear out the timeouts.
        if (this._speechTimeoutId) {
            clearTimeout(this._speechTimeoutId);
            this._speechTimeoutId = null;
        }
        if (this._speechFinalResponseTimeout) {
            clearTimeout(this._speechFinalResponseTimeout);
            this._speechFinalResponseTimeout = null;
        }
    }

    /**
     * Handle a message from the socket. It contains transcription results.
     * @param {MessageEvent} e The message event containing data from the speech server.
     * @private
     */
    _onTranscriptionFromServer (e) {
        let result = null;
        try {
            result = JSON.parse(e.data);
        } catch (ex) {
            log.error(`Problem parsing JSON, continuing: ${ex}`);
            // TODO: Question - should we kill listening and continue?
            return;
        }
        this._processTranscriptionResult(result);
    }

    /**
     * Decide whether the pattern given matches the text. Uses fuzzy matching.
     * @param {string} pattern The pattern to look for. Usually this is the transcription result.
     * @param {string} text The text to look in. Usually this is the set of phrases from the when I hear blocks.
     * @returns {boolean} true if there is a fuzzy match.
     * @private
     */
    _speechMatches (pattern, text) {
        pattern = this._normalizeText(pattern);
        text = this._normalizeText(text);
        const match = this._computeFuzzyMatch(text, pattern);
        return match !== -1;
    }

    /**
     * Kick off the listening process.
     * @private
     */
    _startListening () {
        // If we've already set up the context, we can resume instead of doing all the setup again.
        if (this._context) {
            this._resumeListening();
        } else {
            this._initListening();
        }
        // Force the block to time out if we don't get any results back or the user didn't say anything.
        this._speechTimeoutId = setTimeout(this._stopTranscription, listenAndWaitBlockTimeoutMs);
    }

    /**
     * Resume listening for audio and re-open the socket to send data.
     * @private
     */
    _resumeListening () {
        this._context.resume();
        this._newWebsocket();
    }

    /**
     * Does all the setup to get microphone data and initializes the web socket
     * used to send that data to the speech server.
     * @private
     */
    _initListening () {
        this._initializeMicrophone();
        this._initScriptNode();
        this._newWebsocket();
    }

    /**
     * Initialize the audio context and connect the microphone.
     * @private
     */
    _initializeMicrophone () {
        this._context = new AudioContext();
        this._audioPromise = navigator.mediaDevices.getUserMedia({
            audio: {
                echoCancellation: true,
                channelCount: 1,
                sampleRate: {
                    ideal: 16000
                },
                sampleSize: 16
            }
        });
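        // Browsers treat the ideal 16kHz sample rate as a hint and may ignore it;
        // the context's actual sample rate is what gets sent in the socket setup message.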

        const tempContext = this._context;
        this._audioPromise.then(micStream => {
            const microphone = tempContext.createMediaStreamSource(micStream);
            const analyser = tempContext.createAnalyser();
            microphone.connect(analyser);
        }).catch(e => {
            log.error(`Problem connecting to microphone: ${e}`);
        });
    }

    /**
     * Sets up the script processor node that forwards raw microphone data over the websocket.
     * @private
     */
    _initScriptNode () {
        // Create a node that sends raw bytes across the websocket.
        this._scriptNode = this._context.createScriptProcessor(4096, 1, 1);
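        // 4096-sample buffers, mono in, mono out; at a 48kHz context this fires
        // roughly every 85ms. ScriptProcessorNode is deprecated in favor of
        // AudioWorklet, but it was the widely supported option at the time.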
        this._scriptNode.addEventListener('audioprocess', this._processAudioCallback);
    }

    /**
     * Callback called when it is time to set up the new web socket.
     * @param {Function} resolve - function to call when the web socket opens successfully.
     * @param {Function} reject - function to call if opening the web socket fails.
     * @private
     */
    _newSocketCallback (resolve, reject) {
        this._socket = new WebSocket(serverURL);
        this._socket.addEventListener('open', resolve);
        this._socket.addEventListener('error', reject);
    }

    /**
     * Callback called once we've initially established that the web socket is open and working.
     * Sets up the callback for subsequent messages (i.e. transcription results) and
     * connects to the script node to get data.
     * @private
     */
    _socketMessageCallback () {
        this._socket.addEventListener('message', this._onTranscriptionFromServer);
        this._startByteStream();
    }

    /**
     * Sets up the callback for when the socket and audio are initialized.
     * @private
     */
    _newWebsocket () {
        const websocketPromise = new Promise(this._newSocketCallback);
        Promise.all([this._audioPromise, websocketPromise])
            .then(this._setupSocketCallback)
            .catch(e => {
                log.error(`Problem with setup: ${e}`);
            });
    }

    /**
     * Callback to handle the initial setting up of a socket.
     * Currently we send a setup message (the sample rate and the phrase list) but it might
     * be useful to send more data so we can do quota stuff.
     * @param {Array} values The resolved values from the audio and websocket promises:
     *     the microphone media stream and the websocket 'open' event.
     * @private
     */
    _setupSocketCallback (values) {
        this._micStream = values[0];
        this._socket = values[1].target;

        this._socket.addEventListener('error', e => {
            log.error(`Error from web socket: ${e}`);
        });

        // Send the initial configuration message. When the server acknowledges
        // it, start streaming the audio bytes to the server and listening for
        // transcriptions.
        this._socket.addEventListener('message', this._socketMessageCallback, {once: true});
        this._socket.send(JSON.stringify(
            {
                sampleRate: this._context.sampleRate,
                phrases: this._phraseList
            }
        ));
    }

    /**
     * Do setup so we can start streaming mic data.
     * @private
     */
    _startByteStream () {
        // Hook up the scriptNode to the mic.
        this._sourceNode = this._context.createMediaStreamSource(this._micStream);
        this._sourceNode.connect(this._scriptNode);
        this._scriptNode.connect(this._context.destination);
    }

    /**
     * Called when we have data from the microphone. Takes that data and ships
     * it off to the speech server for transcription.
     * @param {AudioProcessingEvent} e The event with audio data in it.
     * @private
     */
    _processAudioCallback (e) {
        if (this._socket.readyState === WebSocket.CLOSED ||
            this._socket.readyState === WebSocket.CLOSING) {
            log.error(`Not sending data because not in ready state. State: ${this._socket.readyState}`);
            // TODO: Should we stop trying and reset state so it might work next time?
            return;
        }
        // Need the maximum value for 16-bit signed samples, to convert from float.
        const MAX_INT = Math.pow(2, 16 - 1) - 1;
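        // MAX_INT is 2^15 - 1 = 32767; e.g. a float sample of 0.5 scales to
        // 16383 (Int16Array.from truncates the scaled values toward zero).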
        const floatSamples = e.inputBuffer.getChannelData(0);
        // The samples are floats in range [-1, 1]. Convert to 16-bit signed integer.
        this._socket.send(Int16Array.from(floatSamples.map(n => n * MAX_INT)));
    }

    /**
     * The key to load & store a target's speech-related state.
     * @type {string}
     */
    static get STATE_KEY () {
        return 'Scratch.speech';
    }

    /**
     * @returns {object} Metadata for this extension and its blocks.
     */
    getInfo () {
        return {
            id: 'speech',
            name: 'Google Speech',
            menuIconURI: menuIconURI,
            blockIconURI: iconURI,
            blocks: [
                {
                    opcode: 'listenAndWait',
                    text: 'Listen and Wait',
                    blockType: BlockType.COMMAND
                },
                {
                    opcode: 'whenIHearHat',
                    text: 'When I hear [PHRASE]',
                    blockType: BlockType.HAT,
                    arguments: {
                        PHRASE: {
                            type: ArgumentType.STRING,
                            defaultValue: 'cat'
                        }
                    }
                },
                {
                    opcode: 'getSpeech',
                    text: 'speech',
                    blockType: BlockType.REPORTER
                }
            ]
        };
    }

    /**
     * Start the listening process if it isn't already in progress, playing a sound to indicate
     * when it starts and stops.
     * @return {Promise} A promise that will resolve when listening is complete.
     */
    listenAndWait () {
        // TODO: Look into the timing of when to start the sound. There currently seems
        // to be some lag between when the sound starts and when the socket message
        // callback is received. Perhaps we should play the sound after the socket is set up.
        // TODO: Question - Should we only play the sound if listening isn't already in progress?
        return this._playSound(this._startSoundBuffer).then(() => {
            this._phraseList = this._scanBlocksForPhraseList();
            this._utteranceForEdgeTrigger = '';
            const speechPromise = new Promise(resolve => {
                const listeningInProgress = this._speechPromises.length > 0;
                this._speechPromises.push(resolve);
                if (!listeningInProgress) {
                    this._startListening();
                }
            });
            return speechPromise.then(() => this._playSound(this._endSoundBuffer));
        });
    }

    /**
     * An edge-triggered hat block to listen for a specific phrase.
     * @param {object} args - the block arguments.
     * @return {boolean} true if the phrase matches what was transcribed.
     */
    whenIHearHat (args) {
        return this._speechMatches(args.PHRASE, this._utteranceForEdgeTrigger);
    }

    /**
     * Reporter for the last heard phrase/utterance.
     * @return {string} The latest thing we heard from a listen and wait block.
     */
    getSpeech () {
        return this._currentUtterance;
    }
}

module.exports = Scratch3SpeechBlocks;
src/extensions/scratch3_speech/manifest.js (new file, 4 lines)
@@ -0,0 +1,4 @@
module.exports = {
    'speech-rec-start.mp3': require('!buffer-loader!./assets/speech-rec-start.mp3'),
    'speech-rec-end.mp3': require('!buffer-loader!./assets/speech-rec-end.mp3')
};