Initial version of the speech extension. There is certainly still some stuff to figure out, but the core of it is here.

This commit is contained in:
picklesrus 2018-05-04 17:23:01 -07:00
parent a85f641d25
commit e7987a6a4c
5 changed files with 760 additions and 0 deletions

View file

@ -37,6 +37,7 @@
"canvas-toBlob": "1.0.0",
"copy-webpack-plugin": "^4.5.1",
"decode-html": "2.0.0",
"diff-match-patch": "^1.0.0",
"escape-html": "1.0.3",
"eslint": "^4.5.0",
"eslint-config-scratch": "^5.0.0",

View file

@ -0,0 +1,755 @@
const ArgumentType = require('../../extension-support/argument-type');
const Cast = require('../../util/cast');
const BlockType = require('../../extension-support/block-type');
const log = require('../../util/log');
const DiffMatchPatch = require('diff-match-patch');
/**
* Url of icon to be displayed at the left edge of each extension block.
* TODO: Find the final Icon. Replace it with the right format. data URI?
* @type {string}
*/
const iconURI = 'https://www.gstatic.com/images/icons/material/system/1x/mic_white_24dp.png';
/**
* Url of icon to be displayed in the toolbox menu for the extension category.
* TODO: Find the final Icon. Replace it with the right format. data URI?
* @type {string}
*/
const menuIconURI = 'https://www.gstatic.com/images/icons/material/system/1x/mic_grey600_24dp.png';
/**
* The url of the speech server.
* @type {string}
*/
const serverURL = 'wss://speech.scratch.mit.edu';
/**
* The amount of time to wait between when we stop sending speech data to the server and when
* we expect the transcription result marked with isFinal: true to come back from the server.
* @type {number}
*/
const finalResponseTimeoutDurationMs = 3000;
/**
* The maximum amount of time the Listen and Wait block will listen before it stops
* and waits for the final transcription result to come back from the server.
* Currently set to 10sec. This should not exceed the speech api limit (60sec) without redoing how
* we stream the microphone data.
* @type {number}
*/
const listenAndWaitBlockTimeoutMs = 10000;
/**
* The start and stop sounds, loaded as static assets.
* @type {object}
*/
let assetData = {};
try {
assetData = require('./manifest');
} catch (e) {
// Non-webpack environment, don't worry about assets.
}
class Scratch3SpeechBlocks {
constructor (runtime) {
/**
* The runtime instantiating this block package.
* @type {Runtime}
*/
this.runtime = runtime;
/**
* An array of phrases from the [when I hear] hat blocks. This list is sent
* to the speech api to seed the recognition engine and is used for deciding
* whether the transcription results match.
* @type {Array}
* @private
*/
this._phraseList = [];
/**
* The most recent transcription result received from the speech API that we decided to keep.
* This is the value returned by the reporter block.
* @type {String}
* @private
*/
this._currentUtterance = null;
/**
* Similar to _currentUtterance, but set back to '' at the beginning of each listen block.
* Used to get the hat blocks to edge trigger. In order to detect someone saying
* the same thing twice in two subsequent listen and wait blocks
* and still trigger the hat, we need this to go from
* '' at the beginning of the listen block to '<transcription value>' at the end.
* @type {string}
* @private
*/
this._utteranceForEdgeTrigger = null;
/**
* The list of queued `resolve` callbacks for 'Listen and Wait' blocks.
* We only listen for one utterance at a time. We may encounter multiple
* 'Listen and wait' blocks that tell us to start listening. If one starts
* and hasn't received results back yet, any further ones we encounter
* will all resolve when we get the next acceptable transcription result back.
* @type {!Array}
* @private
*/
this._speechPromises = [];
/**
* The id of the timeout that will run if we start listening and don't get any
* transcription results back, e.g. because we didn't hear anything.
* @type {number}
* @private
*/
this._speechTimeoutId = null;
/**
* The id of the timeout that runs after we're done listening while we are
* still waiting for a potential isFinal:true transcription result to come back.
* @type {number}
* @private
*/
this._speechFinalResponseTimeout = null;
/**
* The ScriptProcessorNode hooked up to the audio context.
* @type {ScriptProcessorNode}
* @private
*/
this._scriptNode = null;
/**
* The socket used to communicate with the speech server to send microphone data
* and receive transcription results.
* @type {WebSocket}
* @private
*/
this._socket = null;
/**
* The AudioContext used to manage the microphone.
* @type {AudioContext}
* @private
*/
this._context = null;
/**
* MediaStreamAudioSourceNode to handle microphone data.
* @type {MediaStreamAudioSourceNode}
* @private
*/
this._sourceNode = null;
/**
* A Promise whose fulfillment handler receives a MediaStream object when the microphone has been obtained.
* @type {Promise}
* @private
*/
this._audioPromise = null;
/**
* Audio buffer for the sound used to indicate that listening has started.
* @type {AudioBuffer}
* @private
*/
this._startSoundBuffer = null;
/**
* Audio buffer for the sound used to indicate that listening has ended.
* @type {AudioBuffer}
* @private
*/
this._endSoundBuffer = null;
/**
* Diff Match Patch is used to do some fuzzy matching of the transcription results
* with what is in the hat blocks.
*/
this._dmp = new DiffMatchPatch();
// Threshold for diff match patch to use: (0.0 = perfection, 1.0 = very loose).
this._dmp.Match_Threshold = 0.3;
// TODO: Come back and figure out which of these I really need.
this._newSocketCallback = this._newSocketCallback.bind(this);
this._setupSocketCallback = this._setupSocketCallback.bind(this);
this._socketMessageCallback = this._socketMessageCallback.bind(this);
this._processAudioCallback = this._processAudioCallback.bind(this);
this._onTranscriptionFromServer = this._onTranscriptionFromServer.bind(this);
this._resetListening = this._resetListening.bind(this);
this._stopTranscription = this._stopTranscription.bind(this);
this.runtime.on('PROJECT_STOP_ALL', this._resetListening.bind(this));
// Load in the start and stop listening indicator sounds.
this._loadUISounds();
}
/**
* Load the UI sounds played when listening starts and stops.
* @private
*/
_loadUISounds () {
const startSoundBuffer = assetData['speech-rec-start.mp3'].buffer;
this._decodeSound(startSoundBuffer).then(buffer => {
this._startSoundBuffer = buffer;
});
const endSoundBuffer = assetData['speech-rec-end.mp3'].buffer;
this._decodeSound(endSoundBuffer).then(buffer => {
this._endSoundBuffer = buffer;
});
}
/**
* Decode a sound and return a promise with the audio buffer.
* @param {ArrayBuffer} soundBuffer - a buffer containing the encoded audio.
* @return {Promise} - a promise which will resolve once the sound has decoded.
* @private
*/
_decodeSound (soundBuffer) {
const context = this.runtime.audioEngine && this.runtime.audioEngine.audioContext;
if (!context) {
return Promise.reject(new Error('No Audio Context Detected'));
}
// Check for newer promise-based API
if (context.decodeAudioData.length === 1) {
return context.decodeAudioData(soundBuffer);
} else { // eslint-disable-line no-else-return
// Fall back to callback API
return new Promise((resolve, reject) =>
context.decodeAudioData(soundBuffer,
buffer => resolve(buffer),
error => reject(error)
)
);
}
}
/**
* Play the given sound.
* @param {ArrayBuffer} buffer The audio buffer to play.
* @returns {Promise} A promise that resolves when the sound is done playing.
* @private
*/
_playSound (buffer) {
if (this.runtime.audioEngine === null) return;
const context = this.runtime.audioEngine.audioContext;
const bufferSource = context.createBufferSource();
bufferSource.buffer = buffer;
bufferSource.connect(this.runtime.audioEngine.input);
bufferSource.start();
return new Promise(resolve => {
bufferSource.onended = () => {
resolve();
};
});
}
/**
* Scans all the 'When I hear' hat blocks for each sprite and pulls out the text. The list
* is sent off to the speech recognition server as hints. This *only* reads the value out of
* the hat block shadow. If a block is dropped on top of the shadow, it is skipped.
* @returns {Array} list of strings from the hat blocks in the project.
* @private
*/
_scanBlocksForPhraseList () {
const words = [];
// For each target, walk through the top level blocks and check whether
// they are speech hat/when I hear blocks.
this.runtime.targets.forEach(target => {
target.blocks._scripts.forEach(id => {
const b = target.blocks.getBlock(id);
if (b.opcode === 'speech.whenIHearHat') {
// Grab the text from the hat block's shadow.
const inputId = b.inputs.PHRASE.block;
const inputBlock = target.blocks.getBlock(inputId);
// Only grab the value from text blocks. This means we'll
// miss some. e.g. values in variables or other reporters.
if (inputBlock.opcode === 'text') {
const word = target.blocks.getBlock(inputId).fields.TEXT.value;
words.push(word);
}
}
});
});
return words;
}
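// Illustrative only (simplified from the lookups above): a hat block shaped like
// {opcode: 'speech.whenIHearHat', inputs: {PHRASE: {block: '<shadowId>'}}} whose shadow
// block is {opcode: 'text', fields: {TEXT: {value: 'cat'}}} contributes 'cat' to the list.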
/**
* Resets all things related to listening. Called on Red Stop sign button.
* - suspends audio processing
* - closes socket with speech socket server
* - clears out any remaining speech blocks that are waiting.
* @private
*/
_resetListening () {
this._stopListening();
this._closeWebsocket();
this._resolveSpeechPromises();
}
/**
* Close the connection to the socket server if it is open.
* @private
*/
_closeWebsocket () {
if (this._socket && this._socket.readyState === this._socket.OPEN) {
this._socket.close();
}
}
/**
* Call to suspend getting data from the microphone.
* @private
*/
_stopListening () {
// Note that this can be called before any Listen And Wait block has run its setup,
// so check that things exist before disconnecting them.
if (this._context) {
this._context.suspend();
}
// This is called on green flag to reset things that may never have existed
// in the first place. Do a bunch of checks.
if (this._scriptNode) {
this._scriptNode.disconnect();
}
if (this._sourceNode) {
this._sourceNode.disconnect();
}
}
/**
* Resolves all the speech promises we've accumulated so far and empties out the list.
* @private
*/
_resolveSpeechPromises () {
for (let i = 0; i < this._speechPromises.length; i++) {
const resFn = this._speechPromises[i];
resFn();
}
this._speechPromises = [];
}
/**
* Called when we want to stop listening (e.g. when a listen block times out)
* but we still want to wait a little to see if we get any transcription results
* back before yielding the block execution.
* @private
*/
_stopTranscription () {
this._stopListening();
if (this._socket && this._socket.readyState === this._socket.OPEN) {
this._socket.send('stopTranscription');
}
// Give it a couple seconds to respond before giving up and assuming nothing else will come back.
this._speechFinalResponseTimeout = setTimeout(this._resetListening, finalResponseTimeoutDurationMs);
}
/**
* Decides whether to keep a given transcription result.
* @param {number} fuzzyMatchIndex Index of the fuzzy match or -1 if there is no match.
* @param {object} result The json object representing the transcription result.
* @param {string} normalizedTranscript The transcription text used for matching (i.e. lowercased, no punctuation).
* @returns {boolean} true if a result is good enough to be kept.
* @private
*/
_shouldKeepResult (fuzzyMatchIndex, result, normalizedTranscript) {
// The threshold above which we decide transcription results are unlikely to change again.
// See https://cloud.google.com/speech-to-text/docs/basics#streaming_responses.
const stabilityThreshold = 0.85;
// For responsiveness of the When I Hear hat blocks, sometimes we want to keep results that are not
// yet marked 'isFinal' by the speech api. Here are some signals we use.
// If the result from the speech api isn't very stable and we only had a fuzzy match, we don't want to use it.
const shouldKeepFuzzyMatch = fuzzyMatchIndex !== -1 && result.stability > stabilityThreshold;
// If the result is in the phraseList (i.e. it matches one of the 'When I Hear' blocks), we keep it.
// This might be aggressive... but so far seems to be a good thing.
const shouldKeepPhraseListMatch = this._phraseList.includes(normalizedTranscript);
if (!result.isFinal && !shouldKeepPhraseListMatch && !shouldKeepFuzzyMatch) {
return false;
}
return true;
}
/**
* Normalizes text a bit to facilitate matching. Lowercases, removes some punctuation and whitespace.
* @param {string} text The text to normalize.
* @returns {string} The normalized text.
* @private
*/
_normalizeText (text) {
text = Cast.toString(text).toLowerCase();
text = text.replace(/[.?!]/g, '');
text = text.trim();
return text;
}
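// Example (illustrative): _normalizeText(' Go LEFT! ') returns 'go left'. Only '.', '?'
// and '!' are stripped, so other punctuation such as commas is left in place.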
/**
* Call into diff match patch library to compute whether there is a fuzzy match.
* @param {string} text The text to search in.
* @param {string} pattern The pattern to look for in text.
* @returns {number} The index of the match or -1 if there isn't one.
*/
_computeFuzzyMatch (text, pattern) {
// Don't bother matching if any are null.
if (!pattern || !text) {
return -1;
}
let match = -1;
try {
// Look for the pattern in the text starting at position 0.
match = this._dmp.match_main(text, pattern, 0);
} catch (e) {
// This can happen if the text or pattern gets too long. If so, just do a plain substring match.
return pattern.indexOf(text);
}
return match;
}
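// Illustrative example: _computeFuzzyMatch('i can say jump or run', 'jump') returns 10
// (an exact substring match), while a pattern nowhere near the text returns -1. Because
// Match_Threshold is set to 0.3 in the constructor, close-but-inexact patterns can also match.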
/**
* Processes the results we get back from the speech server. Decides whether the results
* are good enough to keep. If they are, resolves the 'Listen and Wait' blocks promise and cleans up.
* @param {object} result The transcription result.
* @private
*/
_processTranscriptionResult (result) {
log.info(`Got result: ${JSON.stringify(result)}`);
const transcriptionResult = this._normalizeText(result.alternatives[0].transcript);
// Waiting for an exact match is not satisfying. It makes it hard to catch
// things like homonyms or things that sound similar (e.g. "let us" vs "lettuce"). Using the fuzzy matching helps
// more aggressively match the phrases that are in the "When I hear" hat blocks.
const phrases = this._phraseList.join(' ');
const fuzzyMatchIndex = this._computeFuzzyMatch(phrases, transcriptionResult);
let fuzzyMatchResult = null;
if (fuzzyMatchIndex !== -1) {
fuzzyMatchResult = transcriptionResult.substring(fuzzyMatchIndex, fuzzyMatchIndex + phrases.length);
}
// If the result isn't good enough yet, return without saving and resolving the promises.
if (!this._shouldKeepResult(fuzzyMatchIndex, result, transcriptionResult)) {
return;
}
// TODO: Decide whether this is the right thing.
// This sets the currentUtterance (which is returned by the reporter) to the fuzzy match if we had one.
// That means it'll often get set to a phrase from one of the 'when I hear' blocks instead of the
// full phrase that the user said.
if (fuzzyMatchResult) {
this._currentUtterance = fuzzyMatchResult;
} else {
this._currentUtterance = transcriptionResult;
}
log.info(`Keeping result: ${this._currentUtterance}`);
this._utteranceForEdgeTrigger = transcriptionResult;
// We're done listening so resolve all the promises and reset everything so we're ready for next time.
this._resetListening();
// We got results so clear out the timeouts.
if (this._speechTimeoutId) {
clearTimeout(this._speechTimeoutId);
this._speechTimeoutId = null;
}
if (this._speechFinalResponseTimeout) {
clearTimeout(this._speechFinalResponseTimeout);
this._speechFinalResponseTimeout = null;
}
}
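// Illustrative only: a transcription result with the shape this method and _shouldKeepResult
// read from (alternatives[0].transcript, stability, isFinal). Real server payloads may carry
// additional fields.
// {
//     "alternatives": [{"transcript": "Let us dance"}],
//     "stability": 0.9,
//     "isFinal": false
// }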
/**
* Handle a message from the socket. It contains transcription results.
* @param {MessageEvent} e The message event containing data from speech server.
* @private
*/
_onTranscriptionFromServer (e) {
let result = null;
try {
result = JSON.parse(e.data);
} catch (ex) {
log.error(`Problem parsing JSON, continuing: ${ex}`);
// TODO: Question - Should we kill listening and continue?
return;
}
this._processTranscriptionResult(result);
}
/**
* Decide whether the pattern given matches the text. Uses fuzzy matching.
* @param {string} pattern The pattern to look for. Usually this is the transcription result.
* @param {string} text The text to look in. Usually this is the set of phrases from the when I hear blocks.
* @returns {boolean} true if there is a fuzzy match.
* @private
*/
_speechMatches (pattern, text) {
pattern = this._normalizeText(pattern);
text = this._normalizeText(text);
const match = this._computeFuzzyMatch(text, pattern);
return match !== -1;
}
/**
* Kick off the listening process.
* @private
*/
_startListening () {
// If we've already set up the context, we can resume instead of doing all the setup again.
if (this._context) {
this._resumeListening();
} else {
this._initListening();
}
// Force the block to timeout if we don't get any results back/the user didn't say anything.
this._speechTimeoutId = setTimeout(this._stopTranscription, listenAndWaitBlockTimeoutMs);
}
/**
* Resume listening for audio and re-open the socket to send data.
* @private
*/
_resumeListening () {
this._context.resume();
this._newWebsocket();
}
/**
* Does all the setup to get microphone data and initializes the web socket used to send
* that data to the speech server.
* @private
*/
_initListening () {
this._initializeMicrophone();
this._initScriptNode();
this._newWebsocket();
}
/**
* Initialize the audio context and connect the microphone.
* @private
*/
_initializeMicrophone () {
this._context = new AudioContext();
this._audioPromise = navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
channelCount: 1,
sampleRate: {
ideal: 16000
},
sampleSize: 16
}
});
const tempContext = this._context;
this._audioPromise.then(micStream => {
const microphone = tempContext.createMediaStreamSource(micStream);
const analyser = tempContext.createAnalyser();
microphone.connect(analyser);
}).catch(e => {
log.error(`Problem connecting to microphone: ${e}`);
});
}
/**
* Sets up the script processor node that handles data coming from the microphone.
* @private
*/
_initScriptNode () {
// Create a node that sends raw bytes across the websocket
this._scriptNode = this._context.createScriptProcessor(4096, 1, 1);
// The audioprocess callback converts the float samples to 16-bit ints before sending them to the server.
this._scriptNode.addEventListener('audioprocess', this._processAudioCallback);
}
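// For scale (illustrative): with a 4096-sample buffer and a typical 44100 Hz context,
// each audioprocess event carries roughly 93 ms of audio (4096 / 44100 ≈ 0.093 s).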
/**
* Callback called when it is time to set up the new web socket.
* @param {Function} resolve - function to call when the web socket opens successfully.
* @param {Function} reject - function to call if opening the web socket fails.
* @private
*/
_newSocketCallback (resolve, reject) {
this._socket = new WebSocket(serverURL);
this._socket.addEventListener('open', resolve);
this._socket.addEventListener('error', reject);
}
/**
* Callback called once we've initially established the web socket is open and working.
* Sets up the callback for subsequent messages (i.e. transcription results) and
* connects to the script node to get data.
* @private
*/
_socketMessageCallback () {
this._socket.addEventListener('message', this._onTranscriptionFromServer);
this._startByteStream();
}
/**
* Sets up callback for when socket and audio are initialized.
* @private
*/
_newWebsocket () {
const websocketPromise = new Promise(this._newSocketCallback);
Promise.all([this._audioPromise, websocketPromise]).then(
this._setupSocketCallback)
.catch(e => {
log.error(`Problem with setup: ${e}`);
});
}
/**
* Callback to handle initial setting up of a socket.
* Currently we send a setup message (only contains sample rate) but might
* be useful to send more data so we can do quota stuff.
* @param {Array} values The resolved values from Promise.all: the microphone media stream,
* followed by the websocket 'open' event whose target is the socket.
* @private
*/
_setupSocketCallback (values) {
this._micStream = values[0];
this._socket = values[1].target;
this._socket.addEventListener('error', e => {
log.error(`Error from web socket: ${e}`);
});
// Send the initial configuration message. When the server acknowledges
// it, start streaming the audio bytes to the server and listening for
// transcriptions.
this._socket.addEventListener('message', this._socketMessageCallback, {once: true});
this._socket.send(JSON.stringify(
{
sampleRate: this._context.sampleRate,
phrases: this._phraseList
}
));
}
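// Illustrative example of the configuration message sent above, assuming a 44100 Hz
// AudioContext and two hat-block phrases in the project:
// {"sampleRate": 44100, "phrases": ["cat", "jump"]}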
/**
* Do setup so we can start streaming mic data.
* @private
*/
_startByteStream () {
// Hook up the scriptNode to the mic
this._sourceNode = this._context.createMediaStreamSource(this._micStream);
this._sourceNode.connect(this._scriptNode);
this._scriptNode.connect(this._context.destination);
}
/**
* Called when we have data from the microphone. Takes that data and ships
* it off to the speech server for transcription.
* @param {audioProcessingEvent} e The event with audio data in it.
* @private
*/
_processAudioCallback (e) {
if (this._socket.readyState === WebSocket.CLOSED ||
this._socket.readyState === WebSocket.CLOSING) {
log.error(`Not sending data because not in ready state. State: ${this._socket.readyState}`);
// TODO: should we stop trying and reset state so it might work next time?
return;
}
const MAX_INT = Math.pow(2, 16 - 1) - 1;
const floatSamples = e.inputBuffer.getChannelData(0);
// The samples are floats in range [-1, 1]. Convert to 16-bit signed
// integer.
this._socket.send(Int16Array.from(floatSamples.map(n => n * MAX_INT)));
}
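// Worked example of the conversion above (illustrative): with MAX_INT = 32767, a float
// sample of 1.0 becomes 32767, -1.0 becomes -32767, and 0.25 becomes
// 0.25 * 32767 = 8191.75, which Int16Array.from truncates to 8191.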
/**
* The key to load & store a target's speech-related state.
* @type {string}
*/
static get STATE_KEY () {
return 'Scratch.speech';
}
/**
* @returns {object} Metadata for this extension and its blocks.
*/
getInfo () {
return {
id: 'speech',
name: 'Google Speech',
menuIconURI: menuIconURI,
blockIconURI: iconURI,
blocks: [
{
opcode: 'listenAndWait',
text: 'Listen and Wait',
blockType: BlockType.COMMAND
},
{
opcode: 'whenIHearHat',
text: 'When I hear [PHRASE]',
blockType: BlockType.HAT,
arguments: {
PHRASE: {
type: ArgumentType.STRING,
defaultValue: 'cat'
}
}
},
{
opcode: 'getSpeech',
text: 'speech',
blockType: BlockType.REPORTER
}
]
};
}
/**
* Start the listening process if it isn't already in progress, playing a sound to indicate
* when it starts and stops.
* @return {Promise} A promise that will resolve when listening is complete.
*/
listenAndWait () {
// TODO: Look into the timing of when to start the sound. There currently seems
// to be some lag between when the sound starts and when the socket message
// callback is received. Perhaps we should play the sound after the socket is setup.
// TODO: Question - Should we only play the sound if listening isn't already in progress?
return this._playSound(this._startSoundBuffer).then(() => {
this._phraseList = this._scanBlocksForPhraseList();
this._utteranceForEdgeTrigger = '';
const speechPromise = new Promise(resolve => {
const listeningInProgress = this._speechPromises.length > 0;
this._speechPromises.push(resolve);
if (!listeningInProgress) {
this._startListening();
}
});
return speechPromise.then(() => this._playSound(this._endSoundBuffer));
});
}
/**
* An edge triggered hat block to listen for a specific phrase.
* @param {object} args - the block arguments.
* @return {boolean} true if the phrase matches what was transcribed.
*/
whenIHearHat (args) {
return this._speechMatches(args.PHRASE, this._utteranceForEdgeTrigger);
}
/**
* Reporter for the last heard phrase/utterance.
* @return {string} The latest thing we heard from a listen and wait block.
*/
getSpeech () {
return this._currentUtterance;
}
}
module.exports = Scratch3SpeechBlocks;
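// Minimal usage sketch (illustrative; assumes a scratch-vm Runtime instance named `runtime`
// and that the runtime's extension support registers the class, which is not shown here):
// const speech = new Scratch3SpeechBlocks(runtime);
// speech.getInfo();       // block metadata consumed by the VM/GUI
// speech.listenAndWait(); // resolves once a transcription is accepted or the block times out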

View file

@ -0,0 +1,4 @@
module.exports = {
'speech-rec-start.mp3': require('!buffer-loader!./assets/speech-rec-start.mp3'),
'speech-rec-end.mp3': require('!buffer-loader!./assets/speech-rec-end.mp3')
};