From 85da0ac5e3b13c1270d2a42a8f257a29e4067cd8 Mon Sep 17 00:00:00 2001 From: picklesrus Date: Fri, 4 May 2018 17:23:01 -0700 Subject: [PATCH] Initial version of the speech extension. There is certainly still some stuff to figure out but the core of it is here. --- package.json | 1 + .../scratch3_speech/assets/speech-rec-end.mp3 | Bin 0 -> 5634 bytes .../assets/speech-rec-start.mp3 | Bin 0 -> 5947 bytes src/extensions/scratch3_speech/index.js | 755 ++++++++++++++++++ src/extensions/scratch3_speech/manifest.js | 4 + 5 files changed, 760 insertions(+) create mode 100644 src/extensions/scratch3_speech/assets/speech-rec-end.mp3 create mode 100644 src/extensions/scratch3_speech/assets/speech-rec-start.mp3 create mode 100644 src/extensions/scratch3_speech/index.js create mode 100644 src/extensions/scratch3_speech/manifest.js diff --git a/package.json b/package.json index 37656764f..3ec63b3e4 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,7 @@ "canvas-toBlob": "1.0.0", "copy-webpack-plugin": "4.2.1", "decode-html": "2.0.0", + "diff-match-patch": "^1.0.0", "escape-html": "1.0.3", "eslint": "^4.5.0", "eslint-config-scratch": "^5.0.0", diff --git a/src/extensions/scratch3_speech/assets/speech-rec-end.mp3 b/src/extensions/scratch3_speech/assets/speech-rec-end.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..eaf6556e6cc7f8f146486084af61ca16d6c22f4c GIT binary patch literal 5634 zcmd_uS5Q+=xB&1FdW#|SDkQW(K%|L)bdX*{6X`{W(mRSK(yR0qiaB??WfJ0!UY2T3@@Ja488~4?KD_x2>Ms2T@v}`U&hWurXM{jOo>&81V8*WM&)O}{ zxThw@rwV;0t5?!CyMyhI>hJD;Jj_wG?92E__HG;_y_LX(QcwZugA`kUbvMQ2QkWZL zgaf>T=Au787u@)!#stxbJw%PwU1&$MWW&#czyY`ridE^tpn!LDB`dGWtb}FO zrHlSzSD@Y(Ht+PrAT3Z0;ZccJH-!uU)5b79Fcn>YdWHT7bu7kw-Hk#&Ny0SrDJ*)W zHF~AGiRot6ScI-h!J>uuOm$2llq?BI z1D4u@4v~*T4>-^O7CJg>^BOcCh?|w`68RL}M~h~CUwJiM_4b?a@+S&!3j=Ytg@Wim zzD8>eNKlAqVTFXkZqU2`cw?nADr3}WaJ|OXYDi+>*>O3QpNN6IKZ5Gap?^50{c=a} zV2Lf~7xUV@-XKkP8ygH?ZjxzsQ;*`}iuFA?1L->s7X`hG{(#su+d|S(b>W^WpFVrIcZTfWkrsE(7$oB57 
z)c&b8TbSbLW+ciCul-C2+oGdo`BWjItYj|{MoKw49F($t#lT_fZGO9CB-P?IvLM%m zEgGj3`-|nfm9``m&CR23rkSaP*vTw;p&s&SKp#7RrM=Q($CVnY{R|bICsePCG<_?V z1<>IMC{|Hsb)(K2kFsvtvS4E`yH2n5)ieF1Ay%G8i&xaBlwkEpnM_{A^iY9zJV?k@ToT+YKkksPrL z6nACU(#C9Hbsp-|P*FM}Dn%^H1wz=Nx-8RphnbYVidP9%9nJTSSG6#H?(TO!ViL2h z4e>OFkcf8-32R))INaVoDxJr>z?5B*Q+_|@pra=Ln@3E1`9vM6H{lAzpjo3SpV@4h z{AXY$_os#bV7>0*;+S*jNVNYCiwo1ZLA#B1E=6E3H#a;6NQKu?CEciOo>dHwwD;!E1$L#n_ zRO!s`O_``!rE&7GbDDOL|8%1^{f{wEz}1c>xR(w^FWw-^*UFfo@w7E-oDx?#No7-r zRh9$3WgD^NkRT;fI?b4AXUdmn@=TDb*$M_mPA_2H^3sSHbAT&`K~Sz@6OFMP7b)rz zm#XPnk?;21VW(3sdQ=;=-Oz8+7(yo-BPNcobrPvto-uGL^3X5Jx~wyv!ipD%*+g!2 ze2InTFn*0?9?(Jz-NYoB6K}tA(~*Qj%Zr}Bva!;6t2bqZXSD27fL|U=;!7pF-X2Q` zD((s`lsy6#Qa_~Uq}2gPmO2c}K8$~LK6_{Z;EhVF?Jl|sPzx%$5_E^c$JC{KVhvP* z=+X;_rI;@*KlC)TgX!kc8_6k~=`!>1_&5-a?QOWaPWEaaDW?GC-?A$N%%14tnVWyV zhPw5P5r;?A-i)CT+e*4sN>?uzqP$AvIh>JpU)l}1NNYv8t~0w4>AbOo2_-`wvM5T^ zd_y6Jh+H>_VWs3o+}~^H6&%F4xKjlPOR=VE7&B@8ta(VyS@P}cq_8uSq6+V}g6aU& zuPR&K+pyyBf$Rl`vgRT3mnXgYHkZO=#+Qj@{Wybil?>zPa^b?bmV75(jk>nFIKX0> z&GQ>tn3P=_c~*`8gV{*SkV`Jv#quZ0twVDK3zDFyx56f-*}Zk99NX1 z6HoV1op*fE(JJz-@K+#3A>sYUt`wEb&IN?(PkU@+zL2b?%MI~_3z={B>dwfm+kToG zyM<-uO`-$M9s!??#^lWWvMo3;aS*^~pjV2>k%Ny$u#5!MhzmCYb6^6>%dQ395Go;=ZIv7w^eG-gfW5Gg*jHdRyraWrL9C4X1j($4-M#UA%YZD-@8jB0Ql%P2AC z#KV7k&*Y`qTe}HO@BN&cIOI?iNh3Xjl}1$O zir?}M1{ed#$y1|PL&OhI@ra)Kod2123}D(7@Bl>3`46TjO&;c$Gg{P zLeuGolLC*dPi|CR&DZ2rI35$Uz9))`(F|dO?-TB?iH-TsdcC9^ zmFv;#A)c4i3={zl#mP z2;4XHH_C&C3(p^x-g1)|`V@MeLVvfP29;(zmzh`9o)MyR$`5jA;IKHs&gJ(>1-bc1 zQAhL6u+dV$gbT|=oLy6f@{zac-*+ConTjkLEuML~oaWh5{rzV!@P%!#+IsJTB+r8DhlX?w~Dkrmlq)vO3|PxGQN8=5;I~4jk0v&tTfj4qyaw?lM&b#PP~8NPp68 zQaHrc(LAv0x>5w=5iWM4KA6xJ&nDFI@(_H(C-Bzc+COH~TAcH_7sbaA|an5I~LS`u>2l zecF>9wH-nwcq6BTQj*Sc&vCGA{AvPPTCF^sgRe%3-qt!mMkJT+Ob{ppKOK0=k%)m! 
z#kz*TZxC`>7BYA`F+C#=OFuj%sb8DxgV23|*Yw(4mre9%y|WaBPYsN#z2a)h9+5(M z46_`Ebn6o2yyk|MJB+$vYeWb&*3ZYLa$>pxZo^1wElMK)k<{;MC)fo}4lp3t%^|g2 z_?V_E8lNl77xw~wRh|cq0S)$bx7uz3qZDhmx^gQ_MCpv<^0zWo*w(_o0aG$;CdAKP zP*3d7&`l7TRP8P*Zb>j*%bMVU*YD;sKxioObaZYt_`JEJUS>_Z7KTEw`AfdQ9W##6qN93k9Vx_7hiY35~p2^rdEZA6PEa+^gI8R z8UnEx=fP%yzHwTYFMZ;xz>K`(R*u{>MoFr3+kkxzvf6M)@Ynj)?j|;uWG9>Ty3FG| z^Y|bTHrT^d?}P5h%{J5g_1i<8Rl`8D3i-0hH&|nye&T#}X zH?Xf^`evu*Vy~RnD>eX2$$8NZ-4d^nxK_v2bul15jh5v2tcTwt;oYsJQ_~^kj|gt^ z;7s+sKCY&< zm$NXIqmaz@#WaQH;ebMJuI$aUCVqZ_m(1xLOaKijVrNzpAfi!*)l|a)iTEuTnqyd- zI%>b)G-+y*CILuvDP{;B+Ywb=jQHqc4!0L%))PBaJq*Tv8B5Q9CHp?p*ezfHI(obz z8XZrn=7#QcJ~$w|54?mmqWgBpc6u)7-{b+?}7VxY1DJkNVoo zO_A%TOQCX0|LPT*#WBTtM>rTICb<<`6FcosSDs_;YZrOf=el@$F8#d}zM zI`)hp64FT8c@~R{D~T)XB9(pxLumVI6D`9Qa?{zPmyO@rS7x}wG6!IQ4cDH)#XtA_)nP#*=^x?nk?r@GwTKl>D2jQgJG%OD&BYi1NW^-Ql^X#(4&G~j$07jYJV^WoRkPuEL`vg>kJD|cex7z*=nKSbi*2qz0_`(+S2pO{pU6Zx`-q*pHQR3Y+N8=CrYPJZoHnu9k3f=8 zyQ#UkH@hOf@A1xCUp>X>jZ^W?%GTtx(+7DyBC+<@V3xj&d*EWlC#NXCzM`Aq;RdKD zUTnl0Mm_W6GBqTH$e6(9xyN*(6aoYMVeb|TShafUmg8?n@nXQiEq;>Q6~-woEs8gt z@2EH$jbaRx3;y^AKTe`oTn|c}|20ptmbH_%&DXwD_siFKe~<)UzS=a+SS0D6r~Bgy zwC5lf@6z^F$KwIX42i3@ue-F@Zrk^Da=NCd)6>gWapjxw$*Y<;^kr{ZJ>!E$kbFTnPVm!Qgct#p#cg5V1X9xmv=q%I+rBZU{~{lx z@a=1y_TnB-Q^jYqaeeB!m&~l@MA+TpUKUw-&;ZcWDi1XtWX?f$$&cqbb9lVVi`g#@ zG-#XL^#o$x$Y?;hB5JxPE|%J_#wSpGe&j7i?LMUHeI?QJuVP5XDL8@9Pt;HE^Rc$)t(7e5<8t!=Dked5#5z zIm2(O5gp8+WOf69{pjW5;HIQoL5DyL7$Nh3qkg%7R{%p2>CQPnpcz8I==)sF*(OWkBQ>vgs z6i7fpP-%JMoBx+L^FFDovGk14q@0>m7?4650PagsNPv&l3E`P^p{vLDy0OK71 z)itJT{MTgu(Y$7S&GDMgKatl`uRXt3`_G$equ1uIZT*A2c76?i?eDwvwN-S~B}L?9 zeTh5&GbJew?iKYP8 zj?SrT!mn4HJTT-G6G{X?s9n$}GIf78*%?~#Q8W{rO;Ou%FC#)4;bi!V);qHUr#V5F zEW`zIYt;T>ZW3>F;_PZ$BySqn9ygUIEW#nv)Y8DUR85VvZioA3(ub)BpGpOhW%z&M zY1P&`XUuK>qPWiV4!X}+R}fC>?#2B&g+l4e@U!O=Ht?A6kE~AY_gx#e5mSq5ONllb7KOA(^C&9`27*OqA~L-M-YMifoUV!ipiV%m9ks zRiWIFqBsx>Nt~*?zq{UIqMqSx9j^+e;OijJMA8`K$Q4CJ5`xNH#IsY}1gF`b76Nm& zBDd``1mT8UrFT<(d((993G**%mD8Po3JNIt`AzbkEaAc;&39Pa&WOCNSz_LPn#97UNJs65maJ3Z 
zv-vq-6in4Y$jazUCe9(NQA0@ew6)(-hff`g#4#IaD|jfMGVR|J6nk}p3Njqauy8_x za~yVJ^~JpW1*PlHnA%DqM~|za_S0Jxn@%VDF^xqFA={iOFBUmYI%7gN&!otXPFhd- z)S!4|gw;lyD~Jr)|HwfFNtI7O5g&y=j~pQb&^o`AOKI{XBDhoe0}YjHMZ73BhEo!A ztjRyJ+J&8x@&V15`9y2#T;d}-DZbyNXIuZCphmH^oT3t6t}i+8RkLSkd$`F678F+i z^SvqF;00a&G_R5Bgb{M0k36FH&_y;=9UI!_cC(PuMjasZUZ*E{OTeEn*i5JePJiqY z`t2vw^@ueg&sEv{+tX7i#8M&x03a+8E+2y$Q4-vXMX&?%`LqH{2E{(532~mXL4*R? zA#vP*qbio7QdtfHWxAW%gn%hpCQi@A{L;!La~tqXlV1`&vpDzkuAhj|7(iEbTU_%A zAbWP2jgtJ))*V@&?UvO%w|AZ2U?a-UF#^r8vO6YiT{&L`Cz?*G|9sv5xf+5xvpsnM zJP?>>ZbwP*1kRsFx{0bJ6AOEd{))z*KN_I`&}Y0f!-|CVUOcGLZh+vSh%NcKMt6l?lEXfJkwy_lZP`AqH1zQB>Ov@yeg8ucw;H#m)RQk$0;pV78zS*oeU#$(IavZk!qR&**Ci+3RU&H)+TUW}@4SSi8NZlU(`dnZDf| zwp~}^#iQu9Y?z|hLD=>mR(=Cc}oIK4aDwjkP4F}!f7j@(ZbL0!9{qaLqnQ5ah6$xY-M z97ssaUIr$G5KP1(2*GN#ZKnlKQJ?s>2ck6@Zkxeq^jxl>GSW=~4{rVvZe3>zg910} ziWY04Bof4|<;H2IqmHTl?z1k|yO^k#HCnfgW^%0yPaNWcMB9%?0u(QnDAT>w)7wz*?%PbwNo(;pOND}JBwP0tqgi?nC8Z%ZkhK+k z6Zf3=Sqi+mRugZVNh^44E-H=PIpE3-pgAw}qwOwQf2-H|t$ANH=7tXSO>0ZIRIZn6 zhJEgDr^OK{chAjTiR}BP>4(RTi*xqYmtQlyOTO2DCGtn_c{peh7m0o(*g>kc3G0t(8Yd zLJb7&X_5r9=+FDzrDP7jYglRcleaCAAd2M(MNi>II~>EDo(n?Up}Q^8m`KKE=s?H z^6t8s!Tws|0PiOD;x$sLrQRa&S0EjXG_wKM|Au5-Y)v?0J`h1c3Kk~R=aZ+-%V~?u zH`eYq^~?SdDW6VNG1m0UVhNr^Q|+WIC7;mQjH|WmDce<7*fe$o4T_bq$Qky!-LkBR z{}JCz_-*VVrB#;Mu0#*pIU%Cn{gbK=;INdM7=zRa;UAwXGm-sX@bbBCpw!UY+UbN~nz3 zEi$M>2o9`Lz~4uxhTeb{(PZk%1g5_ zLmf!1VV5Yf5&niSx8PA^rZNjKDoPEQp5v&Z%u${YJi%=$zosjsE@-izp_z9^#;6JEnY&A*S|X)<@d;rAARY2M9b)-iJ6Ba$3_~#iPdTn$f77y zL8tx8k~2eV>MjW1x7%c#G3E^I9HbnbpPCmLmGr*XEN%LTasxpzH#B?aJHHN@{UN1} zS+{N=lgGK>@z{3w4(UCI2F)-Ah)v;@~VmuUrR?1VGDPW`^Z3`>Y^^yo}{c_O*-wiL(-v@kM`M zHTdjlxqzNFK&cqWHG~4Qs!CxNN!q7m7P&k|Co4ECj*LwIw4V~K4Jcs9z+z(($HVL$ zHN@10({2VWuDJ7Sewe1ZrMDWvcCv3g>rlPdq`*QsZSfQ%G^iHAXeN8gP#Vil(ynxn zUATE!9;XrN+xG??XgP2r#U>W54)#vt+D#?Y|gQhb^#u5)XrTWC1voN$-j~u^A zxmEmeIJ*1N&ueq%OCUEUt{tY$wAXAB?)!V{vjfv7TA}#B8r_inz5RrrwN809wfCM_ zt|(;>X0JoiV=6!vMH7Q);$ql!;BElGKKCrJbqfO2ReP72sCwqadiz1*uJkG)*nY1d 
zqHx#7fGqJ3AbB|-#va!axS&K9B{_A!`hHeAf2ns+0G?~f`of3*2w6e*J0}l`-nhUY zPyIrOEM|VtJHPb)h|to>Tq)abQg+blltn*HHRX2l3i^=t`ttL^L8xz5KmHBx)$fIu zXs@zmEL~vFRxlkShy+sjoVBxOZcv{iG)6yaM7!RKVJW8M7=>297vsD7K(y*5U9(^p z%Ht9?T>R|q2*9=TW5u5GlV*Q^ICVuI_#c0Di2cpub>{zq} zn+W2)ZdkXMalPkZW@gI`N&eU4z>(m$WhU6-fX`gl%MqYwBmvJNhPKA37+Qy`0qVA3 zmFNW~W#`BGn{R!4v-<8EcV080CJ8V|`u*ia{h{uBuf@U5cPou}JjUfc6;m~#qmJNw zY-+pQ_U>VU3hT0?Jv8}j>z(`8rp_FS6X=bD_j`Vp;$~khmSn7eC6;QRwPM`#Z9!Z7~wQNK3CJtyQ7jHs|{sRzXLP z*ysXj8P9U3$fujbuJo^$7EA!}-%`bK_&F#_BE#cK0^YGeK2DSCN@&l>bwL;#Nb;eT zmq<~2dn9<0-;G+@`y%bw^c)wPm?c=+`Pz$V85F>!k04HD=s-yvP2&q`PbFfDO~X*Q zh@t|F`V5?F$`<^nm%cRJEl^`?tEvUp`-Wv(Jc?dt2*v4o+~gK3*2>KZotU4g4AE-j zt(&gn377PoeTW#FldLY}WSnSxp7e2EiI+D?)~76Nk*$-Qr4G(xb z;-MuXkU;+QS%8epb=eU`f-TYvu!`iIR5b#MSwQ1{s{{wlXSDHIY?K{acAZJ%#nXfU zQX)BWNBVcJnUVvup@t{7j#?_liwi6C=>U}!rtm^WLZPyW4vfLDQy3%qdFu_D2W!R} z;iF7)83Lg%WUGI#VpuBEn0u_U;P*N2KkKrPm%=aPC*_w2Ya4gNZ7w6~se+;(blYas zIKGIlEJ`X9vYAVSuymc{$kNx!=c5Ip$5VVN%^l37P1^zEE^OoHcb}{%=(4A3ad>%l z(eV@;f0|R_{pAIFQk>|l%0ku6*~R6hZzTDF6Dk!aL;8hSey-)AuAjVv%UW`<(X%KZ z@h8=)p-~N9X-)x`ojpw-vV?lcQI^lTF)++i#Cvg4fHQ0TeLp+6n@rz+6eo$F-RRMj z&~%3aNAZ3AG#%;b>+AtWf~a5WVe`GK|1=1MX8~cQtX|9Ws`oh+?1_DjGTX@J>sv`k zveiG`9-pO(YZ!W{SE{2KNH#8`X#2om;6P+i9y-ipQlQHp0%msB{3zr_)$;0|?j125 zCH(3=JlnbU_-ZK%dKZHkc9yL=7!5FPDdVB@)Dv>8kHBBH#Aa6g{X@+U59ZqJ(JF8m%?7{aF<19?P1d0eEbd#+ff`{Mb`*4dCIJnC2Lo^wLA zzYl|F1RnpiL@e3*zneD5#K-{4&N$qfV!tqxU%yKKikNsgXpH_nr( z;GTn^@rCg6Y0U6TWlf-L?Wk^j=i#FU3(L@Y8%d`lyPl*u1Ucw2=iQOIz|<<~Z>qPr z*b>&i>t#Zg9cF9H9j7be+Jc4sqUiHc+hX!3IGXjlfhQ3=H;9t*+un~zaYnYN+`8A7p(}>etuY#pULPnohRxs-6fY4 zI={T7k>R%*|#kD95Luk(MF=PdO}RSAqUA?Go?ZR{!KM^bi20$RYzG8!Di zJ3(zwgK}&C($1C45tgD7ke~19D&^+Y@1`}1{M>qziOP;WMXf64aeaz--WyVPTe4aY zLUvjTqoeTL>+FF<()5CL{syl4k`<}cGC+cQVXQYbN%Tr@A!yCQkGx)e>Q7<7D%En- z&t^U+eUKUbz#-5C zn%-ZF!g>KdloZKl`2G(}Nff5~RWn>r7ANlS1IhR0K8|MpzR$k8P|x>LZw}s{(#zQM zL7cKlr9Qyzc@Eb}#p7RYD{lb}D-CA3bXL9TAM;zUtB%ty7}&D-N!Fv_b$~ zLT0JS#evW8>+AtW-l`~+`!fvy#1L=4i3SlfrBHzY;23QQA_+0j$W|FaOrcYkSce+1 
ztSPTk!4%(t8hH~FcnqmeJL|Ulq`LBDl`n-ZpXXN8!`NHe-+G#>2$GlGQP zH`xvEYoum8kh{gKDb9Y)LZj~|ZpP&AgY3rUYA~Z-HQlkK;zbfA)w)BuxGn_;b(*m}me%Q2uw1m;(TiiUa_N L|Cf9JFT?)@FILcA literal 0 HcmV?d00001 diff --git a/src/extensions/scratch3_speech/index.js b/src/extensions/scratch3_speech/index.js new file mode 100644 index 000000000..873c10032 --- /dev/null +++ b/src/extensions/scratch3_speech/index.js @@ -0,0 +1,755 @@ +const ArgumentType = require('../../extension-support/argument-type'); +const Cast = require('../../util/cast'); +const BlockType = require('../../extension-support/block-type'); +const log = require('../../util/log'); +const DiffMatchPatch = require('diff-match-patch'); + + +/** + * Url of icon to be displayed at the left edge of each extension block. + * TODO: Find the final Icon. Replace it with the right format. data URI? + * @type {string} + */ +const iconURI = 'https://www.gstatic.com/images/icons/material/system/1x/mic_white_24dp.png'; + +/** + * Url of icon to be displayed in the toolbox menu for the extension category. + * TODO: Find the final Icon. Replace it with the right format. data URI? + * @type {string} + */ +const menuIconURI = 'https://www.gstatic.com/images/icons/material/system/1x/mic_grey600_24dp.png'; + +/** + * The url of the speech server. + * @type {string} + */ +const serverURL = 'wss://speech.scratch.mit.edu'; + +/** + * The amount of time to wait between when we stop sending speech data to the server and when + * we expect the transcription result marked with isFinal: true to come back from the server. + * @type {int} + */ +const finalResponseTimeoutDurationMs = 3000; + +/** + * The amount of time to wait between when we stop sending speech data to the server and when + * we expect the transcription result marked with isFinal: true to come back from the server. + * Currently set to 10sec. This should not exceed the speech api limit (60sec) without redoing how + * we stream the microphone data data. 
+ * @type {int} + */ +const listenAndWaitBlockTimeoutMs = 10000; + +/** + * The start and stop sounds, loaded as static assets. + * @type {object} + */ +let assetData = {}; +try { + assetData = require('./manifest'); +} catch (e) { + // Non-webpack environment, don't worry about assets. +} + +class Scratch3SpeechBlocks { + constructor (runtime) { + /** + * The runtime instantiating this block package. + * @type {Runtime} + */ + this.runtime = runtime; + + /** + * An array of phrases from the [when I hear] hat blocks. + * It is refreshed each time a listen and wait block starts. This list is sent + * to the speech api to seed the recognition engine and for deciding + * whether the transcription results match. + * @type {Array} + * @private + */ + this._phraseList = []; + + /** + * The most recent transcription result received from the speech API that we decided to keep. + * This is the value returned by the reporter block. + * @type {String} + * @private + */ + this._currentUtterance = null; + + /** + * Similar to _currentUtterance, but set back to '' at the beginning of listening block. + * Used to get the hat blocks to edge trigger. In order to detect someone saying + * the same thing twice in two subsequent listen and wait blocks + * and still trigger the hat, we need this to go from + * '' at the beginning of the listen block to the transcription result at the end. + * @type {string} + * @private + */ + this._utteranceForEdgeTrigger = null; + + /** + * The list of queued `resolve` callbacks for 'Listen and Wait' blocks. + * We only listen for one utterance at a time. We may encounter multiple + * 'Listen and wait' blocks that tell us to start listening. If one starts + * and hasn't received results back yet, when we encounter more, any further ones + * will all resolve when we get the next acceptable transcription result back.
+ * @type {!Array} + * @private + */ + this._speechPromises = []; + + /** + * The id of the timeout that will run if we start listening and don't get any + * transcription results back. e.g. because we didn't hear anything. + * @type {number} + * @private + */ + this._speechTimeoutId = null; + + /** + * The id of the timeout that runs after we're done listening but + * are still waiting for a potential isFinal:true transcription result to come back. + * @type {number} + * @private + */ + this._speechFinalResponseTimeout = null; + + /** + * The ScriptProcessorNode hooked up to the audio context. + * @type {ScriptProcessorNode} + * @private + */ + this._scriptNode = null; + + /** + * The socket used to communicate with the speech server to send microphone data + * and receive transcription results. + * @type {WebSocket} + * @private + */ + this._socket = null; + + /** + * The AudioContext used to manage the microphone. + * @type {AudioContext} + * @private + */ + this._context = null; + + /** + * MediaStreamAudioSourceNode to handle microphone data. + * @type {MediaStreamAudioSourceNode} + * @private + */ + this._sourceNode = null; + + /** + * A Promise whose fulfillment handler receives a MediaStream object when the microphone has been obtained. + * @type {Promise} + * @private + */ + this._audioPromise = null; + + /** + * Decoded audio buffer for the sound that indicates listening has started. + * @type {AudioBuffer} + * @private + */ + this._startSoundBuffer = null; + + /** + * Decoded audio buffer for the sound that indicates listening has ended. + * @type {AudioBuffer} + * @private + */ + this._endSoundBuffer = null; + + + /** + * Diff Match Patch is used to do some fuzzy matching of the transcription results + * with what is in the hat blocks. + */ + this._dmp = new DiffMatchPatch(); + // Threshold for diff match patch to use: (0.0 = perfection, 1.0 = very loose).
+ this._dmp.Match_Threshold = 0.3; + + // Come back and figure out which of these I really need. + this._newSocketCallback = this._newSocketCallback.bind(this); + this._setupSocketCallback = this._setupSocketCallback.bind(this); + this._socketMessageCallback = this._socketMessageCallback.bind(this); + this._processAudioCallback = this._processAudioCallback.bind(this); + this._onTranscriptionFromServer = this._onTranscriptionFromServer.bind(this); + this._resetListening = this._resetListening.bind(this); + this._stopTranscription = this._stopTranscription.bind(this); + + + this.runtime.on('PROJECT_STOP_ALL', this._resetListening.bind(this)); + + // Load in the start and stop listening indicator sounds. + this._loadUISounds(); + } + + /** + * Load the UI sounds played when listening starts and stops. Missing assets are skipped; decode failures are logged. + * @private + */ + _loadUISounds () { + const loadSound = (assetName, property) => { + // Outside webpack the manifest is unavailable and assetData stays {}; skip instead of throwing. + if (!assetData[assetName]) return; + this._decodeSound(assetData[assetName].buffer).then(buffer => { + this[property] = buffer; + }).catch(e => log.error(`Problem loading sound ${assetName}: ${e}`)); + }; + loadSound('speech-rec-start.mp3', '_startSoundBuffer'); + loadSound('speech-rec-end.mp3', '_endSoundBuffer'); + } + + /** + * Decode a sound and return a promise with the audio buffer. + * @param {ArrayBuffer} soundBuffer - a buffer containing the encoded audio. + * @return {Promise} - a promise which will resolve once the sound has decoded.
+ * @private + */ + _decodeSound (soundBuffer) { + const context = this.runtime.audioEngine && this.runtime.audioEngine.audioContext; + + if (!context) { + return Promise.reject(new Error('No Audio Context Detected')); + } + + // Check for newer promise-based API + if (context.decodeAudioData.length === 1) { + return context.decodeAudioData(soundBuffer); + } else { // eslint-disable-line no-else-return + // Fall back to callback API + return new Promise((resolve, reject) => + context.decodeAudioData(soundBuffer, + buffer => resolve(buffer), + error => reject(error) + ) + ); + } + } + + /** + * Play the given sound. + * @param {AudioBuffer} buffer The decoded audio buffer to play. + * @returns {Promise} A promise that resolves when the sound is done playing, or at once if there is nothing to play. + * @private + */ + _playSound (buffer) { + if (!this.runtime.audioEngine || !buffer) return Promise.resolve(); // callers chain .then() on the result + const context = this.runtime.audioEngine.audioContext; + const bufferSource = context.createBufferSource(); + bufferSource.buffer = buffer; + bufferSource.connect(this.runtime.audioEngine.input); + bufferSource.start(); + return new Promise(resolve => { + bufferSource.onended = () => { + resolve(); + }; + }); + } + + /** + * Scans all the 'When I hear' hat blocks for each sprite and pulls out the text. The list + * is sent off to the speech recognition server as hints. This *only* reads the value out of + * the hat block shadow. If a block is dropped on top of the shadow, it is skipped. + * @returns {Array} list of strings from the hat blocks in the project. + * @private + */ + _scanBlocksForPhraseList () { + const words = []; + // For each target, walk through the top level blocks and check whether + // they are speech hat/when I hear blocks. + this.runtime.targets.forEach(target => { + target.blocks._scripts.forEach(id => { + const b = target.blocks.getBlock(id); + if (b.opcode === 'speech.whenIHearHat') { + // Grab the text from the hat block's shadow.
+ const inputId = b.inputs.PHRASE.block; + const inputBlock = target.blocks.getBlock(inputId); + // Only grab the value from text blocks. This means we'll + // miss some. e.g. values in variables or other reporters. + if (inputBlock.opcode === 'text') { + const word = target.blocks.getBlock(inputId).fields.TEXT.value; + words.push(word); + } + } + }); + }); + return words; + } + + /** + * Resets all things related to listening. Called on Red Stop sign button. + * - suspends audio processing + * - closes socket with speech socket server + * - clears out any remaining speech blocks that are waiting. + * @private + */ + _resetListening () { + this._stopListening(); + this._closeWebsocket(); + this._resolveSpeechPromises(); + } + + /** + * Close the connection to the socket server if it is open. + * @private + */ + _closeWebsocket () { + if (this._socket && this._socket.readyState === this._socket.OPEN) { + this._socket.close(); + } + } + + /** + * Suspends the microphone audio context and disconnects the audio nodes so no more data is captured. + * @private + */ + _stopListening () { + // Note that this can be called before any Listen And Wait block did setup, + // so check that things exist before disconnecting them. + if (this._context) { + this._context.suspend(); // undone by _resumeListening() + } + // This is called on green flag to reset things that may never have existed + // in the first place. Do a bunch of checks. + if (this._scriptNode) { + this._scriptNode.disconnect(); + } + if (this._sourceNode) { + this._sourceNode.disconnect(); + } + } + + /** + * Resolves all the speech promises we've accumulated so far and empties out the list. + * @private + */ + _resolveSpeechPromises () { + for (let i = 0; i < this._speechPromises.length; i++) { + const resFn = this._speechPromises[i]; + resFn(); + } + this._speechPromises = []; + } + + /** + * Called when we want to stop listening (e.g.
when a listen block times out) + * but we still want to wait a little to see if we get any transcription results + * back before yielding the block execution. + * @private + */ + _stopTranscription () { + this._stopListening(); + if (this._socket && this._socket.readyState === this._socket.OPEN) { + this._socket.send('stopTranscription'); + } + // Give it a couple seconds to respond before giving up and assuming nothing else will come back. + this._speechFinalResponseTimeout = setTimeout(this._resetListening, finalResponseTimeoutDurationMs); + } + + /** + * Decides whether to keep a given transcription result. + * @param {number} fuzzyMatchIndex Index of the fuzzy match or -1 if there is no match. + * @param {object} result The json object representing the transcription result. + * @param {string} normalizedTranscript The transcription text used for matching (i.e. lowercased, no punctuation). + * @returns {boolean} true If a result is good enough to be kept. + * @private + */ + _shouldKeepResult (fuzzyMatchIndex, result, normalizedTranscript) { + // The threshold above which we decide transcription results are unlikely to change again. + // See https://cloud.google.com/speech-to-text/docs/basics#streaming_responses. + const stabilityThreshold = .85; + + // For responsiveness of the When I Hear hat blocks, sometimes we want to keep results that are not + // yet marked 'isFinal' by the speech api. Here are some signals we use. + + // If the result from the speech api isn't very stable and we only had a fuzzy match, we don't want to use it. + const shouldKeepFuzzyMatch = fuzzyMatchIndex !== -1 && result.stability > stabilityThreshold; + + // If the result is in the phraseList (i.e. it matches one of the 'When I Hear' blocks), we keep it. + // This might be aggressive... but so far seems to be a good thing.
+ const shouldKeepPhraseListMatch = this._phraseList.includes(normalizedTranscript); + + if (!result.isFinal && !shouldKeepPhraseListMatch && !shouldKeepFuzzyMatch) { + return false; + } + return true; + } + + /** + * Normalizes text a bit to facilitate matching. Lowercases, removes some punctuation and whitespace. + * @param {string} text The text to normalize + * @returns {string} The normalized text. + * @private + */ + _normalizeText (text) { + text = Cast.toString(text).toLowerCase(); + text = text.replace(/[.?!]/g, ''); + text = text.trim(); + return text; + } + + /** + * Call into diff match patch library to compute whether there is a fuzzy match. + * @param {string} text The text to search in. + * @param {string} pattern The pattern to look for in text. + * @returns {number} The index of the match or -1 if there isn't one. + */ + _computeFuzzyMatch (text, pattern) { + // Don't bother matching if any are null. + if (!pattern || !text) { + return -1; + } + let match = -1; + try { + // Look for the pattern in the text, starting near position 0. + match = this._dmp.match_main(text, pattern, 0); + } catch (e) { + // This can happen if the text or pattern gets too long. If so just substring match. NOTE(review): this searches for text inside pattern, the reverse of the documented direction - confirm intended. + return pattern.indexOf(text); + } + return match; + } + + /** + * Processes the results we get back from the speech server. Decides whether the results + * are good enough to keep. If they are, resolves the 'Listen and Wait' blocks promise and cleans up. + * @param {object} result The transcription result. + * @private + */ + _processTranscriptionResult (result) { + log.info(`Got result: ${JSON.stringify(result)}`); + const transcriptionResult = this._normalizeText(result.alternatives[0].transcript); + + // Waiting for an exact match is not satisfying. It makes it hard to catch + // things like homonyms or things that sound similar "let us" vs "lettuce". Using the fuzzy matching helps + // more aggressively match the phrases that are in the "When I hear" hat blocks.
+ const phrases = this._phraseList.join(' '); + const fuzzyMatchIndex = this._computeFuzzyMatch(phrases, transcriptionResult); + + let fuzzyMatchResult = null; + if (fuzzyMatchIndex !== -1) { + fuzzyMatchResult = transcriptionResult.substring(fuzzyMatchIndex, fuzzyMatchIndex + phrases.length); + } + + // If the result isn't good enough yet, return without saving and resolving the promises. + if (!this._shouldKeepResult(fuzzyMatchIndex, result, transcriptionResult)) { + return; + } + + // TODO: Decide whether this is the right thing. + // This sets the currentUtterance (which is returned by the reporter) to the fuzzy match if we had one. + // That means it'll often get set to a phrase from one of the 'when I hear' blocks instead of the + // full phrase that the user said. + if (fuzzyMatchResult) { + this._currentUtterance = fuzzyMatchResult; + } else { + this._currentUtterance = transcriptionResult; + } + log.info(`Keeing result: ${this._currentUtterance}`); + this._utteranceForEdgeTrigger = transcriptionResult; + + // We're done listening so resolve all the promises and reset everything so we're ready for next time. + this._resetListening(); + + // We got results so clear out the timeouts. + if (this._speechTimeoutId) { + clearTimeout(this._speechTimeoutId); + this._speechTimeoutId = null; + } + if (this._speechFinalResponseTimeout) { + clearTimeout(this._speechFinalResponseTimeout); + this._speechFinalResponseTimeout = null; + } + } + + /** + * Handle a message from the socket. It contains transcription results. + * @param {MessageEvent} e The message event containing data from speech server. + * @private + */ + _onTranscriptionFromServer (e) { + let result = null; + try { + result = JSON.parse(e.data); + } catch (ex) { + log.error(`Problem parsing json. continuing: ${ex}`); + // TODO: Question - Should we kill listening and continue? + return; + } + this._processTranscriptionResult(result); + } + + + /** + * Decide whether the pattern given matches the text.
Uses fuzzy matching + * @param {string} pattern The pattern to look for. whenIHearHat passes the phrase from the hat block. + * @param {string} text The text to look in. whenIHearHat passes the latest transcription for edge triggering. + * @returns {boolean} true if there is a fuzzy match. + * @private + */ + _speechMatches (pattern, text) { + pattern = this._normalizeText(pattern); + text = this._normalizeText(text); + const match = this._computeFuzzyMatch(text, pattern); + return match !== -1; + } + + /** + * Kick off the listening process. + * @private + */ + _startListening () { + // If we've already setup the context, we can resume instead of doing all the setup again. + if (this._context) { + this._resumeListening(); + } else { + this._initListening(); + } + // Force the block to timeout if we don't get any results back/the user didn't say anything. + this._speechTimeoutId = setTimeout(this._stopTranscription, listenAndWaitBlockTimeoutMs); + } + + /** + * Resume the audio context suspended by _stopListening and re-open the socket to send data. + * @private + */ + _resumeListening () { + this._context.resume(); + this._newWebsocket(); + } + + /** + * Does all setup to get microphone data and initializes the web socket + * used to send that data to the speech server. + * @private + */ + _initListening () { + this._initializeMicrophone(); + this._initScriptNode(); + this._newWebsocket(); + } + + /** + * Initialize the audio context and connect the microphone.
+ * @private + */ + _initializeMicrophone () { + this._context = new AudioContext(); + this._audioPromise = navigator.mediaDevices.getUserMedia({ + audio: { + echoCancellation: true, + channelCount: 1, + sampleRate: { + ideal: 16000 + }, + sampleSize: 16 + } + }); + + const tempContext = this._context; + this._audioPromise.then(micStream => { + const microphone = tempContext.createMediaStreamSource(micStream); + const analyser = tempContext.createAnalyser(); + microphone.connect(analyser); + }).catch(e => { + log.error(`Problem connecting to microphone: ${e}`); + }); + } + + /** + * Sets up the script processor node and registers the audio processing callback on it. + * @private + * + */ + _initScriptNode () { + // Create a node that sends raw bytes across the websocket + this._scriptNode = this._context.createScriptProcessor(4096, 1, 1); + // The float to 16-bit signed sample conversion happens in _processAudioCallback. + this._scriptNode.addEventListener('audioprocess', this._processAudioCallback); + } + + /** + * Callback called when it is time to setup the new web socket. + * @param {Function} resolve - function to call when the web socket opens successfully. + * @param {Function} reject - function to call if opening the web socket fails. + */ + _newSocketCallback (resolve, reject) { + this._socket = new WebSocket(serverURL); + this._socket.addEventListener('open', resolve); + this._socket.addEventListener('error', reject); + } + + /** + * Callback called once we've initially established the web socket is open and working. + * Sets up the callback for subsequent messages (i.e. transcription results) and + * connects to the script node to get data. + * @private + */ + _socketMessageCallback () { + this._socket.addEventListener('message', this._onTranscriptionFromServer); + this._startByteStream(); + } + + /** + * Sets up callback for when socket and audio are initialized.
+ * @private + */ + _newWebsocket () { + const websocketPromise = new Promise(this._newSocketCallback); + Promise.all([this._audioPromise, websocketPromise]).then( + this._setupSocketCallback) + .catch(e => { + log.error(`Problem with setup: ${e}`); + }); + } + + /** + * Callback to handle initial setting up of a socket. + * Currently we send a setup message (sample rate and phrase list) but might + * be useful to send more data so we can do quota stuff. + * @param {Array} values The resolved values: [microphone stream, socket 'open' event]. + */ + _setupSocketCallback (values) { + this._micStream = values[0]; + this._socket = values[1].target; + + this._socket.addEventListener('error', e => { + log.error(`Error from web socket: ${e}`); + }); + + // Send the initial configuration message. When the server acknowledges + // it, start streaming the audio bytes to the server and listening for + // transcriptions. + this._socket.addEventListener('message', this._socketMessageCallback, {once: true}); + this._socket.send(JSON.stringify( + { + sampleRate: this._context.sampleRate, + phrases: this._phraseList + } + )); + } + + /** + * Do setup so we can start streaming mic data. + * @private + */ + _startByteStream () { + // Hook up the scriptNode to the mic + this._sourceNode = this._context.createMediaStreamSource(this._micStream); + this._sourceNode.connect(this._scriptNode); + this._scriptNode.connect(this._context.destination); + } + + /** + * Called when we have data from the microphone. Takes that data and ships + * it off to the speech server for transcription. Data is dropped unless the socket is open. + * @param {audioProcessingEvent} e The event with audio data in it. + * @private + */ + _processAudioCallback (e) { + // Only an OPEN socket can send: send() throws while CONNECTING and discards data once CLOSING/CLOSED. + if (this._socket.readyState !== WebSocket.OPEN) { + log.error(`Not sending data because not in ready state. State: ${this._socket.readyState}`); + // TODO: should we stop trying and reset state so it might work next time?
+ return; + } + const MAX_INT = Math.pow(2, 16 - 1) - 1; + const floatSamples = e.inputBuffer.getChannelData(0); + // The samples are floats in range [-1, 1]. Convert to 16-bit signed + // integers by scaling with MAX_INT (32767). + this._socket.send(Int16Array.from(floatSamples.map(n => n * MAX_INT))); + } + + /** + * The key to load & store a target's speech-related state. + * @type {string} + */ + static get STATE_KEY () { + return 'Scratch.speech'; + } + + /** + * @returns {object} Metadata for this extension and its blocks. + */ + getInfo () { + return { + id: 'speech', + name: 'Google Speech', + menuIconURI: menuIconURI, + blockIconURI: iconURI, + blocks: [ + { + opcode: 'listenAndWait', + text: 'Listen and Wait', + blockType: BlockType.COMMAND + }, + { + opcode: 'whenIHearHat', + text: 'When I hear [PHRASE]', + blockType: BlockType.HAT, + arguments: { + PHRASE: { + type: ArgumentType.STRING, + defaultValue: 'cat' + } + } + }, + { + opcode: 'getSpeech', + text: 'speech', + blockType: BlockType.REPORTER + } + ] + }; + } + + /** + * Start the listening process if it isn't already in progress, playing a sound to indicate + * when it starts and stops. + * @return {Promise} A promise that will resolve when listening is complete. + */ + listenAndWait () { + // TODO: Look into the timing of when to start the sound. There currently seems + // to be some lag between when the sound starts and when the socket message + // callback is received. Perhaps we should play the sound after the socket is setup. + // TODO: Question - Should we only play the sound if listening isn't already in progress?
+ return this._playSound(this._startSoundBuffer).then(() => { + this._phraseList = this._scanBlocksForPhraseList(); + this._utteranceForEdgeTrigger = ''; + const speechPromise = new Promise(resolve => { + const listeningInProgress = this._speechPromises.length > 0; + this._speechPromises.push(resolve); + if (!listeningInProgress) { + this._startListening(); + } + }); + return speechPromise.then(() => this._playSound(this._endSoundBuffer)); + }); + } + + /** + * An edge triggered hat block to listen for a specific phrase. + * @param {object} args - the block arguments; args.PHRASE is the phrase to listen for. + * @return {boolean} true if the phrase matches what was transcribed. + */ + whenIHearHat (args) { + return this._speechMatches(args.PHRASE, this._utteranceForEdgeTrigger); + } + + /** + * Reporter for the last heard phrase/utterance. + * @return {string} The latest thing we heard from a listen and wait block (null until a result has been kept). + */ + getSpeech () { + return this._currentUtterance; + } +} +module.exports = Scratch3SpeechBlocks; diff --git a/src/extensions/scratch3_speech/manifest.js b/src/extensions/scratch3_speech/manifest.js new file mode 100644 index 000000000..e5ce54d59 --- /dev/null +++ b/src/extensions/scratch3_speech/manifest.js @@ -0,0 +1,4 @@ +module.exports = { + 'speech-rec-start.mp3': require('!buffer-loader!./assets/speech-rec-start.mp3'), + 'speech-rec-end.mp3': require('!buffer-loader!./assets/speech-rec-end.mp3') +};