using System;
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
using UnityEngine;


/// <summary>
/// This interface needs to be implemented by all speech-recognition listeners.
/// </summary>
public interface SpeechRecognitionInterface
{
    /// <summary>
    /// Invoked when a speech phrase gets recognized.
    /// </summary>
    /// <returns><c>true</c>, if the recognized phrase has to be cleared, <c>false</c> otherwise.</returns>
    /// <param name="phraseTag">The phrase tag.</param>
    /// <param name="condidence">Confidence of the recognition (0-1). The parameter keeps its historical spelling, so the interface signature stays unchanged.</param>
    bool SpeechPhraseRecognized(string phraseTag, float condidence);
}

/// <summary>
/// Speech manager is the component that manages the Kinect speech recognition.
/// </summary>
public class SpeechManager : MonoBehaviour
{
    [Tooltip("File name of the grammar file, used by the speech recognizer. The file will be copied from Resources, if it does not exist.")]
    public string grammarFileName = "SpeechGrammar.grxml";

    [Tooltip("Whether the grammar is dynamic or static. Dynamic grammars allow adding phrases at run-time.")]
    public bool dynamicGrammar = false;

    [Tooltip("Code of the language, used by the speech recognizer. Default is English (1033).")]
    public int languageCode = 1033;

    [Tooltip("Minimum confidence required, to consider a phrase as recognized. Confidence varies between 0.0 and 1.0.")]
    public float requiredConfidence = 0f;

    [Tooltip("List of the speech recognition listeners in the scene. If the list is empty, the available speech listeners will be detected at the scene start up.")]
    public List<MonoBehaviour> speechRecognitionListeners;

    [Tooltip("GUI-Text to display the speech-manager debug messages.")]
    public GUIText debugText;

    // Is currently listening
    private bool isListening;

    // Current phrase recognized
    private bool isPhraseRecognized;
    private string phraseTagRecognized;
    private float phraseConfidence;

    // primary sensor data structure
    private KinectInterop.SensorData sensorData = null;

    // Bool to keep track of whether Kinect and SAPI have been initialized
    private bool sapiInitialized = false;

    // The single instance of SpeechManager
    private static SpeechManager instance;


    /// <summary>
    /// Gets the single SpeechManager instance. It is assigned in Awake() and released in OnDestroy().
    /// </summary>
    /// <value>The SpeechManager instance, or <c>null</c> if none is alive.</value>
    public static SpeechManager Instance
    {
        get
        {
            return instance;
        }
    }

    /// <summary>
    /// Determines whether SAPI (Speech API) was successfully initialized.
    /// </summary>
    /// <returns><c>true</c> if SAPI was successfully initialized; otherwise, <c>false</c>.</returns>
    public bool IsSapiInitialized()
    {
        return sapiInitialized;
    }

    /// <summary>
    /// Adds a phrase to the from-rule of dynamic grammar. If the to-rule is empty, this means end of the phrase recognition.
    /// </summary>
    /// <returns><c>true</c> if the phrase was successfully added to the grammar; otherwise, <c>false</c>
    /// (also when the speech recognizer is not initialized).</returns>
    /// <param name="fromRule">From-rule name.</param>
    /// <param name="toRule">To-rule name or empty string.</param>
    /// <param name="phrase">The dynamic phrase.</param>
    /// <param name="bClearRulePhrases">If set to <c>true</c> clears current rule phrases before adding this one.</param>
    /// <param name="bCommitGrammar">If set to <c>true</c> commits dynamic grammar changes.</param>
    public bool AddGrammarPhrase(string fromRule, string toRule, string phrase, bool bClearRulePhrases, bool bCommitGrammar)
    {
        if (!sapiInitialized || sensorData == null || sensorData.sensorInterface == null)
        {
            return false;
        }

        // the sensor interface reports success as 0
        int hr = sensorData.sensorInterface.AddGrammarPhrase(fromRule, toRule, phrase, bClearRulePhrases, bCommitGrammar);
        return (hr == 0);
    }

    /// <summary>
    /// Determines whether the speech recognizer is in listening-state.
    /// </summary>
    /// <returns><c>true</c> if the speech recognizer is in listening-state; otherwise, <c>false</c>.</returns>
    public bool IsListening()
    {
        return isListening;
    }

    /// <summary>
    /// Determines whether the speech recognizer has recognized a phrase.
    /// </summary>
    /// <returns><c>true</c> if the speech recognizer has recognized a phrase; otherwise, <c>false</c>.</returns>
    public bool IsPhraseRecognized()
    {
        return isPhraseRecognized;
    }

    /// <summary>
    /// Gets the confidence of the currently recognized phrase, in range [0, 1].
    /// </summary>
    /// <returns>The phrase confidence.</returns>
    public float GetPhraseConfidence()
    {
        return phraseConfidence;
    }

    /// <summary>
    /// Gets the tag of the recognized phrase.
    /// </summary>
    /// <returns>The tag of the recognized phrase.</returns>
    public string GetPhraseTagRecognized()
    {
        return phraseTagRecognized;
    }

    /// <summary>
    /// Clears the recognized phrase.
    /// </summary>
    public void ClearPhraseRecognized()
    {
        isPhraseRecognized = false;
        phraseTagRecognized = String.Empty;
        phraseConfidence = 0f;
    }


    /// <summary>
    /// Gets the speech recognition data as a csv line.
    /// The line is "sr", followed by 1, the phrase tag and the confidence (3 decimals) when a phrase
    /// is recognized, or by 0 otherwise. There is no trailing delimiter.
    /// </summary>
    /// <returns>The csv line, or an empty string if the speech recognizer is not initialized.</returns>
    /// <param name="delimiter">The delimiter placed between the csv fields.</param>
    public string GetSpeechDataAsCsv(char delimiter)
    {
        if (!sapiInitialized)
        {
            return string.Empty;
        }

        StringBuilder sbBuf = new StringBuilder();
        sbBuf.Append("sr").Append(delimiter);

        if (isPhraseRecognized)
        {
            sbBuf.Append(1).Append(delimiter);
            sbBuf.Append(phraseTagRecognized).Append(delimiter);

            // invariant culture: a culture-specific decimal comma would otherwise collide with a ',' delimiter
            sbBuf.Append(phraseConfidence.ToString("F3", CultureInfo.InvariantCulture));

            //Debug.Log(phraseTagRecognized + ", confidence: " + phraseConfidence);
        }
        else
        {
            sbBuf.Append(0);
        }

        return sbBuf.ToString();
    }

    /// <summary>
    /// Sets the speech recognition data from a csv line, as produced by GetSpeechDataAsCsv().
    /// A line that reports no recognized phrase leaves the current phrase state as it is;
    /// use ClearPhraseRecognized() to reset it.
    /// </summary>
    /// <returns><c>true</c> if the line is a speech-recognition ("sr") line with at least the
    /// recognized-flag field; otherwise, <c>false</c>.</returns>
    /// <param name="sCsvLine">The csv line. Null or empty lines are rejected.</param>
    /// <param name="delimiters">The delimiters used to split the csv line.</param>
    public bool SetSpeechDataFromCsv(string sCsvLine, char[] delimiters)
    {
        if (string.IsNullOrEmpty(sCsvLine))
        {
            return false;
        }

        // split the csv line in parts: "sr", recognized-flag [, phrase tag, confidence]
        string[] alCsvParts = sCsvLine.Split(delimiters);

        if (alCsvParts.Length < 2 || alCsvParts[0] != "sr")
        {
            return false;
        }

        // whether there is recognized phrase or not
        int phraseRecognized = 0;
        int.TryParse(alCsvParts[1], out phraseRecognized);

        if (phraseRecognized != 0 && alCsvParts.Length >= 4)
        {
            // get the recognized phrase
            isPhraseRecognized = true;
            phraseTagRecognized = alCsvParts[2];
            phraseConfidence = ParseConfidence(alCsvParts[3]);
        }

        return true;
    }


    //----------------------------------- end of public functions --------------------------------------//


    // Parses a confidence value from csv. Tries the invariant culture first (the format written by
    // GetSpeechDataAsCsv) and falls back to the current culture, to keep reading lines written with
    // culture-specific formatting. Returns 0 if the text cannot be parsed.
    private static float ParseConfidence(string sValue)
    {
        float fValue;
        if (float.TryParse(sValue, NumberStyles.Float, CultureInfo.InvariantCulture, out fValue))
        {
            return fValue;
        }

        // TryParse sets the result to 0 on failure
        float.TryParse(sValue, out fValue);
        return fValue;
    }


    void Awake()
    {
        instance = this;
    }


    // Initializes the speech recognizer: gets the sensor data from KinectManager, checks the availability
    // of speech recognition, initializes SAPI, loads the grammar and detects the speech listeners.
    // All errors are logged and shown in debugText (if set); sapiInitialized stays false in this case.
    void Start()
    {
        try
        {
            // get sensor data
            KinectManager kinectManager = KinectManager.Instance;
            if (kinectManager && kinectManager.IsInitialized())
            {
                sensorData = kinectManager.GetSensorData();
            }

            if (sensorData == null || sensorData.sensorInterface == null)
            {
                throw new Exception("Speech recognition cannot be started, because KinectManager is missing or not initialized.");
            }

            if (debugText != null)
            {
                debugText.text = "Please, wait...";
            }

            // ensure the needed dlls are in place and speech recognition is available for this interface
            bool bNeedRestart = false;
            if (sensorData.sensorInterface.IsSpeechRecognitionAvailable(ref bNeedRestart))
            {
                if (bNeedRestart)
                {
                    KinectInterop.RestartLevel(gameObject, "SM");
                    return;
                }
            }
            else
            {
                string sInterfaceName = sensorData.sensorInterface.GetType().Name;
                throw new Exception(sInterfaceName + ": Speech recognition is not supported!");
            }

            // Initialize the speech recognizer
            string sCriteria = String.Format("Language={0:X};Kinect=True", languageCode);
            int rc = sensorData.sensorInterface.InitSpeechRecognition(sCriteria, true, false);
            if (rc < 0)
            {
                // plain concatenation - the SAPI message must not be interpreted as a format string
                string sErrorMessage = (new SpeechErrorHandler()).GetSapiErrorMessage(rc);
                throw new Exception("Error initializing Kinect/SAPI: " + sErrorMessage);
            }

            if (requiredConfidence > 0)
            {
                sensorData.sensorInterface.SetSpeechConfidence(requiredConfidence);
            }

            if (!string.IsNullOrEmpty(grammarFileName))
            {
                // copy the grammar file from Resources
                CopyGrammarFileFromResources();

                // load the grammar file
                rc = sensorData.sensorInterface.LoadSpeechGrammar(grammarFileName, (short)languageCode, dynamicGrammar);
                if (rc < 0)
                {
                    string sErrorMessage = (new SpeechErrorHandler()).GetSapiErrorMessage(rc);
                    throw new Exception("Error loading grammar file " + grammarFileName + ": " + sErrorMessage);
                }

                // // test dynamic grammar phrases
                // AddGrammarPhrase("addressBook", string.Empty, "Nancy Anderson", true, false);
                // AddGrammarPhrase("addressBook", string.Empty, "Cindy White", false, false);
                // AddGrammarPhrase("addressBook", string.Empty, "Oliver Lee", false, false);
                // AddGrammarPhrase("addressBook", string.Empty, "Alan Brewer", false, false);
                // AddGrammarPhrase("addressBook", string.Empty, "April Reagan", false, true);
            }

            sapiInitialized = true;

            //DontDestroyOnLoad(gameObject);

            if (debugText != null)
            {
                debugText.text = "Speech recognizer is ready.";
            }

            // try to automatically detect the available speech recognition listeners in the scene
            DetectSpeechListeners();
        }
        catch (DllNotFoundException ex)
        {
            Debug.LogError(ex.ToString());
            if (debugText != null)
            {
                debugText.text = "Please check the Kinect and SAPI installations.";
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.ToString());
            if (debugText != null)
            {
                debugText.text = ex.Message;
            }
        }
    }

    // Writes the grammar text-resource to the file named grammarFileName.
    // NOTE: the File.Exists() check is disabled in the original code, so an existing file is always overwritten.
    // Throws an exception, if the resource cannot be found or the file cannot be written.
    private void CopyGrammarFileFromResources()
    {
        TextAsset textRes = Resources.Load(grammarFileName, typeof(TextAsset)) as TextAsset;
        if (textRes == null)
        {
            throw new Exception("Couldn't find grammar resource: " + grammarFileName + ".txt");
        }

        string sResText = textRes.text;

#if !NETFX_CORE
        File.WriteAllText(grammarFileName, sResText);
#else
        System.Threading.Tasks.Task task = CopyGrammarFileToStorageAsync(grammarFileName, sResText);
        if (task == null)
        {
            throw new Exception("Could not create task for CopyGrammarFileToStorageAsync()");
        }

        // block until the copy finishes; a faulted task counts as completed, too
        while (!task.IsCompleted)
        {
            task.Wait(100);
        }

        if (task.IsFaulted)
        {
            throw task.Exception;
        }
#endif
    }

    // Fills speechRecognitionListeners with the enabled scripts in the scene that implement
    // SpeechRecognitionInterface, if no listeners were set explicitly.
    private void DetectSpeechListeners()
    {
        // the list is null, when the component is added from code instead of the inspector
        if (speechRecognitionListeners == null)
        {
            speechRecognitionListeners = new List<MonoBehaviour>();
        }

        if (speechRecognitionListeners.Count != 0)
        {
            return;
        }

        MonoBehaviour[] monoScripts = FindObjectsOfType(typeof(MonoBehaviour)) as MonoBehaviour[];
        if (monoScripts == null)
        {
            return;
        }

        foreach (MonoBehaviour monoScript in monoScripts)
        {
            if ((monoScript is SpeechRecognitionInterface) && monoScript.enabled)
            {
                speechRecognitionListeners.Add(monoScript);
            }
        }
    }

#if NETFX_CORE
    // Writes the grammar content to a file in the local application-data folder, replacing an existing file.
    private async System.Threading.Tasks.Task CopyGrammarFileToStorageAsync(string grammarFileName, string grammarContent)
    {
        Windows.Storage.StorageFolder storageFolder = Windows.Storage.ApplicationData.Current.LocalFolder;
        Windows.Storage.StorageFile grammarFile = await storageFolder.CreateFileAsync(grammarFileName,
            Windows.Storage.CreationCollisionOption.ReplaceExisting);

        await Windows.Storage.FileIO.WriteTextAsync(grammarFile, grammarContent);
    }
#endif

    void OnDestroy()
    {
        if (sapiInitialized && sensorData != null && sensorData.sensorInterface != null)
        {
            // finish speech recognition
            sensorData.sensorInterface.FinishSpeechRecognition();
        }

        sapiInitialized = false;

        // release the singleton only if it is this component - destroying
        // a duplicate manager must not reset the reference to the live one
        if (instance == this)
        {
            instance = null;
        }
    }

    // Polls the speech recognizer once per frame, updates the listening state and the recognized
    // phrase, and notifies the speech listeners about each newly recognized phrase.
    void Update()
    {
        if (!sapiInitialized || sensorData == null || sensorData.sensorInterface == null)
        {
            return;
        }

        // update the speech recognizer
        int rc = sensorData.sensorInterface.UpdateSpeechRecognition();
        if (rc < 0)
        {
            return;
        }

        // estimate the listening state
        if (sensorData.sensorInterface.IsSpeechStarted())
        {
            isListening = true;
        }
        else if (sensorData.sensorInterface.IsSpeechEnded())
        {
            isListening = false;
        }

        // check if a grammar phrase has been recognized
        if (!sensorData.sensorInterface.IsPhraseRecognized())
        {
            return;
        }

        isPhraseRecognized = true;
        phraseConfidence = sensorData.sensorInterface.GetPhraseConfidence();

        phraseTagRecognized = sensorData.sensorInterface.GetRecognizedPhraseTag();
        sensorData.sensorInterface.ClearRecognizedPhrase();

        //Debug.Log(phraseTagRecognized);
        if (debugText != null)
        {
            debugText.text = string.Format("{0} ({1:F1}%)", phraseTagRecognized, phraseConfidence * 100f);
        }

        // invoke SpeechPhraseRecognized() of the available speech listeners
        if (NotifySpeechListeners())
        {
            ClearPhraseRecognized();
        }
    }

    // Invokes SpeechPhraseRecognized() of all valid listeners with the current phrase.
    // Returns true, if at least one listener requested the phrase to be cleared.
    private bool NotifySpeechListeners()
    {
        if (speechRecognitionListeners == null)
        {
            return false;
        }

        bool bClearPhrase = false;
        foreach (MonoBehaviour listenerScript in speechRecognitionListeners)
        {
            // skip the empty or destroyed entries
            if (listenerScript == null)
            {
                continue;
            }

            // skip the scripts that were put in the list, but do not implement the listener interface
            SpeechRecognitionInterface listener = listenerScript as SpeechRecognitionInterface;
            if (listener == null)
            {
                continue;
            }

            if (listener.SpeechPhraseRecognized(phraseTagRecognized, phraseConfidence))
            {
                bClearPhrase = true;
            }
        }

        return bClearPhrase;
    }

}