I am writing some automation tools for transcribing and indexing a series of audio files (i.e. podcasts). The Microsoft Speech SDK 5.1
is great for speech-enabling desktop applications and creating
automated phone systems and I thought I’d try to use it to analyze the
podcasts. Here’s the code needed:
/// <summary>
/// Opens an audio file as the input of an in-process recognizer, subscribes
/// <c>recoContext_Recognition</c> to the context's Recognition event and
/// activates dictation so recognized phrases are delivered to that handler.
/// </summary>
/// <param name="inputFilename">Path of the audio file to open for reading.</param>
void Transcribe (string inputFilename) {
    SpInprocRecognizer recognizer = new SpInprocRecognizer();

    // Declared here: the snippet previously assigned to a "fileStream" that
    // was never declared anywhere in the listing.
    SpFileStreamClass fileStream = new SpFileStreamClass();
    fileStream.Open(inputFilename, SpeechStreamFileMode.SSFMOpenForRead, true);
    recognizer.AudioInputStream = fileStream;

    // NOTE(review): the recognizer is in-process but the context is cast to
    // SpSharedRecoContext; SpInProcRecoContext may be the intended type - confirm.
    SpSharedRecoContext recoContext = (SpSharedRecoContext) recognizer.CreateRecoContext();
    recoContext.Recognition +=
        new _ISpeechRecoContextEvents_RecognitionEventHandler(recoContext_Recognition);

    // The grammar must come from the context created above; the snippet
    // previously referenced an undefined "objRecoContext".
    SpeechLib.ISpeechRecoGrammar grammar = recoContext.CreateGrammar(0);
    grammar.DictationSetState(SpeechRuleState.SGDSActive);
}
/// <summary>
/// Handler for the recognition context's Recognition event; extracts the
/// text of the recognized phrase from the result.
/// </summary>
/// <param name="streamNumber">Stream number supplied by the event (unused here).</param>
/// <param name="streamPosition">Stream position supplied by the event (unused here).</param>
/// <param name="recognitionType">Recognition type supplied by the event (unused here).</param>
/// <param name="result">Recognition result whose phrase text is read.</param>
void recoContext_Recognition(int streamNumber, object streamPosition,
    SpeechRecognitionType recognitionType, ISpeechRecoResult result) {
    // Arguments for GetText, named for readability; the values are unchanged.
    const int startElement = 0;
    const int elementCount = -1;
    const bool useReplacements = true;

    string transcript = result.PhraseInfo.GetText(startElement, elementCount, useReplacements);
    // The text is now available in "transcript" for indexing or storage.
}
The only problem is that the recognition is … not that good. One way
to improve recognition accuracy is to add a discrete set of grammar rules. My
current goal is to make our database of audio files searchable, so I am
looking to index only a few hundred words. Limiting the vocabulary in
this way drastically improves recognition accuracy. Here is the code:
/// <summary>
/// Opens an audio file as the input of an in-process recognizer, subscribes
/// <c>recoContext_Recognition</c> to the context's Recognition event and
/// activates a command grammar restricted to a fixed list of words, so only
/// those words are recognized.
/// </summary>
/// <param name="inputFilename">Path of the audio file to open for reading.</param>
void Transcribe (string inputFilename) {
    SpInprocRecognizer recognizer = new SpInprocRecognizer();

    // Declared here: the snippet previously assigned to a "fileStream" that
    // was never declared anywhere in the listing.
    SpFileStreamClass fileStream = new SpFileStreamClass();
    fileStream.Open(inputFilename, SpeechStreamFileMode.SSFMOpenForRead, true);
    recognizer.AudioInputStream = fileStream;

    // NOTE(review): the recognizer is in-process but the context is cast to
    // SpSharedRecoContext; SpInProcRecoContext may be the intended type - confirm.
    SpSharedRecoContext recoContext = (SpSharedRecoContext) recognizer.CreateRecoContext();
    recoContext.Recognition +=
        new _ISpeechRecoContextEvents_RecognitionEventHandler(recoContext_Recognition);

    // The grammar must come from the context created above; the snippet
    // previously referenced an undefined "objRecoContext".
    SpeechLib.ISpeechRecoGrammar grammar = recoContext.CreateGrammar(0);
    ISpeechGrammarRule simpleRules = grammar.Rules.Add(
        "simpleRules",
        SpeechRuleAttributes.SRATopLevel | SpeechRuleAttributes.SRADynamic,
        1);

    // Words to recognize. Each becomes a transition out of the rule's initial
    // state, tagged with the word as its property name and a 1-based id.
    string[] vocabulary = { "Genesis", "Adam", "Eve", "Moses", "Exodus" };
    object propValue = string.Empty;
    for (int i = 0; i < vocabulary.Length; i++) {
        string word = vocabulary[i];
        // The separator literal was split across two lines with typographic
        // quotes in the original listing; a single space is assumed here.
        simpleRules.InitialState.AddWordTransition(
            null, word, " ", SpeechGrammarWordType.SGLexical,
            word, i + 1, ref propValue, 1.0F);
    }

    grammar.Rules.Commit();
    grammar.CmdSetRuleState("simpleRules", SpeechRuleState.SGDSActive);
}
/// <summary>
/// Handler for the recognition context's Recognition event; extracts the
/// text of the recognized phrase from the result.
/// </summary>
/// <param name="streamNumber">Stream number supplied by the event (unused here).</param>
/// <param name="streamPosition">Stream position supplied by the event (unused here).</param>
/// <param name="recognitionType">Recognition type supplied by the event (unused here).</param>
/// <param name="result">Recognition result whose phrase text is read.</param>
void recoContext_Recognition(int streamNumber, object streamPosition,
    SpeechRecognitionType recognitionType, ISpeechRecoResult result) {
    // Arguments for GetText, named for readability; the values are unchanged.
    const int startElement = 0;
    const int elementCount = -1;
    const bool useReplacements = true;

    string transcript = result.PhraseInfo.GetText(startElement, elementCount, useReplacements);
    // The text is now available in "transcript" for indexing or storage.
}
Hope that helps someone!
Thanks, it helped me!