I'm building a game where the player is able to speak commands, so I want to enable speech-to-text capability. I've setup the required info.plist property (for speech recognition privacy) as well as the App Sandbox hardware setting (for audio input). I've confirmed that the application is listening via the audio tap and sending audio buffers to the recognition request. However, the recognition task never executes.
NOTE: This is for MacOS, NOT iOS. Also, it works when I have this in a Playground, but when I try to do this in an actual application, the recognition task isn't called.
Specs:
- MacOS: 12.1
- XCode: 13.2.1 (13C100)
- Swift: 5.5.2
Here is the code that I've placed in the AppDelegate
of a freshly built SpriteKit application:
//
// AppDelegate.swift
//
import Cocoa
import AVFoundation
import Speech
@main
class AppDelegate: NSObject, NSApplicationDelegate {
private let speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!
private let audioEngine = AVAudioEngine()
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
func applicationDidFinishLaunching(_ aNotification: Notification) {
SFSpeechRecognizer.requestAuthorization(requestMicrophoneAccess)
}
func applicationWillTerminate(_ aNotification: Notification) {
// Insert code here to tear down your application
}
func applicationShouldTerminateAfterLastWindowClosed(_ sender: NSApplication) -> Bool {
return true
}
fileprivate func requestMicrophoneAccess(authStatus: SFSpeechRecognizerAuthorizationStatus) {
OperationQueue.main.addOperation {
switch authStatus {
case .authorized:
self.speechRecognizer.supportsOnDeviceRecognition = true
if let speechRecognizer = SFSpeechRecognizer() {
if speechRecognizer.isAvailable {
do {
try self.startListening()
} catch {
print(">>> ERROR >>> Listening Error: \(error)")
}
}
}
case .denied:
print("Denied")
case .restricted:
print("Restricted")
case .notDetermined:
print("Undetermined")
default:
print("Unknown")
}
}
}
func startListening() throws {
// Cancel the previous task if it's running.
recognitionTask?.cancel()
recognitionTask = nil
let inputNode = audioEngine.inputNode
// Configure the microphone input.
let recordingFormat = inputNode.outputFormat(forBus: 0)
inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) {(buffer: AVAudioPCMBuffer, when: AVAudioTime) in
/**********
* Confirmed that the following line is executing continuously
**********/
self.recognitionRequest?.append(buffer)
}
startRecognizing()
audioEngine.prepare()
try audioEngine.start()
}
func startRecognizing() {
// Create a recognition task for the speech recognition session.
recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
guard let recognitionRequestInternal = recognitionRequest else { fatalError("Unable to create a SFSpeechAudioBufferRecognitionRequest object") }
recognitionRequestInternal.shouldReportPartialResults = true
recognitionRequestInternal.requiresOnDeviceRecognition = true
/**************
* Confirmed that the following line is executed,
* however the function given to 'recognitionTask' is never called
**************/
recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequestInternal) { result, error in
var isFinal = false
if result != nil {
let firstTranscriptionTimestamp = result!.transcriptions.first?.segments.first?.timestamp ?? TimeInterval.zero
isFinal = result!.isFinal || (firstTranscriptionTimestamp != 0)
}
if error != nil {
// Stop recognizing speech if there is a problem.
print("\n>>> ERROR >>> Recognition Error: \(error)")
self.audioEngine.stop()
self.audioEngine.inputNode.removeTap(onBus: 0)
self.recognitionRequest = nil
self.recognitionTask = nil
} else if isFinal {
self.recognitionTask = nil
}
}
}
}