How to make AVSpeechSynthesizer work for write and delegate (Big Sur)

I am unable to get AVSpeechSynthesizer to write, or to call its delegate methods.

I was informed this was resolved in macOS 11; I am now running macOS 11.4 (Big Sur).

My goal is to output speech faster than real time and drive the output through AVAudioEngine.

First, I need to know why the write does not occur and why the delegate methods are never called, whether I am using write or simply speaking through the default speakers in "func speak(_ string: String)".

What am I missing?

Is there a workaround?

Reference: https://developer.apple.com/forums/thread/678287

let sentenceToSpeak = "This should write to buffer and also call 'didFinish' and 'willSpeakRangeOfSpeechString' delegates."
SpeakerTest().writeToBuffer(sentenceToSpeak)
SpeakerTest().speak(sentenceToSpeak)


class SpeakerTest: NSObject, AVSpeechSynthesizerDelegate {
    let synth = AVSpeechSynthesizer()

    override init() {
        super.init()
        synth.delegate = self
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        print("Utterance didFinish")
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer,
                           willSpeakRangeOfSpeechString characterRange: NSRange,
                           utterance: AVSpeechUtterance)
    {
            print("speaking range: \(characterRange)")
    }

    func speak(_ string: String) {
        let utterance   = AVSpeechUtterance(string: string)
        var usedVoice   = AVSpeechSynthesisVoice(language: "en") // should be the default voice
        let voices      = AVSpeechSynthesisVoice.speechVoices()
        let targetVoice = "Allison"
        for voice in voices {
//            print("\(voice.identifier) \(voice.name) \(voice.quality) \(voice.language)")
            if (voice.name.lowercased() == targetVoice.lowercased())
            {
                usedVoice = AVSpeechSynthesisVoice(identifier: voice.identifier)
                break
            }
        }
        utterance.voice = usedVoice
        print("utterance.voice: \(utterance.voice)")
        synth.speak(utterance)
    }

    func writeToBuffer(_ string: String)
    {
        print("entering writeToBuffer")
        let utterance = AVSpeechUtterance(string: string)
        synth.write(utterance) { (buffer: AVAudioBuffer) in
            print("executing synth.write")
            guard let pcmBuffer = buffer as? AVAudioPCMBuffer else {
                fatalError("unknown buffer type: \(buffer)")
            }
            if pcmBuffer.frameLength == 0 {
                print("buffer is empty")
            } else {
                print("buffer has content \(buffer)")
            }
        }
    }
}

Accepted Reply

It looks like your synthesizer might be going out of scope. You create a new SpeakerTest object and call writeToBuffer, and then your program ends (at least from this snippet). While we'll kick off the speech and finish that utterance, the synthesizer needs to stay in memory for your delegate callbacks to work. If your SpeakerTest object is being deallocated, the synthesizer it creates will be deallocated along with it. Make sure your SpeakerTest object remains in memory until speech has completed.
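
For example, in a command-line snippet like the one above, keeping a strong reference and running the run loop until speech completes is enough. This is a minimal sketch; the ten-second timeout is arbitrary and only for illustration.

import AVFoundation
import Foundation

// Hold a strong reference so SpeakerTest (and the synthesizer it
// owns) is not deallocated before the callbacks fire.
let speaker = SpeakerTest()
let sentenceToSpeak = "This should write to buffer and also call 'didFinish' and 'willSpeakRangeOfSpeechString' delegates."
speaker.writeToBuffer(sentenceToSpeak)
speaker.speak(sentenceToSpeak)

// Keep the process alive so the asynchronous write callback and
// the delegate methods have a chance to run.
RunLoop.main.run(until: Date(timeIntervalSinceNow: 10))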

Replies

You are 100% correct.

The spoken utterance completed, but nothing else worked.

I mistakenly assumed the process would remain in memory until speech completed and the associated delegate callbacks fired.

For those who may need a working solution, here it is.

class SpeakerTest: NSObject, AVSpeechSynthesizerDelegate {
    let synth = AVSpeechSynthesizer()

    override init() {
        super.init()
        synth.delegate = self
    }

    func isSandboxEnvironment() -> Bool
    {
        let environ = ProcessInfo.processInfo.environment
        return ( environ["APP_SANDBOX_CONTAINER_ID"] != nil )
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        print("Utterance didFinish")
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer,
                           willSpeakRangeOfSpeechString characterRange: NSRange,
                           utterance: AVSpeechUtterance)
    {
        print("speaking range: \(characterRange)")
    }

    func selectVoice(targetSpeaker: String, defLangCode: String) -> AVSpeechSynthesisVoice
    {
        var usedVoice   = AVSpeechSynthesisVoice(language: defLangCode) // should be the default voice
        let userCode    = AVSpeechSynthesisVoice.currentLanguageCode()
        let voices      = AVSpeechSynthesisVoice.speechVoices()
        for voice in voices {
            // print("\(voice.identifier) \(voice.name) \(voice.quality) \(voice.language)")
            if (voice.name.lowercased() == targetSpeaker.lowercased())
            {
                usedVoice = AVSpeechSynthesisVoice(identifier: voice.identifier)
                break
            }
        }
        // Fall back to the user's current language if neither the target
        // speaker nor the default language code produced a voice.
        if usedVoice == nil { usedVoice = AVSpeechSynthesisVoice(language: userCode) }
        return usedVoice!
    }

    func speak(_ string: String, speaker: String) {
        let utterance = AVSpeechUtterance(string: string)
        utterance.voice = selectVoice(targetSpeaker: speaker, defLangCode: "en-US")
        synth.speak(utterance)
    }

    func writeToBuffer(_ stringToSpeak: String, speaker: String)
    {
        print("entering writeToBuffer")
        let utterance   = AVSpeechUtterance(string: stringToSpeak)
        utterance.voice = selectVoice(targetSpeaker: speaker, defLangCode: "en-US")

        synth.write(utterance) { (buffer: AVAudioBuffer) in
            print("executing synth.write")
            guard let pcmBuffer = buffer as? AVAudioPCMBuffer else {
                fatalError("unknown buffer type: \(buffer)")
            }
            if ( pcmBuffer.frameLength == 0 ) {
                print("buffer is empty")
            } else {
                print("buffer has content \(buffer)")
            }
        }
    }

    func writeToFile(_ stringToSpeak: String, speaker: String)
    {
        let utterance = AVSpeechUtterance(string: stringToSpeak)
        var output    : AVAudioFile?
        let desktop   = "~/Desktop"
        let fileName  = "Utterance_Test.caf" // not in sandbox
        var tempPath  = desktop + "/" + fileName
        tempPath      = (tempPath as NSString).expandingTildeInPath
        // if sandboxed, it goes in the container
        if ( isSandboxEnvironment() ) { tempPath = "Utterance_Test.caf" }

        utterance.voice = selectVoice(targetSpeaker: speaker, defLangCode: "en-US")

        synth.write(utterance) { (buffer: AVAudioBuffer) in
            guard let pcmBuffer = buffer as? AVAudioPCMBuffer else {
                fatalError("unknown buffer type: \(buffer)")
            }
            if ( pcmBuffer.frameLength == 0 ) {
                // done
            } else {
                // append buffer to file
                if ( output == nil ) {
                    let bufferSettings = utterance.voice?.audioFileSettings
                    output = try! AVAudioFile(forWriting: URL(fileURLWithPath: tempPath), settings: bufferSettings!)
                }
                try! output?.write(from: pcmBuffer)
            }
        }
    }
}

class ViewController: NSViewController {
    let speechDelivery = SpeakerTest()

    override func viewDidLoad() {
        super.viewDidLoad()

        let targetSpeaker   = "Allison"
        var sentenceToSpeak = "This writes to buffer and disk."
        sentenceToSpeak    += " Also, 'didFinish' and 'willSpeakRangeOfSpeechString' delegates fire."

        speechDelivery.writeToBuffer(sentenceToSpeak, speaker: targetSpeaker)
        speechDelivery.speak(sentenceToSpeak, speaker: targetSpeaker)
        speechDelivery.writeToFile(sentenceToSpeak, speaker: targetSpeaker)
    }

    override var representedObject: Any? {
        didSet {
            // Update the view, if already loaded.
        }
    }
}
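
To reach the original goal of pushing this output through AVAudioEngine faster than real time, the non-empty buffers from writeToBuffer can be collected into an array and scheduled on a player node behind an AVAudioUnitTimePitch. The following is only a sketch: BufferPlayer and its play method are illustrative names, not part of the accepted answer, and it assumes you have accumulated the AVAudioPCMBuffers yourself. As with SpeakerTest, the instance must stay in memory while audio renders.

import AVFoundation

final class BufferPlayer {
    private let engine    = AVAudioEngine()
    private let player    = AVAudioPlayerNode()
    private let timePitch = AVAudioUnitTimePitch()

    init() {
        engine.attach(player)
        engine.attach(timePitch)
    }

    // Schedule the synthesized buffers and play them at the given rate.
    // A rate above 1.0 plays faster than real time without changing pitch.
    func play(_ buffers: [AVAudioPCMBuffer], rate: Float = 2.0) throws {
        guard let format = buffers.first?.format else { return }
        timePitch.rate = rate
        engine.connect(player, to: timePitch, format: format)
        engine.connect(timePitch, to: engine.mainMixerNode, format: format)
        try engine.start()
        for buffer in buffers { player.scheduleBuffer(buffer) }
        player.play()
    }
}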

Why does exporting to WAV produce a file that won't play, while CAF format works fine?