Is the format description AVSpeechSynthesizer reports for the speech buffer correct? When I attempt to convert the buffer, I get back noise from two different conversion methods.
I am trying to convert the speech buffer delivered by AVSpeechSynthesizer's "func write(_ utterance: AVSpeechUtterance..." method. The goal is to convert the sample type, change the sample rate, and change the buffer from mono to stereo. I later manipulate the buffer data and pass it through AVAudioEngine. For testing purposes, I have kept the sample rate at the original 22050.0 Hz.
What have I tried? I have a method named "resampleBuffer" that I've been using for years to do exactly this. When I apply it to the speech buffer, I get back noise. When I instead try to manually convert the sample type and go from mono to stereo with "convertSpeechBufferToFloatStereo", I get back clipped output. I also tested byte-swapping the samples to account for the big-endian, signed-integer layout, but that didn't help.
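Since "convertSpeechBufferToFloatStereo" isn't included in the listing below, here is a minimal sketch of the kind of manual conversion I mean (hypothetical code, not the exact method: it assumes big-endian Int32 samples, normalizes by Int32.max, and duplicates the mono channel into both stereo channels):

import AVFoundation

// Sketch only: interpret the mono source as big-endian Int32, normalize to
// Float32 in [-1, 1], and duplicate the single channel into a stereo buffer.
func manualInt32BEToFloatStereo(_ inBuffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
    let frames = Int(inBuffer.frameLength)
    guard frames > 0,
          let outFormat = AVAudioFormat(standardFormatWithSampleRate: inBuffer.format.sampleRate,
                                        channels: 2),
          let outBuffer = AVAudioPCMBuffer(pcmFormat: outFormat,
                                           frameCapacity: AVAudioFrameCount(frames)),
          let srcRaw = inBuffer.audioBufferList.pointee.mBuffers.mData,
          let dst = outBuffer.floatChannelData else { return nil }
    outBuffer.frameLength = AVAudioFrameCount(frames)
    let src = srcRaw.bindMemory(to: Int32.self, capacity: frames)
    for i in 0..<frames {
        let sample = Int32(bigEndian: src[i])        // byte-swap, if the data really is big-endian
        let value = Float(sample) / Float(Int32.max) // normalize to [-1, 1]
        dst[0][i] = value                            // left
        dst[1][i] = value                            // right
    }
    return outBuffer
}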
The speech buffer's format prints as:

inBuffer description: <AVAudioFormat 0x6000012862b0: 1 ch, 22050 Hz, 'lpcm' (0x0000000E) 32-bit big-endian signed integer>
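To sanity-check that description, a small diagnostic (a hypothetical helper, not part of the project code below) can print the first few raw samples under the unswapped host-endian reading, the declared big-endian Int32 reading, and, purely as a guess to test, a Float32 bit-pattern reading:

import AVFoundation

// Sketch only: dump the first few samples of a mono buffer under several
// interpretations so the real sample layout can be identified by eye.
func dumpFirstSamples(_ buffer: AVAudioPCMBuffer, count: Int = 8) {
    let n = min(count, Int(buffer.frameLength))
    guard n > 0, let raw = buffer.audioBufferList.pointee.mBuffers.mData else { return }
    let words = raw.bindMemory(to: UInt32.self, capacity: n)
    for i in 0..<n {
        let hostInt = Int32(bitPattern: words[i])    // unswapped, host-endian reading
        let bigEndianInt = Int32(bigEndian: hostInt) // declared big-endian reading
        let asFloat = Float(bitPattern: words[i])    // guess: raw Float32 bits
        print("frame \(i): host Int32 \(hostInt), big-endian Int32 \(bigEndianInt), Float32 \(asFloat)")
    }
}

Here is the full test code: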
import Cocoa
import AVFoundation
class SpeakerTest: NSObject, AVSpeechSynthesizerDelegate {
    let synth = AVSpeechSynthesizer()

    override init() {
        super.init()
    }

    func resampleBuffer(inSource: AVAudioPCMBuffer, newSampleRate: Double) -> AVAudioPCMBuffer? {
        // Resample and convert mono to stereo.
        var error: NSError?
        let kChannelStereo = AVAudioChannelCount(2)
        let convertRate = newSampleRate / inSource.format.sampleRate
        let outFrameCount = AVAudioFrameCount(Double(inSource.frameLength) * convertRate)
        let outFormat = AVAudioFormat(standardFormatWithSampleRate: newSampleRate, channels: kChannelStereo)!
        let avConverter = AVAudioConverter(from: inSource.format, to: outFormat)
        let outBuffer = AVAudioPCMBuffer(pcmFormat: outFormat, frameCapacity: outFrameCount)!
        // Hand the source buffer to the converter exactly once, then report
        // .noDataNow; always answering .haveData with the same buffer can make
        // the converter consume it repeatedly.
        var sourceConsumed = false
        let inputBlock: AVAudioConverterInputBlock = { (inNumPackets, outStatus) -> AVAudioBuffer? in
            if sourceConsumed {
                outStatus.pointee = .noDataNow
                return nil
            }
            sourceConsumed = true
            outStatus.pointee = .haveData // very important, must be set
            return inSource
        }
        avConverter?.sampleRateConverterAlgorithm = AVSampleRateConverterAlgorithm_Mastering
        avConverter?.sampleRateConverterQuality = AVAudioQuality.max.rawValue // the property is an Int; a bare .max would be Int.max
        if let converter = avConverter {
            let status = converter.convert(to: outBuffer, error: &error, withInputFrom: inputBlock)
            // print("\(status): \(status.rawValue)")
            // .inputRanDry is acceptable here: the single source buffer was drained.
            if (status != .haveData && status != .inputRanDry) || error != nil {
                print("\(status): \(status.rawValue), error: \(String(describing: error))")
                return nil // conversion error
            }
        } else {
            return nil // converter not created
        }
        // print("success!")
        return outBuffer
    }
    func writeToFile(_ stringToSpeak: String, speaker: String) {
        var output: AVAudioFile?
        let utterance = AVSpeechUtterance(string: stringToSpeak)
        let desktop = "~/Desktop"
        let fileName = "Utterance_Test.caf" // not in sandbox
        var tempPath = desktop + "/" + fileName
        tempPath = (tempPath as NSString).expandingTildeInPath
        let usingSampleRate = 22050.0 // 44100.0
        // Note: currently unused; the file is created from the converted
        // buffer's own settings below.
        let outSettings = [
            AVFormatIDKey: kAudioFormatLinearPCM, // kAudioFormatAppleLossless
            AVSampleRateKey: usingSampleRate,
            AVNumberOfChannelsKey: 2,
            AVEncoderAudioQualityKey: AVAudioQuality.max.rawValue
        ] as [String: Any]
        // Temporarily ignore the speaker parameter and use the default voice.
        let curLangCode = AVSpeechSynthesisVoice.currentLanguageCode()
        utterance.voice = AVSpeechSynthesisVoice(language: curLangCode)
        // utterance.volume = 1.0
        print("Int32.max: \(Int32.max), Int32.min: \(Int32.min)")
        synth.write(utterance) { (buffer: AVAudioBuffer) in
            guard let pcmBuffer = buffer as? AVAudioPCMBuffer else {
                fatalError("unknown buffer type: \(buffer)")
            }
            if pcmBuffer.frameLength == 0 {
                // done
            } else {
                // Append the buffer to the file.
                let outBuffer = self.resampleBuffer(inSource: pcmBuffer, newSampleRate: usingSampleRate)! // doesn't work: noise
                // let outBuffer = self.convertSpeechBufferToFloatStereo(pcmBuffer) // doesn't work: clipped output
                // let outBuffer = pcmBuffer // original format does work
                if output == nil {
                    // var bufferSettings = utterance.voice?.audioFileSettings
                    // Audio files cannot be non-interleaved.
                    var outSettings = outBuffer.format.settings
                    outSettings[AVLinearPCMIsNonInterleaved] = false
                    let inFormat = pcmBuffer.format
                    print("inBuffer description: \(inFormat.description)")
                    print("inBuffer settings: \(inFormat.settings)")
                    print("inBuffer format: \(inFormat.formatDescription)")
                    print("outBuffer settings: \(outSettings)\n")
                    print("outBuffer format: \(outBuffer.format.formatDescription)")
                    output = try! AVAudioFile(forWriting: URL(fileURLWithPath: tempPath), settings: outSettings)
                }
                try! output?.write(from: outBuffer)
                print("done")
            }
        }
    }
}
class ViewController: NSViewController {
    let speechDelivery = SpeakerTest()

    override func viewDidLoad() {
        super.viewDidLoad()
        let targetSpeaker = "Allison"
        var sentenceToSpeak = ""
        for indx in 1...10 {
            sentenceToSpeak += "This is sentence number \(indx). [[slnc 3000]] \n"
        }
        speechDelivery.writeToFile(sentenceToSpeak, speaker: targetSpeaker)
    }
}
Three tests can be performed (see the three outBuffer lines in the write callback). The only one that works is writing the buffer directly to disk in its original format.
Is this really "32-bit big-endian signed integer"?
Am I addressing this correctly, or is this a bug?
I'm on macOS 11.4.
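For completeness, one workaround I'm considering (a sketch built on the unconfirmed assumption that the endian flag in the descriptor is simply wrong and the bytes are actually host-endian): re-describe the same bytes with a corrected AudioStreamBasicDescription, then run the usual conversion:

import AVFoundation

// Sketch only, assuming the samples are really host-endian despite the flag:
// copy the bytes unchanged into a buffer whose format omits the big-endian
// flag, so that AVAudioConverter starts from a truthful description.
func reinterpretAsHostEndian(_ inBuffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
    var asbd = inBuffer.format.streamDescription.pointee
    asbd.mFormatFlags &= ~kAudioFormatFlagIsBigEndian // clear the endian flag
    guard let fixedFormat = AVAudioFormat(streamDescription: &asbd),
          let outBuffer = AVAudioPCMBuffer(pcmFormat: fixedFormat,
                                           frameCapacity: inBuffer.frameCapacity) else { return nil }
    outBuffer.frameLength = inBuffer.frameLength
    let src = inBuffer.audioBufferList.pointee.mBuffers
    let dst = outBuffer.mutableAudioBufferList.pointee.mBuffers
    if let srcData = src.mData, let dstData = dst.mData {
        memcpy(dstData, srcData, min(Int(src.mDataByteSize), Int(dst.mDataByteSize)))
    }
    return outBuffer
}

// Possible use inside the write callback, before resampling:
// let outBuffer = self.resampleBuffer(inSource: reinterpretAsHostEndian(pcmBuffer) ?? pcmBuffer,
//                                     newSampleRate: usingSampleRate)!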