Is it possible to use AVCaptureAudioDataOutput and AVCaptureMetadataOutput at the same time?

I've created an app that detects faces using Dlib's facial landmark predictor. I set up an AVCaptureSession to use the iPhone camera; to make it work with the Dlib framework I'm using AVCaptureVideoDataOutput, plus the system face detection via AVCaptureMetadataOutput. Face detection is working, so now I want to record a video of the face being detected and save it to a file using AVAssetWriter.


When I use just AVCaptureVideoDataOutput and AVCaptureMetadataOutput, everything works fine. But as soon as I add an audio output with AVCaptureAudioDataOutput, the app crashes. I can't think of a way around this, as I need the audio recorded as well. Is there some problem with using AVCaptureAudioDataOutput and AVCaptureMetadataOutput together? It's almost as if the captureOutput function can't handle audio sample buffers while a face is being detected.
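
From what I understand, both data outputs deliver their buffers to the same captureOutput(_:didOutput:from:) callback on the delegate, so the callback has to branch on which output each buffer came from. Here's a minimal sketch of the pattern I mean (CaptureDelegate is a placeholder name, and the outputs stand in for the ones added in openSession() below):

    import AVFoundation

    // Minimal sketch: one object acting as the delegate for BOTH data outputs.
    // Both delegate protocols declare the same callback, so every video and
    // audio buffer lands in this one method and has to be routed by output.
    class CaptureDelegate: NSObject,
                           AVCaptureVideoDataOutputSampleBufferDelegate,
                           AVCaptureAudioDataOutputSampleBufferDelegate {

        let videoDataOutput = AVCaptureVideoDataOutput()
        let audioDataOutput = AVCaptureAudioDataOutput()

        func captureOutput(_ output: AVCaptureOutput,
                           didOutput sampleBuffer: CMSampleBuffer,
                           from connection: AVCaptureConnection) {
            if output == videoDataOutput {
                // video-only work: face transforms, preview, video writer input
            } else if output == audioDataOutput {
                // audio-only work: audio writer input
            }
        }
    }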


Here's some of my code (this code works in another project where I'm just recording video and audio using AVCaptureVideoDataOutput and AVCaptureAudioDataOutput):
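
For context, the relevant properties are declared roughly like this (simplified; wrapper is my Dlib bridging class and layer is the preview layer):

    import AVFoundation

    // Simplified property declarations; the real ones live in my view controller
    let session = AVCaptureSession()
    var videoCaptureDevice: AVCaptureDevice?
    let videoDataOutput = AVCaptureVideoDataOutput()
    let audioDataOutput = AVCaptureAudioDataOutput()
    let sampleQueue = DispatchQueue(label: "sampleQueue")  // video + audio buffers
    let faceQueue = DispatchQueue(label: "faceQueue")      // face metadata callbacks
    let layer = AVSampleBufferDisplayLayer()               // preview
    var wrapper: DlibWrapper?                              // my Dlib bridging class
    var currentMetadata: [AVMetadataObject] = []           // filled by the metadata delegate
    var videoWriter: AVAssetWriter!
    var videoWriterInput: AVAssetWriterInput!
    var audioWriterInput: AVAssetWriterInput!
    var outputUrl: URL?
    var sessionAtSourceTime: CMTime?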

    // MARK: Set up camera
func openSession() {
        
        // pair with the commitConfiguration() call further down
        session.beginConfiguration()
        
        // size of the output video will be 720x1280
        session.sessionPreset = .hd1280x720
        
        // set up camera
        videoCaptureDevice = AVCaptureDevice.default(.builtInWideAngleCamera, for: AVMediaType.video, position: .front)
        
        if videoCaptureDevice != nil {
            do {
                // add the input from the device
                try session.addInput(AVCaptureDeviceInput(device: videoCaptureDevice!))
// set up the microphone
                if let audioDevice = AVCaptureDevice.default(for: AVMediaType.audio) {
                    try session.addInput(AVCaptureDeviceInput(device: audioDevice))
                }
                
                // define video output
videoDataOutput.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA]
                videoDataOutput.alwaysDiscardsLateVideoFrames = true

                let metaOutput = AVCaptureMetadataOutput()
                
                if session.canAddOutput(videoDataOutput) {
                    videoDataOutput.setSampleBufferDelegate(self, queue: sampleQueue)
                    print("videodataoutput added")
                    session.addOutput(videoDataOutput)
                }
                
                // define audio output *** THIS IS THE BLOCK OF CODE RESULTING IN CRASH ***
                if session.canAddOutput(audioDataOutput) {
                    audioDataOutput.setSampleBufferDelegate(self, queue: sampleQueue)
                    session.addOutput(audioDataOutput)
                    print("audiodataoutput added")
                }
                
                // define metadata output
                if session.canAddOutput(metaOutput) {
                    metaOutput.setMetadataObjectsDelegate(self, queue: faceQueue)
                    session.addOutput(metaOutput)
                    print("metaoutput added")
                }
                
                // availableMetadataObjectTypes change when output is added to session.
                // before it is added, availableMetadataObjectTypes is empty
                metaOutput.metadataObjectTypes = [AVMetadataObject.ObjectType.face]

                session.commitConfiguration()

                // prepare the dlib face detection
                wrapper?.prepare()

                // start the session
                session.startRunning()
                
} catch {
                print(error)
            }
        }
    }
    // MARK: Create AVAssetWriter
    func createAssetWriter() {
        
        do {
            outputUrl = videoFileLocation()
            videoWriter = try AVAssetWriter(outputURL: outputUrl!, fileType: AVFileType.mov)
            
            // add video input
            videoWriterInput = AVAssetWriterInput(mediaType: AVMediaType.video, outputSettings: [
                AVVideoCodecKey : AVVideoCodecType.h264,
                AVVideoWidthKey : 720,
                AVVideoHeightKey : 1280,
                AVVideoCompressionPropertiesKey : [
                    AVVideoAverageBitRateKey : 2300000,
                ],
                ])
            
            
            //  var pixelBufferAdaptor = AVAssetWriterInputPixelBufferAdaptor(assetWriterInput: videoWriterInput!, sourcePixelBufferAttributes: [ kCVPixelBufferPixelFormatTypeKey as String : Int(kCVPixelFormatType_32BGRA)])
            
            videoWriterInput.expectsMediaDataInRealTime = true
            
            if videoWriter.canAdd(videoWriterInput) {
                videoWriter.add(videoWriterInput)
                print("video input added")
            } else {
                print("no input added")
            }
            
            // add audio input
            audioWriterInput = AVAssetWriterInput(mediaType: AVMediaType.audio, outputSettings: [
                AVFormatIDKey: kAudioFormatMPEG4AAC,
                AVNumberOfChannelsKey: 1,
                AVSampleRateKey: 44100,
                AVEncoderBitRateKey: 64000,
                ])
            
            audioWriterInput.expectsMediaDataInRealTime = true
            
if videoWriter.canAdd(audioWriterInput) {
                videoWriter.add(audioWriterInput)
                print("audio input added")
            }
            
            videoWriter.startWriting()
            
        } catch let error {
            debugPrint(error.localizedDescription)
        }
    }
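
For reference, the helper functions used above look roughly like this (simplified; isRecording is a Bool flag I toggle from my record button):

    // Simplified versions of the helpers referenced above
    func videoFileLocation() -> URL {
        // a unique .mov file in the temporary directory
        return URL(fileURLWithPath: NSTemporaryDirectory())
            .appendingPathComponent(UUID().uuidString + ".mov")
    }

    func canWrite() -> Bool {
        // only append buffers while recording and while the writer is active
        return isRecording && videoWriter != nil && videoWriter.status == .writing
    }
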
    // MARK: AVCaptureVideoDataOutputSampleBufferDelegate
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        
        // this callback fires for BOTH the video and the audio data output,
        // so face work and the preview should only run on video buffers
        if output == videoDataOutput {
            if !currentMetadata.isEmpty {
                let boundsArray = currentMetadata
                    .compactMap { $0 as? AVMetadataFaceObject }
                    .compactMap { (faceObject) -> NSValue? in
                        // *** THIS IS WHERE THE FATAL ERROR HIT:
                        // transformedMetadataObject(for:connection:) returns nil,
                        // presumably when this callback fired for the audio output,
                        // so the force unwrap is now a guard ***
                        guard let convertedObject = output.transformedMetadataObject(for: faceObject, connection: connection) else {
                            return nil
                        }
                        return NSValue(cgRect: convertedObject.bounds)
                    }
                
                wrapper?.doWork(on: sampleBuffer, inRects: boundsArray)
            }
            
            // only video frames go to the preview layer
            layer.enqueue(sampleBuffer)
        }
        
        let writable = canWrite()
        
        if writable,
            sessionAtSourceTime == nil {
            // start the writer session at the first buffer's timestamp
            sessionAtSourceTime = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
            videoWriter.startSession(atSourceTime: sessionAtSourceTime!)
        }

        if writable,
            output == videoDataOutput,
            videoWriterInput.isReadyForMoreMediaData {
            // write video buffer
            videoWriterInput.append(sampleBuffer)
            print("video buffering")
        } else if writable,
            output == audioDataOutput,
            audioWriterInput.isReadyForMoreMediaData {
            // write audio buffer
            audioWriterInput.append(sampleBuffer)
            print("audio buffering")
        }
    }

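
And for completeness, this is roughly how I stop and finalize the recording (simplified):

    // Simplified stop-recording flow: finish both writer inputs, then close the file
    func stopRecording() {
        guard videoWriter != nil, videoWriter.status == .writing else { return }
        isRecording = false
        videoWriterInput.markAsFinished()
        audioWriterInput.markAsFinished()
        videoWriter.finishWriting { [weak self] in
            self?.sessionAtSourceTime = nil
            print("finished writing")
        }
    }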

Thanks in advance
