I am currently using CoreImage to process YCbCr 4:2:2/4:2:0 10-bit pixel buffers, but it does not perform well enough at high frame rates, so I decided to switch to Metal. With Metal, however, I am getting even worse performance. I am loading both the luma (Y) and chroma (CbCr) textures in 16-bit formats as follows:
// 16-bit normalized pixel formats for the two planes: a single channel for
// luma (Y) and two channels for chroma (CbCr).
let pixelFormatY: MTLPixelFormat = .r16Unorm
let pixelFormatUV: MTLPixelFormat = .rg16Unorm
// Both passes clear color attachment 0 to opaque black before drawing and
// store the rendered result afterwards.
// NOTE(review): the luma and chroma passes are both given the same `texture`
// here — confirm that this is intended.
let opaqueBlack = MTLClearColor(red: 0.0, green: 0.0, blue: 0.0, alpha: 1.0)

// Color attachment 0 of the luma (Y) pass.
let lumaAttachment: MTLRenderPassColorAttachmentDescriptor = renderPassDescriptorY!.colorAttachments[0]
lumaAttachment.texture = texture
lumaAttachment.loadAction = .clear
lumaAttachment.clearColor = opaqueBlack
lumaAttachment.storeAction = .store

// Color attachment 0 of the chroma (CbCr) pass.
let chromaAttachment: MTLRenderPassColorAttachmentDescriptor = renderPassDescriptorCbCr!.colorAttachments[0]
chromaAttachment.texture = texture
chromaAttachment.loadAction = .clear
chromaAttachment.clearColor = opaqueBlack
chromaAttachment.storeAction = .store
// Quad covering the whole clip-space square, handed to the vertex shader.
// Each vertex pairs a clip-space position with a texture coordinate; the
// texture v axis runs opposite to clip-space y (bottom of the quad maps to v = 1).
let vertices: [AAPLVertex] = [
    .init(position: .init(-1.0, -1.0), texCoord: .init(0.0, 1.0)),  // bottom-left
    .init(position: .init(1.0, -1.0), texCoord: .init(1.0, 1.0)),   // bottom-right
    .init(position: .init(-1.0, 1.0), texCoord: .init(0.0, 0.0)),   // top-left
    .init(position: .init(1.0, 1.0), texCoord: .init(1.0, 0.0)),    // top-right
]
// Encodes the luma (Y) pass and the chroma (CbCr) pass into a single command
// buffer and commits it. Each pass draws the same 4-vertex triangle strip and
// samples its source plane from fragment texture slot 0.
// NOTE(review): the CVMetalTexture objects behind `lumaTexture` and
// `chromaTexture` presumably must stay alive until the GPU has finished this
// command buffer — confirm against the surrounding code.
if let commandBuffer = commandQueue!.makeCommandBuffer() {
    // Byte length of the quad; identical for both passes, so compute it once.
    let vertexDataLength = vertices.count * MemoryLayout<AAPLVertex>.stride

    // Pass 1: luma (Y) plane.
    if let renderEncoderY = commandBuffer.makeRenderCommandEncoder(descriptor: renderPassDescriptorY!) {
        renderEncoderY.setRenderPipelineState(pipelineStateY!)
        renderEncoderY.setVertexBytes(vertices, length: vertexDataLength, index: 0)
        renderEncoderY.setFragmentTexture(CVMetalTextureGetTexture(lumaTexture!), index: 0)
        renderEncoderY.setViewport(MTLViewport(originX: 0, originY: 0,
                                               width: Double(dstWidthY), height: Double(dstHeightY),
                                               znear: 0, zfar: 1))
        renderEncoderY.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4, instanceCount: 1)
        renderEncoderY.endEncoding()
    }

    // Pass 2: chroma (CbCr) plane, rendered with its own destination dimensions.
    if let renderEncoderCbCr = commandBuffer.makeRenderCommandEncoder(descriptor: renderPassDescriptorCbCr!) {
        renderEncoderCbCr.setRenderPipelineState(pipelineStateCbCr!)
        renderEncoderCbCr.setVertexBytes(vertices, length: vertexDataLength, index: 0)
        renderEncoderCbCr.setFragmentTexture(CVMetalTextureGetTexture(chromaTexture!), index: 0)
        renderEncoderCbCr.setViewport(MTLViewport(originX: 0, originY: 0,
                                                  width: Double(dstWidthUV), height: Double(dstHeightUV),
                                                  znear: 0, zfar: 1))
        renderEncoderCbCr.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4, instanceCount: 1)
        renderEncoderCbCr.endEncoding()
    }

    commandBuffer.commit()
}
And here is the shader code:
// Pass-through vertex function: looks up this invocation's vertex in buffer 0
// and forwards its texture coordinate unchanged, with its 2D position
// extended to a float4 (z = 0, w = 1).
vertex MappedVertex vertexShaderYCbCrPassthru(constant Vertex *vertices [[ buffer(0) ]],
                                              unsigned int vertexId [[ vertex_id ]])
{
    constant Vertex &corner = vertices[vertexId];

    MappedVertex mapped;
    mapped.textureCoordinate = corner.texCoord;
    mapped.renderedCoordinate = float4(corner.position, 0.0, 1.0);
    return mapped;
}
// Luma pass-through: samples the texture bound at texture(0) with linear
// filtering and clamp-to-edge addressing at the interpolated texture
// coordinate, and returns its red channel.
//
// The result is returned as `float` rather than `half`: a half has an 11-bit
// significand, so rounding the sample to half before it is written out can
// move the stored value by several 16-bit codes.
// NOTE(review): assumes the render target uses a 16-bit normalized format
// like the source texture — confirm against the pipeline's attachment format.
fragment float fragmentShaderYPassthru(MappedVertex in [[ stage_in ]],
                                       texture2d<float, access::sample> textureY [[ texture(0) ]])
{
    constexpr sampler s(s_address::clamp_to_edge, t_address::clamp_to_edge,
                        min_filter::linear, mag_filter::linear);
    // sample() on a texture2d<float> already yields float4; no cast needed.
    return textureY.sample(s, in.textureCoordinate).r;
}
// Chroma pass-through: samples the texture bound at texture(0) with linear
// filtering and clamp-to-edge addressing at the interpolated texture
// coordinate, and returns its red and green channels (Cb, Cr).
//
// The result is returned as `float2` rather than `half2`: a half has an
// 11-bit significand, so rounding the sample to half before it is written out
// can move the stored value by several 16-bit codes.
// NOTE(review): assumes the render target uses a 16-bit normalized format
// like the source texture — confirm against the pipeline's attachment format.
fragment float2 fragmentShaderCbCrPassthru(MappedVertex in [[ stage_in ]],
                                           texture2d<float, access::sample> textureCbCr [[ texture(0) ]])
{
    constexpr sampler s(s_address::clamp_to_edge, t_address::clamp_to_edge,
                        min_filter::linear, mag_filter::linear);
    // sample() on a texture2d<float> already yields float4; no cast needed.
    return textureCbCr.sample(s, in.textureCoordinate).rg;
}
Is there anything fundamentally wrong in the code that makes it slow?