Developing command line Metal compute apps?

I'd like to port a command line GPU compute application to Metal.


Are there any examples that show how to initialize Metal for a pure command line environment?

Answered by ymx in 13350022

JFYI: I've created a sample app that converts an image to grayscale at https://github.com/safx/Metal-CommandLine-Sample-Swift

Accepted Answer

JFYI: I've created a sample app that converts an image to grayscale at https://github.com/safx/Metal-CommandLine-Sample-Swift

Nothing special is needed for a command-line environment. This is a simple test app that I wrote to try it out. I am no expert at this, but it seems to work.


main.swift:

import MetalKit

// Command-line Metal compute example (Swift 2-era API names): adds two vectors
// of N floats on the GPU with the `add_kernel` function from the default
// library and prints the inputs and the result.

// Number of Float elements in each vector.
let N = 100
// Number of payload bytes per vector (what is copied to and from the GPU).
let bufferLength = N * sizeof(Float)

let devices = MTLCopyAllDevices()
//print("Possible devices: \(devices)")
// Fail with a message instead of an index-out-of-range trap when no GPU exists.
guard let device = devices.first else {
    print("No Metal device available")
    exit(1)
}
print("Running compute application on device \(device.name!)")
print("Adding vectorA and vectorB into vectorC.  Each vector is \(N) floats")

let commandQueue = device.newCommandQueue()
// The default library only exists when a .metal file was compiled into the
// target; report that explicitly rather than crashing on a force unwrap.
guard let defaultLibrary = device.newDefaultLibrary() else {
    print("No default Metal library found (is Shaders.metal part of the target?)")
    exit(1)
}
let commandBuffer = commandQueue.commandBuffer()
guard let kernel = defaultLibrary.newFunctionWithName("add_kernel") else {
    print("Kernel function add_kernel not found in the default library")
    exit(1)
}
let computePipeLineDescriptor = MTLComputePipelineDescriptor()
computePipeLineDescriptor.computeFunction = kernel
let computePipelineState =
    try! device.newComputePipelineStateWithDescriptor(computePipeLineDescriptor)

// Set up thread groups to be used in commandEncoder.
// The group count is a true ceiling division: the previous (N + width) / width
// launched one entirely unnecessary group whenever N was a multiple of width.
let thrdWidth = computePipelineState.threadExecutionWidth
let thrdsPerGroup = MTLSize(width:thrdWidth,height:1,depth:1)
let groupCount = (N + thrdWidth - 1) / thrdWidth
let numThrdgroups = MTLSize(width:groupCount, height:1, depth:1)

// The kernel indexes its buffers by thread position without a bounds check,
// and groupCount * thrdWidth threads are launched, which can exceed N.
// Allocate every buffer large enough for all launched threads so the surplus
// threads touch padding instead of memory past the end of the buffer.
let paddedLength = groupCount * thrdWidth * sizeof(Float)

// Create input and output vectors, and corresponding metal buffers
let vectorA = (0..<N).map { Float($0) }
let vectorB = (0..<N).map { Float($0 * 2) }
var vectorC = [Float](count: N, repeatedValue: 0.0)
let bufferA = device.newBufferWithLength(paddedLength,
                options: MTLResourceOptions.CPUCacheModeDefaultCache)
let bufferB = device.newBufferWithLength(paddedLength,
                options: MTLResourceOptions.CPUCacheModeDefaultCache)
let bufferC = device.newBufferWithLength(paddedLength,
                options: MTLResourceOptions.CPUCacheModeDefaultCache)
// Only the first bufferLength bytes carry data; the rest is padding.
memcpy(bufferA.contents(), vectorA, bufferLength)
memcpy(bufferB.contents(), vectorB, bufferLength)

// Create Compute Command Encoder and add buffers and thread groups
let computeCommandEncoder = commandBuffer.computeCommandEncoder()
computeCommandEncoder.setBuffer(bufferA, offset: 0, atIndex: 0)
computeCommandEncoder.setBuffer(bufferB, offset: 0, atIndex: 1)
computeCommandEncoder.setBuffer(bufferC, offset: 0, atIndex: 2)
computeCommandEncoder.setComputePipelineState(computePipelineState)
computeCommandEncoder.dispatchThreadgroups(numThrdgroups,
                        threadsPerThreadgroup: thrdsPerGroup)
// Finalize configuration and start job
computeCommandEncoder.endEncoding()
commandBuffer.commit()
// Wait for job to finish
commandBuffer.waitUntilCompleted()
// Get output data back into Swift (payload only, padding is ignored)
let data = NSData(bytesNoCopy: bufferC.contents(), length: bufferLength,
                    freeWhenDone: false)
data.getBytes(&vectorC, length:bufferLength)
print("vectorA = \(vectorA)")
print("vectorB = \(vectorB)")
print("vectorC = \(vectorC)")
exit(0)


Shaders.metal:

#include <metal_stdlib>
using namespace metal;

// Element-wise vector addition: each thread handles the single element whose
// index equals its position in the grid.
//
//   a   (buffer 0)  read-only input
//   b   (buffer 1)  read-only input
//   c   (buffer 2)  output, receives a[i] + b[i]
//
// NOTE: there is no bounds check on the thread position, so the host must
// either dispatch exactly as many threads as there are elements or make all
// three buffers large enough for every dispatched thread.
kernel void add_kernel(const device float *a [[ buffer(0) ]],
                       const device float *b [[ buffer(1) ]],
                       device float *c [[ buffer(2) ]],
                       uint gid [[ thread_position_in_grid ]])
{
    const float sum = a[gid] + b[gid];
    c[gid] = sum;
}

Thanks @ymx and @salver.

FYI. Updated for Xcode 13.3.1 ...

    // Adds two vectors of N floats on the GPU with the `add_kernel` compute
    // function and prints the inputs and the result. `N` is expected to be
    // defined by the surrounding code.

    // Bytes per vector; stride is the per-element spacing inside an array.
    let bufferLength = N * MemoryLayout<Float>.stride

    let devices = MTLCopyAllDevices()
    print("\(#file):\(#line) Possible devices: \(devices)")
    guard let device = devices.first else {
        fatalError("No Metal device is available")
    }
    print("\(#file):\(#line) Running compute application on device \(device.name)")
    print("\(#file):\(#line) Adding vectorA and vectorB into vectorC.  Each vector is \(N) floats")

    guard let commandQueue = device.makeCommandQueue(),
          let commandBuffer = commandQueue.makeCommandBuffer() else {
        fatalError("Could not create a Metal command queue / command buffer")
    }
    // makeDefaultLibrary() returns nil when no .metal source was compiled
    // into the target, which is easy to miss in a command-line project.
    guard let defaultLibrary = device.makeDefaultLibrary(),
          let kernel = defaultLibrary.makeFunction(name: "add_kernel") else {
        fatalError("Could not load add_kernel from the default Metal library")
    }
    // The synchronous overload returns the pipeline state itself (the async
    // descriptor-based overload returns a tuple), so properties such as
    // threadExecutionWidth are directly reachable.
    let computePipelineState: MTLComputePipelineState
    do {
        computePipelineState = try device.makeComputePipelineState(function: kernel)
    } catch {
        fatalError("Could not create the compute pipeline state: \(error)")
    }

    // Set up thread groups to be used in commandEncoder.
    // One threadgroup spans the pipeline's execution width; the grid is
    // exactly N threads, so no thread indexes past the end of the buffers
    // (the kernel itself performs no bounds check).
    let thrdWidth     = computePipelineState.threadExecutionWidth
    let thrdsPerGroup = MTLSize(width: thrdWidth, height: 1, depth: 1)
    let gridSize      = MTLSize(width: N, height: 1, depth: 1)

    // Create input vectors and the corresponding metal buffers
    let vectorA = (0..<N).map { Float($0) }
    let vectorB = (0..<N).map { Float($0 * 2) }
    guard let bufferA = device.makeBuffer(bytes: vectorA, length: bufferLength, options: []),
          let bufferB = device.makeBuffer(bytes: vectorB, length: bufferLength, options: []),
          let bufferC = device.makeBuffer(length: bufferLength, options: []) else {
        fatalError("Could not allocate Metal buffers of \(bufferLength) bytes")
    }

    // Create Compute Command Encoder and add buffers and thread groups
    guard let computeCommandEncoder = commandBuffer.makeComputeCommandEncoder() else {
        fatalError("Could not create a compute command encoder")
    }
    computeCommandEncoder.setComputePipelineState(computePipelineState)
    computeCommandEncoder.setBuffer(bufferA, offset: 0, index: 0)
    computeCommandEncoder.setBuffer(bufferB, offset: 0, index: 1)
    computeCommandEncoder.setBuffer(bufferC, offset: 0, index: 2)
    // dispatchThreads lets the last threadgroup be partial; it needs a GPU
    // with non-uniform threadgroup support.
    computeCommandEncoder.dispatchThreads(gridSize, threadsPerThreadgroup: thrdsPerGroup)
    // Finalize configuration and start job
    computeCommandEncoder.endEncoding()
    commandBuffer.commit()
    // Wait for job to finish
    commandBuffer.waitUntilCompleted()
    if let error = commandBuffer.error {
        fatalError("GPU execution failed: \(error)")
    }

    // Get output data back into Swift by copying the N floats out of the buffer
    let resultPointer = bufferC.contents().bindMemory(to: Float.self, capacity: N)
    let vectorC = Array(UnsafeBufferPointer(start: resultPointer, count: N))

    print("\(#file):\(#line) vectorA = \(vectorA)")
    print("\(#file):\(#line) vectorB = \(vectorB)")
    print("\(#file):\(#line) vectorC = \(vectorC)")
    exit(0)

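Note that I found no replacement for the thrdWidth declaration, so I just inserted an integer. (For reference: `threadExecutionWidth` is still a property of `MTLComputePipelineState`; the async pipeline-creation call returns a (state, reflection) tuple, so the property has to be read from the tuple's first element.)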

Here is an extremely simple sample app that does what you're looking for.

Developing command line Metal compute apps?
 
 
Q