Nothing special is needed for a command-line environment. This is a simple test app that I wrote to try it out. I am no expert at this, but it seems to work.
import MetalKit
// Host program: adds two N-element Float vectors on the GPU using the
// `add_kernel` compute function, then prints both inputs and the result.
let N = 100
let bufferLength = N * sizeof(Float)

// Use the first Metal device the system reports; stop with a clear message
// instead of an index-out-of-range trap when there is none.
let devices = MTLCopyAllDevices()
//print("Possible devices: \(devices)")
guard let device = devices.first else {
    fatalError("No Metal device is available on this machine")
}
let deviceName = device.name ?? "unknown"
print("Running compute application on device \(deviceName)")
print("Adding vectorA and vectorB into vectorC. Each vector is \(N) floats")

let commandQueue = device.newCommandQueue()
guard let defaultLibrary = device.newDefaultLibrary() else {
    fatalError("Could not load the default Metal library")
}
let commandBuffer = commandQueue.commandBuffer()
guard let kernel = defaultLibrary.newFunctionWithName("add_kernel") else {
    fatalError("Function add_kernel was not found in the default Metal library")
}

let computePipeLineDescriptor = MTLComputePipelineDescriptor()
computePipeLineDescriptor.computeFunction = kernel
// A failure here means the kernel cannot be built for this device, which this
// demo cannot recover from.
let computePipelineState =
    try! device.newComputePipelineStateWithDescriptor(computePipeLineDescriptor)

// Set up thread groups to be used in commandEncoder.
// Ceiling division: just enough groups to cover N elements, without an extra
// group when N is an exact multiple of the execution width.
let thrdWidth = computePipelineState.threadExecutionWidth
let thrdsPerGroup = MTLSize(width: thrdWidth, height: 1, depth: 1)
let numThrdgroups = MTLSize(width: (N + thrdWidth - 1) / thrdWidth, height: 1, depth: 1)

// The dispatched grid is rounded up to a whole number of thread groups, so it
// can contain more threads than N. Each thread indexes the buffers by its grid
// position, so the buffers are sized for the full grid to keep every access in
// bounds. The padding elements are never read back.
let paddedLength = numThrdgroups.width * thrdWidth * sizeof(Float)

// Create input and output vectors, and corresponding metal buffers
let vectorA = (0..<N).map { Float($0) }
let vectorB = (0..<N).map { Float($0 * 2) }
var vectorC = [Float](count: N, repeatedValue: 0.0)

let bufferA = device.newBufferWithLength(paddedLength,
    options: MTLResourceOptions.CPUCacheModeDefaultCache)
let bufferB = device.newBufferWithLength(paddedLength,
    options: MTLResourceOptions.CPUCacheModeDefaultCache)
let bufferC = device.newBufferWithLength(paddedLength,
    options: MTLResourceOptions.CPUCacheModeDefaultCache)
memcpy(bufferA.contents(), vectorA, bufferLength)
memcpy(bufferB.contents(), vectorB, bufferLength)

// Create Compute Command Encoder and add buffers and thread groups
let computeCommandEncoder = commandBuffer.computeCommandEncoder()
computeCommandEncoder.setComputePipelineState(computePipelineState)
computeCommandEncoder.setBuffer(bufferA, offset: 0, atIndex: 0)
computeCommandEncoder.setBuffer(bufferB, offset: 0, atIndex: 1)
computeCommandEncoder.setBuffer(bufferC, offset: 0, atIndex: 2)
computeCommandEncoder.dispatchThreadgroups(numThrdgroups,
    threadsPerThreadgroup: thrdsPerGroup)

// Finalize configuration and start job
computeCommandEncoder.endEncoding()
commandBuffer.commit()

// Wait for job to finish
commandBuffer.waitUntilCompleted()

// Get output data back into Swift (only the first N floats are results)
memcpy(&vectorC, bufferC.contents(), bufferLength)

print("vectorA = \(vectorA)")
print("vectorB = \(vectorB)")
print("vectorC = \(vectorC)")
#include <metal_stdlib>
using namespace metal;
kernel void add_kernel(const device float *a [[ buffer(0) ]],
const device float *b [[ buffer(1) ]],
device float *c [[ buffer(2) ]],
uint id [[ thread_position_in_grid ]]) {
c[id] = a[id] + b[id];