I believe I am encountering a bug in the MPS backend of CoreML: a slice_by_index + gather pattern appears to be converted incorrectly, so the gather indexes the wrong values when executed on the GPU.
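For context, slice_by_index with squeeze_mask=[True, False] followed by gather should be equivalent to plain row indexing. A minimal NumPy sketch of the intended semantics (shapes chosen arbitrarily for illustration):

import numpy as np

i = np.random.randint(0, 5, size=(2, 8)).astype(np.int32)  # index tensor, like input "i" below
b = np.random.rand(5, 3).astype(np.float32)                # data table, like input "b" below

lslice = i[0]                   # slice_by_index(begin=[0, 0], ..., squeeze_mask=[True, False])
rslice = i[1]                   # slice_by_index(begin=[1, 0], ...)
result = b[lslice] + b[rslice]  # gather(x=b, indices=...) along axis 0, then add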
The following Python program, using the coremltools library, illustrates the issue:
import os
import tempfile

import numpy as np
import torch

import coremltools as ct
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.mil import types
dB = 20480
shapeI = (2, dB)   # index tensor: two rows of dB indices
shapeB = (dB, 22)  # data table gathered from along axis 0
@mb.program(input_specs=[mb.TensorSpec(shape=shapeI, dtype=types.int32),
                         mb.TensorSpec(shape=shapeB)])
def prog(i, b):
    # take row 0 and row 1 of the index tensor, squeezing away the leading axis
    lslice = mb.slice_by_index(x=i, begin=[0, 0], end=[1, dB], end_mask=[False, True],
                               squeeze_mask=[True, False], name='slice_left')
    rslice = mb.slice_by_index(x=i, begin=[1, 0], end=[2, dB], end_mask=[False, True],
                               squeeze_mask=[True, False], name='slice_right')
    ldata = mb.gather(x=b, indices=lslice)
    rdata = mb.gather(x=b, indices=rslice)  # actual bug in optimization of gather+slice
    x = mb.add(x=ldata, y=rdata)
    # dummy ops (seven multiply-by-2 / multiply-by-0.5 pairs) to make a bigger
    # graph so that it runs on GPU
    for _ in range(7):
        x = mb.mul(x=x, y=2.)
        x = mb.mul(x=x, y=.5)
    x = mb.mul(x=x, y=1., name='result')
    return x
input_types = [
    ct.TensorType(name="i", shape=shapeI, dtype=np.int32),
    ct.TensorType(name="b", shape=shapeB, dtype=np.float32),
]
with tempfile.TemporaryDirectory() as tmpdirname:
    model_cpu = ct.convert(prog,
                           inputs=input_types,
                           compute_precision=ct.precision.FLOAT32,
                           compute_units=ct.ComputeUnit.CPU_ONLY,
                           package_dir=os.path.join(tmpdirname, 'model_cpu.mlpackage'))
    model_gpu = ct.convert(prog,
                           inputs=input_types,
                           compute_precision=ct.precision.FLOAT32,
                           compute_units=ct.ComputeUnit.CPU_AND_GPU,
                           package_dir=os.path.join(tmpdirname, 'model_gpu.mlpackage'))
    inputs = {
        "i": torch.randint(0, shapeB[0], shapeI, dtype=torch.int32).numpy(),
        "b": torch.rand(shapeB, dtype=torch.float32).numpy(),
    }
    cpu_output = model_cpu.predict(inputs)
    gpu_output = model_gpu.predict(inputs)
    # equivalent to prog
    expected = inputs["b"][inputs["i"][0]] + inputs["b"][inputs["i"][1]]
    # what actually happens on GPU
    actual = inputs["b"][inputs["i"][0]] + inputs["b"][inputs["i"][0]]
    print(f"diff expected vs cpu: {np.sum(np.absolute(expected - cpu_output['result']))}")
    print(f"diff expected vs gpu: {np.sum(np.absolute(expected - gpu_output['result']))}")
    print(f"diff actual vs gpu: {np.sum(np.absolute(actual - gpu_output['result']))}")
The issue seems to occur in the slice_right + gather pair when executed on GPU: the wrong rows of input "i" are selected.
The program outputs:
diff expected vs cpu: 0.0
diff expected vs gpu: 150104.015625
diff actual vs gpu: 0.0
This behavior was observed on a 14-inch MacBook Pro (2023, M2 Pro) running macOS 14.7, using coremltools 8.0b2 with Python 3.9.19.
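As a possible workaround, here is an untested sketch that assumes the miscompilation is specific to slice_by_index feeding gather (which I have not confirmed): produce the two index rows with split + squeeze instead of slice_by_index inside prog.

    # hypothetical replacement for the two slice_by_index ops inside prog
    lsplit, rsplit = mb.split(x=i, num_splits=2, axis=0)        # each of shape (1, dB)
    lslice = mb.squeeze(x=lsplit, axes=[0], name='slice_left')  # shape (dB,)
    rslice = mb.squeeze(x=rsplit, axes=[0], name='slice_right')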