I've created a custom BoxBlur kernel that produces identical results to Apple's built-in box blur (CIBoxBlur) kernel but my custom kernel is orders of magnitude slower. So naturally I am wondering what I'm doing wrong to get such poor performance. Below is my custom kernel in the Metal shading language. Can you spot why it's so slow? The built-in filter performs well so I can only assume it's something I'm doing wrong.
#include <CoreImage/CoreImage.h>
#import <simd/simd.h>
extern "C" {
namespace coreimage {

// Box-blur kernel: returns the unweighted mean of the RGB values sampled on an
// integer grid in a square window around the current sampler coordinate.
// Alpha is not averaged; the result's alpha is always 1.
//
// Parameters:
//   src - sampler for the image being blurred.
// Returns:
//   float4(mean r, mean g, mean b, 1).
//
// Cost: one invocation reads (maxx - minx) * (maxy - miny) samples, i.e. about
// (2 * edge)^2 = 40,000 for edge = 100. That nested loop dominates the run
// time of this kernel.
// NOTE(review): a box filter is separable, so a horizontal pass followed by a
// vertical pass would need about 2 * (2 * edge) = 400 samples per pixel
// instead. That requires two kernel functions / two render passes and cannot
// be expressed inside this single-kernel interface.
float4 customBoxBlurFilterKernel(sampler src) {
    // Half-width of the window, in the same units as src.coord().
    const int edge = 100;

    const float2 crd = src.coord();

    // Half-open window [min, max) on each axis. The float -> int conversions
    // truncate toward zero, matching the original implicit conversions.
    // NOTE(review): because of that truncation the window is one sample
    // narrower whenever (crd - edge) is negative and fractional; confirm
    // whether floor() was intended.
    const int minx = int(crd.x - edge);
    const int maxx = int(crd.x + edge);
    const int miny = int(crd.y - edge);
    const int maxy = int(crd.y + edge);

    // The number of samples is fixed by the bounds, so compute it once here
    // instead of incrementing a float counter on every inner-loop iteration.
    const float sampleCount = float((maxx - minx) * (maxy - miny));

    // Accumulate the three colour channels with a single vector add per
    // sample; alpha is never used, so it is not accumulated.
    float3 rgbSum = float3(0.0f);
    for (int row = miny; row < maxy; ++row) {
        for (int col = minx; col < maxx; ++col) {
            rgbSum += src.sample(float2(col, row)).rgb;
        }
    }

    return float4(rgbSum / sampleCount, 1.0f);
}

}
}