I have created the following kernel function to find the average and variance of each feature map in each image (e.g., 160 x 92 x 128: width, height, depth (feature maps)), but it is very slow. Please let me know if there is a way to improve the speed. Sample code is below.
/// Computes the mean texel value of one array slice of `texture_in` and writes
/// it to `texture_out`.
///
/// Each thread handles exactly one slice, selected by `tid.z`: it sums every
/// texel of that slice, divides by the texel count, and stores the resulting
/// float4 at `tid.xy` in the same slice of `texture_out`. Threads whose
/// position falls outside either texture exit without doing any work, so the
/// kernel is safe under an oversized dispatch.
///
/// NOTE(review): only the mean is produced here; no variance is computed.
/// NOTE(review): presumably each slice packs four feature channels into
/// RGBA (MPSImage layout) - confirm against the caller.
///
/// @param texture_in   Source texture array; read across its full width and height.
/// @param texture_out  Destination texture array; receives one mean per slice.
/// @param tid          Thread position in the grid; `z` selects the slice,
///                     `xy` selects the output texel.
kernel void calculate_avgA(texture2d_array<float, access::read> texture_in [[texture(0)]],
                           texture2d_array<float, access::write> texture_out [[texture(1)]],
                           uint3 tid [[thread_position_in_grid]])
{
    const uint width = texture_in.get_width();
    const uint height = texture_in.get_height();
    const uint depth = texture_in.get_array_size();
    const uint slice = tid.z;

    // Skip threads that have no slice to process or no valid output texel.
    if (slice >= depth || slice >= texture_out.get_array_size() ||
        tid.x >= texture_out.get_width() || tid.y >= texture_out.get_height()) {
        return;
    }

    // float4(0.0f) is a real vector constructor; a parenthesised comma list
    // would only evaluate to its last scalar.
    float4 sum = float4(0.0f);

    // Coordinates are absolute (not offset by tid) so every read stays inside
    // the source texture, and only this thread's slice is visited.
    for (uint x = 0; x < width; x++) {
        for (uint y = 0; y < height; y++) {
            sum += texture_in.read(uint2(x, y), slice);
        }
    }

    const float4 mean = sum / float(width * height);
    texture_out.write(mean, tid.xy, slice);
}
/// Encodes a compute pass on `commandBuffer` that runs the `calculate_avgA`
/// kernel with the texture of `srcMPSImage` bound at index 0 and the texture
/// of `dstMPSImage` bound at index 1.
///
/// The compute pipeline is built once and cached for later calls. The
/// dispatch launches one thread per array slice of the source texture.
///
/// @param srcMPSImage   Image whose underlying texture is read by the kernel.
/// @param dstMPSImage   Image whose underlying texture receives the result.
/// @param commandBuffer Command buffer the compute pass is encoded into; this
///                      method does not commit it.
- (void)calculateArrAvgFromSrc:(MPSImage *)srcMPSImage dstImage:(MPSImage *)dstMPSImage buffer:(id<MTLCommandBuffer>)commandBuffer {
    id<MTLTexture> sourceTexture = [srcMPSImage texture];
    id<MTLTexture> destinationTexture = [dstMPSImage texture];
    NSUInteger sliceCount = [sourceTexture arrayLength];

    // A zero-sized dispatch is invalid, and a nil source reports zero slices.
    if (sliceCount == 0) {
        return;
    }

    // Build the pipeline once; it is reused by every subsequent call.
    static dispatch_once_t onceToken;
    static id<MTLFunction> averageFunction = nil;
    static id<MTLComputePipelineState> computePipelineState = nil;

    dispatch_once(&onceToken, ^{
        averageFunction = [_defaultLibrary newFunctionWithName:@"calculate_avgA"];
        if (averageFunction) {
            NSError *pipelineError = nil;
            computePipelineState = [_device newComputePipelineStateWithFunction:averageFunction
                                                                          error:&pipelineError];
            if (!computePipelineState) {
                // Report the reason instead of discarding it before the assert fires.
                NSLog(@"Failed to create compute pipeline for calculate_avgA: %@", pipelineError);
            }
        }
    });
    assert(averageFunction);
    assert(computePipelineState);

    id<MTLComputeCommandEncoder> computeCommandEncoder = [commandBuffer computeCommandEncoder];
    [computeCommandEncoder setComputePipelineState:computePipelineState];
    [computeCommandEncoder setTexture:sourceTexture atIndex:0];
    [computeCommandEncoder setTexture:destinationTexture atIndex:1];

    // One thread per slice. The grid size is threadgroups x threads-per-group,
    // so using sliceCount for both would launch sliceCount * sliceCount threads.
    MTLSize numThreadgroups = MTLSizeMake(1, 1, sliceCount);
    MTLSize numThreadsPerThreadgroup = MTLSizeMake(1, 1, 1);
    printf("Executing with numThreadgroups = {%lu %lu %lu}, numThreadsPerThreadgroup = {%lu %lu %lu}\n",
           (unsigned long)numThreadgroups.width, (unsigned long)numThreadgroups.height, (unsigned long)numThreadgroups.depth,
           (unsigned long)numThreadsPerThreadgroup.width, (unsigned long)numThreadsPerThreadgroup.height, (unsigned long)numThreadsPerThreadgroup.depth);
    [computeCommandEncoder dispatchThreadgroups:numThreadgroups threadsPerThreadgroup:numThreadsPerThreadgroup];
    [computeCommandEncoder endEncoding];
}