i have a question about convolution. i use MPSCNN to run the CNN network for a long time , for example ,VGG-NET ,ResNet ,SqueezeNet and so on . the performance is very good , SqueezeNet only need 20ms , i can use it to recognize image realtime with my iPhone. i am curious , i do not know why MPSCNN is so fast adn high performance. i just know it use Metal and GPU. so i want write the kernel code myself and compare to MPSCNN .
i construct the convolution example for that:
the input is 3x224x224
the convolution kernel is 64x3x3
the pading is 1
the stride is 1
so the output is 64x224x224
and datatype is float
the MPSCNN code is that
NSDate *start2 = [NSDate date];
MPSImageDescriptor *desc = [MPSImageDescriptor imageDescriptorWithChannelFormat:MPSImageFeatureChannelFormatFloat32 width:224 height:224 featureChannels:3];
MPSImage *srcImage = [[MPSImage alloc] initWithDevice:self.device imageDescriptor:desc];
MPSImageDescriptor *desc2 = [MPSImageDescriptor imageDescriptorWithChannelFormat:MPSImageFeatureChannelFormatFloat32 width:224 height:224 featureChannels:64];
MPSImage *outImage = [[MPSImage alloc] initWithDevice:self.device imageDescriptor:desc2];
id<MTLCommandBuffer> commandBuffer = [self.commandQueue commandBuffer];
int co = 4*224*224;
int kernel_size = 3;
int pad = 1;
int stride = 1;
int count = 64*224*224;
float *buf = new float[co];
for(int i =0;i<co;i++){
buf[i] = 1.0;
}
int weight_count = 3*64*kernel_size*kernel_size;
float *weight = new float[weight_count];
for(int i =0;i<weight_count;i++){
weight[i] = 0.123;
}
float *bias = new float[64];
for(int i =0;i<64;i++){
bias[i] = 1.23456789;
}
MTLRegion region = MTLRegionMake3D(0, 0, 0,224,224,1);
[srcImage.texture replaceRegion:region mipmapLevel:0 slice:0 withBytes:buf bytesPerRow:srcImage.width*4*sizeof(float) bytesPerImage:0];
MPSCNNConvolutionDescriptor *convdesc = [MPSCNNConvolutionDescriptor cnnConvolutionDescriptorWithKernelWidth:kernel_size kernelHeight:kernel_size inputFeatureChannels:3 outputFeatureChannels:64 neuronFilter:nil];
convdesc.strideInPixelsX = stride;
convdesc.strideInPixelsY = stride;
convdesc.groups = 1;
MPSCNNConvolution *conv = [[MPSCNNConvolution alloc] initWithDevice:self.device convolutionDescriptor:convdesc kernelWeights:weight biasTerms:bias flags:MPSCNNConvolutionFlagsNone];
MPSOffset offset;
offset.x = 0;
offset.y = 0;
offset.z = 0;
conv.offset = offset;
[conv encodeToCommandBuffer:commandBuffer sourceImage:srcImage destinationImage:outImage];
NSTimeInterval localtime2 = [[NSDate date] timeIntervalSinceDate:start2] * 1000;
cout << "data init used " << localtime2 << "ms" << endl;
NSDate *start = [NSDate date];
[commandBuffer commit];
[commandBuffer waitUntilCompleted];
delete [] buf;
delete [] weight;
delete [] bias;
NSTimeInterval localtime = [[NSDate date] timeIntervalSinceDate:start] * 1000;
cout << "gpu calc used " << localtime << "ms" << endl;
my metal code is that (because 4 channel is easy to process , so i convert input to 4x224x224)
id <MTLComputePipelineState> pipline = self.pipelineShaderTex;
int co = 4*224*224;
int kernel_size = 3;
int pad = 1;
int stride = 1;
int count = 64*224*224;
float *buf = new float[co];
for(int i =0;i<co;i++){
buf[i] = 1.0;
}
int weight_count = 4*64*kernel_size*kernel_size;
float *weight = new float[weight_count];
for(int i =0;i<weight_count;i++){
weight[i] = i%4 == 3 ? 0 : 0.123;
}
float *bias = new float[64];
for(int i =0;i<64;i++){
bias[i] = 1.23456789;
}
MetalConvolutionParameter param;
param.count = count;
param.padSize = pad;
param.kernelSize = kernel_size;
param.stride = stride;
param.inputChannel = 3;
param.outputChannel = 64;
param.inputWidth = 224;
param.inputHeight = 224;
param.outputWidth = 224;
param.outputHeight = 224;
MTLTextureDescriptor *indesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA32Float width:224 height:224 mipmapped:NO];
indesc.textureType = MTLTextureType2D;
MTLTextureDescriptor *outdesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA32Float width:224 height:224 mipmapped:NO];
outdesc.textureType = MTLTextureType2DArray;
outdesc.arrayLength = 64/4;
MTLTextureDescriptor *weightdesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA32Float width:3 height:3 mipmapped:NO];
weightdesc.textureType = MTLTextureType2DArray;
weightdesc.arrayLength = 64;
MTLTextureDescriptor *biasdesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA32Float width:1 height:1 mipmapped:NO];
biasdesc.textureType = MTLTextureType2DArray;
biasdesc.arrayLength = 64/4;
if(!self.inTexture){
self.inTexture = [self.device newTextureWithDescriptor:indesc];
self.outTexture = [self.device newTextureWithDescriptor:outdesc];
self.weightTexture = [self.device newTextureWithDescriptor:weightdesc];
self.biasTexture = [self.device newTextureWithDescriptor:biasdesc];
[self.inTexture replaceRegion:MTLRegionMake3D(0, 0, 0, 224, 224, 1) mipmapLevel:0 slice:0 withBytes:buf bytesPerRow:224*4*sizeof(float) bytesPerImage:0];
for(int i =0;i<weightdesc.arrayLength;i++){
[self.weightTexture replaceRegion:MTLRegionMake3D(0, 0, 0, 3, 3, 1) mipmapLevel:0 slice:i withBytes:weight+3*3*4*i bytesPerRow:3*4*sizeof(float) bytesPerImage:0];
}
for(int i =0;i<biasdesc.arrayLength;i++){
[self.biasTexture replaceRegion:MTLRegionMake3D(0, 0, 0, 1, 1, 1) mipmapLevel:0 slice:i withBytes:bias+4*i bytesPerRow:1*4*sizeof(float) bytesPerImage:0];
}
}
id<MTLBuffer> parambuffer = [self.device newBufferWithBytes:¶m length:sizeof(param) options:MTLResourceCPUCacheModeDefaultCache];
id<MTLCommandBuffer> commandBuffer = [self.commandQueue commandBuffer];
id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
[encoder setComputePipelineState:pipline];
[encoder setTexture:self.inTexture atIndex:0];
[encoder setTexture:self.outTexture atIndex:1];
[encoder setTexture:self.weightTexture atIndex:2];
[encoder setTexture:self.biasTexture atIndex:3];
[encoder setBuffer:parambuffer offset:0 atIndex:0];
MTLSize threadsPerGroups = MTLSizeMake(32, 16, 1);
MTLSize threadGroups = MTLSizeMake((224 + threadsPerGroups.width -1 ) / threadsPerGroups.width,
(224 + threadsPerGroups.height -1 ) / threadsPerGroups.height, 16);
[encoder dispatchThreadgroups:threadGroups threadsPerThreadgroup:threadsPerGroups];
[encoder endEncoding];
NSDate *start = [NSDate date];
[commandBuffer commit];
[commandBuffer waitUntilCompleted];
delete [] buf;
delete [] weight;
delete [] bias;
NSTimeInterval localtime = [[NSDate date] timeIntervalSinceDate:start] * 1000;
cout << "Time used " << localtime << "ms" << endl;
and metal kernel function is ( i do not process the pad and stride , and input is reading (0,0), ignore it . i just test calculator performance)
kernel void convolutionForwardTexture(texture2d<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::read> weights [[ texture(2) ]],
texture2d_array<float, access::read> bias [[ texture(3) ]],
const device MetalConvolutionParameter *convolvParams [[ buffer(0) ]],
ushort3 gid [[ thread_position_in_grid ]]){
if(gid.x>=224||gid.y>=224){
return;
}
float total = 0;
float total2 = 0;
float total3 = 0;
float total4 = 0;
float4 k,input;
int slice = gid.z;
for(int kh =0;kh<3;kh++){
for(int kw =0;kw<3;kw++) {
k = weights.read(uint2(kw,kh),slice*4);
input = inTexture.read(uint2(0,0));
total+=dot(k,input);
k = weights.read(uint2(kw,kh),slice*4+1);
input = inTexture.read(uint2(0,0));
total2+=dot(k,input);
k = weights.read(uint2(kw,kh),slice*4+2);
input = inTexture.read(uint2(0,0));
total3+=dot(k,input);
k = weights.read(uint2(kw,kh),slice*4+3);
input = inTexture.read(uint2(0,0));
total4+=dot(k,input);
}
}
float4 output = float4(total,total2,total3,total4) + bias.read(uint2(0,0),slice);
outTexture.write(output,uint2(gid.x,gid.y),gid.z);
}
the result is MPSCNN need only 10ms ,and my code is 40ms , why my code is so slow ? i do not know how MPSCNN do it ? can you give some help for me ?