Reply to Dynamic coreml model inference is significantly slower than static model
Here is a simple example:

import torch
import torch.nn as nn
import coremltools as ct

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv_pre1 = nn.ConvTranspose2d(128, 256, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv_pre2 = nn.ConvTranspose2d(256, 256, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv1 = nn.ConvTranspose2d(256, 256, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv2 = nn.ConvTranspose2d(256, 256, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv3 = nn.ConvTranspose2d(256, 256, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv4 = nn.ConvTranspose2d(256, 3, kernel_size=3, stride=2, padding=1, output_padding=1)

    def forward(self, input1, input2):
        # input2 is upsampled by 4x (two stride-2 transposed convolutions) and added to input1.
        y = self.conv_pre1(input2)
        y = self.conv_pre2(y)
        x = input1 + y
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        nn_output = torch.clip(x, 0.0, 1.0)
        recon_img_out = torch.ceil(nn_output * 255.0 - 0.5)
        return recon_img_out

model = Model()
model.cuda()

dummy_input_f = torch.randn(1, 256, 68, 120, device='cuda')
dummy_input_z = torch.randn(1, 128, 17, 30, device='cuda')

torch_model = model.eval()
trace_model = torch.jit.trace(torch_model, (dummy_input_f, dummy_input_z))

# Use EnumeratedShapes for each input; input_x2 is always 1/4 the spatial size of input_x1.
input_x1_shape = ct.EnumeratedShapes(shapes=[[1, 256, 8, 8],
                                             [1, 256, 16, 16],
                                             [1, 256, 24, 24]],
                                     default=[1, 256, 16, 16])
input_x2_shape = ct.EnumeratedShapes(shapes=[[1, 128, 2, 2],
                                             [1, 128, 4, 4],
                                             [1, 128, 6, 6]],
                                     default=[1, 128, 4, 4])

input_1 = ct.TensorType(name="input_x1", shape=input_x1_shape)
input_2 = ct.TensorType(name="input_x2", shape=input_x2_shape)
outputs = ct.TensorType(name="output_img")
# outputs = ct.ImageType(name="output_img", color_layout=ct.colorlayout.RGB)

mlmodel = ct.convert(
    trace_model,
    inputs=[input_1, input_2],
    outputs=[outputs],
)
mlmodel.save("check.mlmodel")

Except for the default shape, the other two shapes are still far too slow:

input1 8x8, input2 2x2: 50 ms
input1 24x24, input2 6x6: 50 ms
input1 16x16, input2 4x4 (default): 1.8 ms

Then I changed the model to take a single input by removing input2. The non-default shapes speed up a bit, but the times are still unusual.

Enumerated-shape model inference speed:

input1 8x8: 1.9 ms
input1 24x24: 12.14 ms
input1 16x16 (default): 1.8 ms

For comparison, fixed-size models run the 8x8 and 24x24 cases in about 0.5 ms and 4 ms.

Are these results normal? Why does even the single-input enumerated-shape model slow down by 3 to 4 times?
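For reference, here is a minimal sketch of one way such per-shape timings can be collected with coremltools' predict API on the Mac (the warm-up and iteration counts are arbitrary choices, and timings through coremltools may not exactly match on-device compute-unit behavior):

import time
import numpy as np
import coremltools as ct

mlmodel = ct.models.MLModel("check.mlmodel")

# (input_x1 HxW, input_x2 HxW) pairs matching the enumerated shapes above.
shape_pairs = [((8, 8), (2, 2)), ((16, 16), (4, 4)), ((24, 24), (6, 6))]

for (h1, w1), (h2, w2) in shape_pairs:
    x1 = np.random.rand(1, 256, h1, w1).astype(np.float32)
    x2 = np.random.rand(1, 128, h2, w2).astype(np.float32)

    # Warm-up runs so first-call specialization cost is excluded from the average.
    for _ in range(5):
        mlmodel.predict({"input_x1": x1, "input_x2": x2})

    n = 50
    start = time.perf_counter()
    for _ in range(n):
        mlmodel.predict({"input_x1": x1, "input_x2": x2})
    elapsed_ms = (time.perf_counter() - start) / n * 1000.0
    print(f"input_x1 {h1}x{w1}, input_x2 {h2}x{w2}: {elapsed_ms:.2f} ms")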
Feb ’23