CoreML dynamic shapes excessive memory use

Hi everyone, I'm developing an iOS app that uses a PyTorch GPT2 model converted to Core ML via the Python coremltools package. When I use dynamic shapes (either via RangeDim or EnumeratedShapes in coremltools) I get HUGE memory spikes when creating the model instance, while I don't get them when I specify a fixed shape.

Specifically, I have a single vector input of a given length. Below are the peak and sustained memory usages with different settings. The difference between a fixed input and a dynamic input is really big, and I can't explain why or what to do about it.

  • static (length 64): 15.5 MB
  • RangeDim (max length 64): 227 MB peak, then 22.8 MB
  • RangeDim (max length 512): 281 MB peak, then 25.8 MB
  • EnumeratedShapes (lengths from 0 to 64 in steps of 10): 328 MB peak, then 24.7 MB
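A rough way to watch the load-time spike from Python (just a sketch, assuming the spike also reproduces when loading the package through coremltools on macOS; model_path is a placeholder):

import resource

import coremltools as ct

def peak_rss_mb():
    # ru_maxrss is reported in bytes on macOS (kilobytes on Linux)
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6

model_path = "models/gpt2_dynamic.mlpackage"  # placeholder: point at one of the packages below
before = peak_rss_mb()
mlmodel = ct.models.MLModel(model_path)  # instantiation is where the spike occurs
print(f"peak RSS grew by ~{peak_rss_mb() - before:.1f} MB during load")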

Code to reproduce:

import os
from pathlib import Path

import coremltools as ct
import torch
from coremltools.converters.mil.mil import types
from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel

model_folder = "gpt2"  # stand-in: any local folder (or hub id) with GPT-2 tokenizer files
tokenizer = AutoTokenizer.from_pretrained(Path(model_folder))

# Create a module that just extracts the logits
class JustLogits(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(input_ids, return_dict=True).logits

def create_models_for(n_embd, n_layer, n_head, n_inner):
    model_str = f"n_embd_{n_embd}_n_layer_{n_layer}_n_head_{n_head}_n_inner_{n_inner}"
    model_out = f"models/gpt2_{model_str}.mlpackage"
    os.makedirs("models", exist_ok=True)  # make sure the output directory exists
    
    configuration = GPT2Config(
        vocab_size=tokenizer.vocab_size,
        n_embd=n_embd,
        n_layer=n_layer,
        n_head=n_head,
        n_inner=n_inner,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    model = GPT2LMHeadModel(configuration)
    model.eval()

    logit_model = JustLogits(model)
    logit_model.eval()

    max_sequence_length = 64

    all_input_ids = [torch.ones((seq_len,), dtype=torch.int32) for seq_len in [1, max_sequence_length]]

    # Sanity check: run the untraced model once before tracing
    out = model(all_input_ids[-1], return_dict=True)

    traced_model = torch.jit.trace(logit_model, all_input_ids[-1])

    # Fixed-shape conversion: input is always length 64
    mlmodel = ct.convert(
        traced_model,
        convert_to="mlprogram",
        minimum_deployment_target=ct.target.iOS15,
        inputs=[ct.TensorType(shape=[64], name="input_ids", dtype=types.int32)], 
        outputs=[ct.TensorType(name="output_logits")]
    )
    mlmodel.save(model_out)
    out = mlmodel.predict({"input_ids": all_input_ids[-1].numpy()})  # predict expects numpy arrays

    # Dynamic-shape conversions via RangeDim: these show the memory spike
    for inference_max_len in [64, 512]:
        mlmodel = ct.convert(
            traced_model,
            convert_to="mlprogram",
            minimum_deployment_target=ct.target.iOS15,
            inputs=[ct.TensorType(shape=[ct.RangeDim(1, inference_max_len, default=inference_max_len)], name="input_ids", dtype=types.int32)],
            outputs=[ct.TensorType(name="output_logits")]
        )
        
mlmodel.save(f"models/gpt2_{model_str}_dynamic_{inference_max_len}.mlpackage") 

create_models_for(192, 1, 6, 4*192)
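
For completeness, the EnumeratedShapes case from the list above: the snippet below (meant to go inside create_models_for, after the RangeDim loop) is a sketch of how that conversion can look. The exact shape set is an assumption here; a length-0 dimension is not valid, so it enumerates lengths 10 through 60 plus 64:

enum_shapes = ct.EnumeratedShapes(
    shapes=[[s] for s in list(range(10, 61, 10)) + [64]],
    default=[64],
)
mlmodel = ct.convert(
    traced_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS15,
    inputs=[ct.TensorType(shape=enum_shapes, name="input_ids", dtype=types.int32)],
    outputs=[ct.TensorType(name="output_logits")],
)
mlmodel.save(f"models/gpt2_{model_str}_enum_64.mlpackage")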