Hi everyone,
I'm developing an iOS app that uses a PyTorch GPT2 model converted to Core ML with the Python coremltools package. When I use dynamic shapes (either via RangeDim or EnumeratedShapes), I get huge memory spikes when creating the model instance; with a fixed shape I don't get them.
Specifically, the model has a single vector input of a given length. Below is the peak and sustained memory usage with different settings. The difference between a fixed input and a dynamic input is enormous, and I can't explain why it happens or how to mitigate it.
static (length 64): 15.5 MB
RangeDim (max length 64): 227 MB peak, then 22.8 MB sustained
RangeDim (max length 512): 281 MB peak, then 25.8 MB sustained
EnumeratedShapes (lengths from 0 to 64 with step size 10): 328 MB peak, then 24.7 MB sustained
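(Side note: the peak values are transient, so a simple before/after check only shows the sustained part. For anyone who wants a quick way to look at the load cost from Python, here is a minimal sketch using psutil against one of the .mlpackage files produced by the script below; PACKAGE_PATH is a placeholder, and this is not how the numbers above were measured.)

import psutil
import coremltools as ct

PACKAGE_PATH = "models/gpt2_dynamic_512.mlpackage"  # placeholder; point this at a package built by the script

def rss_mb():
    # resident memory of the current process, in MB
    return psutil.Process().memory_info().rss / 1e6

before = rss_mb()
# instantiating the model (compile + load) is where the spike happens
mlmodel = ct.models.MLModel(PACKAGE_PATH)
print(f"RSS before load: {before:.1f} MB, after load: {rss_mb():.1f} MB")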
Code to reproduce:
import coremltools as ct
import torch
import os
from pathlib import Path
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import AutoTokenizer

# model_folder is a local directory containing the tokenizer files
tokenizer = AutoTokenizer.from_pretrained(Path(model_folder))
# Create a module that just extracts the logits
class JustLogits(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(input_ids, return_dict=True).logits
def create_models_for(n_embd, n_layer, n_head, n_inner):
    model_str = f"n_embd_{n_embd}_n_layer_{n_layer}_n_head_{n_head}_n_inner_{n_inner}"
    model_out = f"models/gpt2_{model_str}.mlpackage"

    configuration = GPT2Config(
        vocab_size=tokenizer.vocab_size,
        n_embd=n_embd,
        n_layer=n_layer,
        n_head=n_head,
        n_inner=n_inner,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    model = GPT2LMHeadModel(configuration)
    model.init_weights()
    model.eval()

    logit_model = JustLogits(model)
    logit_model.eval()

    max_sequence_length = 64
    all_input_ids = [torch.ones((seq_len,), dtype=torch.int32) for seq_len in [1, 64, max_sequence_length]]

    # Sanity-check forward pass, then trace the logits-only wrapper at full length
    out = model(all_input_ids[1], return_dict=True)
    traced_model = torch.jit.trace(logit_model, all_input_ids[-1])
    from coremltools.converters.mil.mil import types

    # Static-shape conversion (fixed length 64)
    mlmodel = ct.convert(
        traced_model,
        convert_to="mlprogram",
        minimum_deployment_target=ct.target.iOS15,
        inputs=[ct.TensorType(shape=[64], name="input_ids", dtype=types.int32)],
        outputs=[ct.TensorType(name="output_logits")],
    )
    mlmodel.save(model_out)
    # predict expects a numpy array rather than a torch tensor
    out = mlmodel.predict({"input_ids": all_input_ids[-1].numpy()})
    # Dynamic-shape conversions via RangeDim (upper bounds 64 and 512)
    for inference_max_len in [64, 512]:
        mlmodel = ct.convert(
            traced_model,
            convert_to="mlprogram",
            minimum_deployment_target=ct.target.iOS15,
            inputs=[ct.TensorType(
                shape=[ct.RangeDim(1, inference_max_len, default=inference_max_len)],
                name="input_ids",
                dtype=types.int32,
            )],
            outputs=[ct.TensorType(name="output_logits")],
        )
        mlmodel.save(f"models/gpt2_{model_str}_dynamic_{inference_max_len}.mlpackage")
create_models_for(192, 1, 6, 4*192)
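The EnumeratedShapes case from the numbers above isn't in the script. This is not the exact code I ran, but a minimal sketch of that conversion would sit inside create_models_for next to the RangeDim loop and look roughly like this; the list of lengths is an assumption based on "from 0 to 64 with step size 10" (a length of 0 isn't valid, so it starts at 10):

# Enumerated-shapes conversion sketch: lengths 10, 20, ..., 60, plus 64 (assumed set)
enum_lengths = list(range(10, 61, 10)) + [64]
enum_shapes = ct.EnumeratedShapes(shapes=[[n] for n in enum_lengths], default=[64])
mlmodel = ct.convert(
    traced_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS15,
    inputs=[ct.TensorType(shape=enum_shapes, name="input_ids", dtype=types.int32)],
    outputs=[ct.TensorType(name="output_logits")],
)
mlmodel.save(f"models/gpt2_{model_str}_enumerated.mlpackage")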