I was also having this problem on my 2019 MacBook Pro and managed to solve it like this: you can work around it by creating your own implementation of Adam in Keras and using that.
I have made a very rough and basic implementation while referencing the Adam paper (https://arxiv.org/abs/1412.6980) and the "Creating a custom optimizer" section of the tf.keras.optimizers.Optimizer docs (https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Optimizer#creating_a_custom_optimizer_2).
Please note that I have not implemented _resource_apply_sparse or any of Adam's fancier bells and whistles (such as amsgrad); this is just the basic optimiser as described in the paper referenced above.
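For reference, these are the update rules from the paper that the code below implements (in the code, the beta_v / momentum_var1 slot tracks the first moment m_t and the beta_s / momentum_var2 slot tracks the second moment v_t):

m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2
\hat{m}_t = m_t / (1 - \beta_1^t), \quad \hat{v}_t = v_t / (1 - \beta_2^t)
\theta_t = \theta_{t-1} - \alpha\, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)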
IMPORTANT NOTE:
The code must run in eager mode, because of the self.iterations.numpy() calls. To enable this, add the line tf.config.run_functions_eagerly(True) at the top of your code.
Optimiser code:
import tensorflow as tf
tf.config.run_functions_eagerly(True)
class CustomAdam(tf.keras.optimizers.Optimizer):
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, name="CustomAdam", **kwargs):
        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))  # handle lr=learning_rate
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_v", beta1)    # decay rate of the first moment (beta_1 in the paper)
        self._set_hyper("beta_s", beta2)    # decay rate of the second moment (beta_2 in the paper)
        self._set_hyper("epsilon", epsilon)

    def _create_slots(self, var_list):
        """Create the per-variable state Adam needs: the two moment estimates
        and their bias-corrected versions."""
        for var in var_list:
            self.add_slot(var, "beta_v")       # running (biased) first moment of the gradient
            self.add_slot(var, "beta_s")       # running (biased) second moment of the gradient
            self.add_slot(var, "corrected_v")  # bias-corrected first moment
            self.add_slot(var, "corrected_s")  # bias-corrected second moment

    @tf.function
    def _resource_apply_dense(self, grad, var):
        """Update the slots and perform an optimization step for one model variable."""
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)  # handle learning rate decay

        momentum_var1 = self.get_slot(var, "beta_v")
        momentum_hyper1 = self._get_hyper("beta_v", var_dtype)
        momentum_var2 = self.get_slot(var, "beta_s")
        momentum_hyper2 = self._get_hyper("beta_s", var_dtype)

        # Exponential moving averages of the gradient and of the squared gradient
        momentum_var1.assign(momentum_var1 * momentum_hyper1 + (1. - momentum_hyper1) * grad)
        momentum_var2.assign(momentum_var2 * momentum_hyper2 + (1. - momentum_hyper2) * (grad ** 2))

        # Adam bias-corrected estimates (self.iterations.numpy() is why eager mode is required)
        corrected_v = self.get_slot(var, "corrected_v")
        corrected_v.assign(momentum_var1 / (1 - (momentum_hyper1 ** (self.iterations.numpy() + 1))))
        corrected_s = self.get_slot(var, "corrected_s")
        corrected_s.assign(momentum_var2 / (1 - (momentum_hyper2 ** (self.iterations.numpy() + 1))))

        # Parameter update: var <- var - lr * v_hat / (sqrt(s_hat) + epsilon)
        epsilon_hyper = self._get_hyper("epsilon", var_dtype)
        var.assign_add(-lr_t * (corrected_v / (tf.sqrt(corrected_s) + epsilon_hyper)))

    def _resource_apply_sparse(self, grad, var):
        raise NotImplementedError

    def get_config(self):
        base_config = super().get_config()
        return {
            **base_config,
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
            "decay": self._serialize_hyperparameter("decay"),
            "beta_v": self._serialize_hyperparameter("beta_v"),
            "beta_s": self._serialize_hyperparameter("beta_s"),
            "epsilon": self._serialize_hyperparameter("epsilon"),
        }
Example usage:
model.compile(optimizer=CustomAdam(), loss="mse")
model.fit(X, Y, epochs=10)
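For a quick end-to-end sanity check, here is a minimal self-contained sketch; the toy model, the random X/Y data and their shapes are just placeholders I made up:

import numpy as np
import tensorflow as tf

tf.config.run_functions_eagerly(True)  # required by the .numpy() calls in CustomAdam

# Toy regression data (placeholder shapes)
X = np.random.rand(256, 8).astype("float32")
Y = np.random.rand(256, 1).astype("float32")

model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(8,)),
    tf.keras.layers.Dense(1),
])

model.compile(optimizer=CustomAdam(learning_rate=0.001), loss="mse")
model.fit(X, Y, epochs=10, batch_size=32)

If you save a model compiled with this optimiser, remember to pass it via custom_objects when loading it back, e.g. tf.keras.models.load_model("model.h5", custom_objects={"CustomAdam": CustomAdam}); that is what get_config above is for.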