Bidirectional layer (for BLSTM) is way too slow on GPU - 10e5 slower

My computer almost stalls whenever I try to use a Bidirectional layer. I'm using macOS on an M1 Mac with tensorflow-macos 2.5, tensorflow-metal 0.1.2, and tensorflow-deps 2.5.0.

Below I show two short snippets of demo code: one that works (without Bidirectional) and one that does not (with Bidirectional).

import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import SimpleRNN, Bidirectional, Masking

import tensorflow_addons as tfa
# Shared configuration used by every demo model in this script.
additional_metrics = ['accuracy']  # metrics passed to model.compile
batch_size = 128
embedding_output_dims = 15  # output size of the Embedding layer
loss_function = BinaryCrossentropy()
max_sequence_length = 300  # maxlen given to pad_sequences and the Embedding input_length
num_distinct_words = 5000  # num_words for imdb.load_data and the Embedding input size
number_of_epochs = 5
# A plain Adam() used to be assigned here first, but it was overwritten
# immediately by the line below and never used, so only the effective
# optimizer is kept.
optimizer = tfa.optimizers.RectifiedAdam(learning_rate=0.01, clipnorm=0.5)
validation_split = 0.20  # validation_split passed to model.fit
verbosity_mode = 1


def working_demo_LSTM():
    """Train and evaluate a plain (unidirectional) LSTM classifier on IMDB.

    All hyper-parameters, the optimizer and the loss come from the
    module-level configuration.

    Returns:
        True once training and evaluation have completed.
    """
    # Fetch the IMDB data set and report the raw array shapes.
    train_split, test_split = imdb.load_data(num_words=num_distinct_words)
    train_sequences, train_labels = train_split
    test_sequences, test_labels = test_split
    print(train_sequences.shape)
    print(test_sequences.shape)

    # Bring every review to the same length; 0.0 corresponds with <PAD>.
    padding_options = {'maxlen': max_sequence_length, 'value': 0.0}
    train_padded = pad_sequences(train_sequences, **padding_options)
    test_padded = pad_sequences(test_sequences, **padding_options)

    # Embedding -> LSTM -> single sigmoid output.
    classifier = Sequential([
        Embedding(num_distinct_words, embedding_output_dims,
                  input_length=max_sequence_length),
        LSTM(10),
        Dense(1, activation='sigmoid'),
    ])
    classifier.compile(optimizer=optimizer, loss=loss_function,
                       metrics=additional_metrics)
    classifier.summary()

    classifier.fit(
        train_padded,
        train_labels,
        batch_size=batch_size,
        epochs=number_of_epochs,
        verbose=verbosity_mode,
        validation_split=validation_split,
    )

    # Score the trained model on the held-out test split.
    scores = classifier.evaluate(test_padded, test_labels, verbose=False)
    print(f'Test results - Loss: {scores[0]} - Accuracy: {100*scores[1]}%')

    return True


def nonworking_demo():
    """Train and evaluate a Bidirectional(SimpleRNN) classifier on IMDB.

    This is the variant reported as extremely slow on the GPU. All
    hyper-parameters, the optimizer and the loss come from the
    module-level configuration.

    Returns:
        True once training and evaluation have completed.
    """
    # Fetch the IMDB data set and report the raw array shapes.
    train_split, test_split = imdb.load_data(num_words=num_distinct_words)
    train_sequences, train_labels = train_split
    test_sequences, test_labels = test_split
    print(train_sequences.shape)
    print(test_sequences.shape)

    # Bring every review to the same length; 0.0 corresponds with <PAD>.
    padding_options = {'maxlen': max_sequence_length, 'value': 0.0}
    train_padded = pad_sequences(train_sequences, **padding_options)
    test_padded = pad_sequences(test_sequences, **padding_options)

    # Embedding -> bidirectional SimpleRNN -> sigmoid output.
    # NOTE(review): return_sequences=True presumably makes the Dense head
    # run on every timestep rather than once per review -- confirm intended.
    recurrent_layer = Bidirectional(SimpleRNN(units=10, return_sequences=True))
    classifier = Sequential([
        Embedding(num_distinct_words, embedding_output_dims,
                  input_length=max_sequence_length),
        recurrent_layer,
        Dense(1, activation='sigmoid'),
    ])
    classifier.compile(optimizer=optimizer, loss=loss_function,
                       metrics=additional_metrics)

    classifier.fit(
        train_padded,
        train_labels,
        batch_size=batch_size,
        epochs=number_of_epochs,
        verbose=verbosity_mode,
        validation_split=validation_split,
    )

    # Score the trained model on the held-out test split.
    scores = classifier.evaluate(test_padded, test_labels, verbose=False)
    print(f'Test results - Loss: {scores[0]} - Accuracy: {100*scores[1]}%')

    return True


def main():
    """Run the bidirectional demo.

    The unidirectional LSTM baseline is left disabled; uncomment it to
    compare timings.
    """
    # working_demo_LSTM()
    # The demo is defined as `nonworking_demo`; the previous call to
    # `nonworking_demo_BLSTM` raised NameError because no such function exists.
    nonworking_demo()


if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()

I'm getting the following warnings, and the computer stalls whenever I run nonworking_demo(). If I wrap the call in `with tf.device('/cpu:0'):`, I get 7 seconds per epoch. If I don't explicitly select the CPU, I get an ETA of 05:44:30 just for the first epoch! Are these values normal?

Hi @mrt77, which OS and version are you using? We recommend upgrading to macOS 12.0.

I'm already using macOS 12.0 Beta 8

The performance difference between the scripts is a result of the SimpleRNN op, not the use of the Bidirectional wrapper. TensorFlow allows the plugin to implement fused versions of the LSTM/GRU ops, and this fusion is required for GPU acceleration. The SimpleRNN op has no fused-op support in TensorFlow, which prevents the Metal plugin from providing good acceleration on the GPU. Our recommendation is to update the model to use LSTM/GRU instead. Please see the code snippet below:

def nonworking_demo():
    """Train and evaluate a Bidirectional(LSTM) classifier on IMDB.

    Same pipeline as the SimpleRNN demo, but with an LSTM inside the
    Bidirectional wrapper. All hyper-parameters, the optimizer and the
    loss come from the module-level configuration.

    Returns:
        True once training and evaluation have completed.
    """
    # Fetch the IMDB data set and report the raw array shapes.
    train_split, test_split = imdb.load_data(num_words=num_distinct_words)
    train_sequences, train_labels = train_split
    test_sequences, test_labels = test_split
    print(train_sequences.shape)
    print(test_sequences.shape)

    # Bring every review to the same length; 0.0 corresponds with <PAD>.
    padding_options = {'maxlen': max_sequence_length, 'value': 0.0}
    train_padded = pad_sequences(train_sequences, **padding_options)
    test_padded = pad_sequences(test_sequences, **padding_options)

    # Embedding -> bidirectional LSTM -> sigmoid output.
    # NOTE(review): return_sequences=True presumably makes the Dense head
    # run on every timestep rather than once per review -- confirm intended.
    recurrent_layer = Bidirectional(LSTM(units=10, return_sequences=True))
    classifier = Sequential([
        Embedding(num_distinct_words, embedding_output_dims,
                  input_length=max_sequence_length),
        recurrent_layer,
        Dense(1, activation='sigmoid'),
    ])
    classifier.compile(optimizer=optimizer, loss=loss_function,
                       metrics=additional_metrics)

    classifier.fit(
        train_padded,
        train_labels,
        batch_size=batch_size,
        epochs=number_of_epochs,
        verbose=verbosity_mode,
        validation_split=validation_split,
    )

    # Score the trained model on the held-out test split.
    scores = classifier.evaluate(test_padded, test_labels, verbose=False)
    print(f'Test results - Loss: {scores[0]} - Accuracy: {100*scores[1]}%')

    return True

Will this issue be resolved with macOS 12?

Bidirectional layer (for BLSTM) is way too slow on GPU - 10e5 slower
 
 
Q