DeepExplainer for TensorFlow >= 2.15 #3644

malamdatacloud · 2024-05-07T15:45:11Z

malamdatacloud
May 7, 2024

I have this code:

-- coding: utf-8 --

"""Ocean/

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1tYi3EA30eKxdlJeU0njY3_sTtKvQN73J
"""

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import shap
shap.initjs()

Make sure all data is in float32 format for TensorFlow compatibility

int64_columns_df = df.select_dtypes(include=['int'])
df[int64_columns_df.columns] = df[int64_columns_df.columns].astype('float32', copy=False)

X = df.drop(columns=['Shipping Company'], axis=1)
y = df['Shipping Company']

Import requirements

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense

Get some insights about the shape of ocean DataFrame:

print(f"The shape of the Ocean DataFrame is: {X.shape}")
print(f"The number of unique shipping companies is: {len(y.unique())}")
print(f"The number of unique loading countries is: {len(X['Loading Country'].unique())}")
print(f"The number of unique loading portss is: {len(X['Loading Port'].unique())}")
print(f"The number of unique destination countries is: {len(X['Destination Country'].unique())}")
print(f"The number of unique destination countries is: {len(X['Destination Port'].unique())}")

Initialize and adapt StringLookUp layers for categorical columns

lookup_loading_country = tf.keras.layers.StringLookup()
lookup_loading_country.adapt(X['Loading Country'])

lookup_loading_port = tf.keras.layers.StringLookup()
lookup_loading_port.adapt(X['Loading Port'])

lookup_destination_country = tf.keras.layers.StringLookup()
lookup_destination_country.adapt(X['Destination Country'])

lookup_destination_port = tf.keras.layers.StringLookup()
lookup_destination_port.adapt(X['Destination Port'])

lookup_shipping_company = tf.keras.layers.StringLookup()
lookup_shipping_company.adapt(y)

Create the TensorFlow dataset

def map_features(row):
# Apply StringLookup layers to the respective features within the dataset mapping function
return {
'input_loading_country': lookup_loading_country(row['Loading Country']),
'input_loading_port': lookup_loading_port(row['Loading Port']),
'input_destination_country': lookup_destination_country(row['Destination Country']),
'input_destination_port': lookup_destination_port(row['Destination Port']),
'input_numerical': [#row['Time elapsed ATA-ATD'], #1
row['Average_Delay_Per_Route'],#2
row['Time elapsed ATD-ETD'],#3
#row['Time elapsed ETD-ETA'], #4
row['LateEarly'], #5
row['On_Time_Percentage_per_Ship_Comp'],#6
#row['ATA_day'], #7
#row['ATA_weekday'],#8
row['ATD_weekday'],#9
#row['ATA_month'], #10
row['ATD_month'], #11
row['ATD_year'], #12
#row['ATA_year'], #13
row['ATD_day'], #14
row['Gross Weight'], #15
row['Billable Weight'],#16
row['Delay'],#17
row['EarlyDelivery'], #18
row['ETAAccuracy'], #19
row['Hazard'], # 20
row['Legs'], # 21
row['Volume'],# 22
row['Pack Qty'],# 23
row['DelayedDeparture'],# 24
row['OnTimeArrival'], #25
row['DeliveryDelay'], #26
row['Average_Delay'], #27
row['Amount Containers 20'], #28
row['Amount Containers 40'], #29
]}

def process_dataframe(features_df, target_df):

target_indices = lookup_shipping_company(y)
dataset = tf.data.Dataset.from_tensor_slices((features_df.to_dict('list'), target_indices))
dataset = dataset.map(lambda x, y: (map_features(x), y))
return dataset

Apply conversion to dataset

full_dataset = process_dataframe(X, y)

Shuffle and batch the dataset

full_dataset = full_dataset.shuffle(buffer_size=len(X)).batch(32)

Calculate the number of batches to split into training and validation

train_size = int(0.8 * len(X))
val_size = len(X) - train_size

train_dataset = full_dataset.take(train_size // 32) # Use train_size divided by batch size
val_dataset = full_dataset.skip(train_size // 32)

Define the model with functional API to handle multiple inputs

input_loading_country = tf.keras.Input(shape=(1,), name='input_loading_country', dtype=tf.float32)
input_loading_port = tf.keras.Input(shape=(1,), name='input_loading_port', dtype=tf.float32)
input_destination_country = tf.keras.Input(shape=(1,), name='input_destination_country', dtype=tf.float32)
input_destination_port = tf.keras.Input(shape=(1,), name='input_destination_port', dtype=tf.float32)
input_numerical = tf.keras.Input(shape=(23,), name='input_numerical')

Check for best output_dim based on input_dim:

import math
vocabulary_size_loading_country = lookup_loading_country.vocabulary_size()
vocabulary_size_loading_port = lookup_loading_port.vocabulary_size()
vocabulary_size_destination_country = lookup_destination_country.vocabulary_size()
vocabulary_size_destination_port = lookup_destination_port.vocabulary_size()
vocabulary_size_shipping = lookup_shipping_company.vocabulary_size()

loading_country_dim = int(math.sqrt(vocabulary_size_loading_country))
loading_port_dim = int(math.sqrt(vocabulary_size_loading_port))

destination_country_dim = int(math.sqrt(vocabulary_size_destination_country))
destination_port_dim = int(math.sqrt(vocabulary_size_destination_port))

shipping_dim = int(math.sqrt(vocabulary_size_shipping))

Embeddings for categorical inputs

loading_embedding_country = tf.keras.layers.Embedding(
input_dim = lookup_loading_country.vocabulary_size(),
output_dim = loading_country_dim)(input_loading_country)

loading_embedding_port = tf.keras.layers.Embedding(
input_dim = lookup_loading_port.vocabulary_size(),
output_dim = loading_port_dim)(input_loading_port)

destination_embedding_country = tf.keras.layers.Embedding(
input_dim = lookup_destination_country.vocabulary_size(),
output_dim = destination_country_dim)(input_destination_country)

destination_embedding_port = tf.keras.layers.Embedding(
input_dim = lookup_destination_port.vocabulary_size(),
output_dim = destination_port_dim)(input_destination_port)

Flatten embeddings and concatenate with numerical inputs

loading_country_flat = tf.keras.layers.Flatten()(loading_embedding_country)
loading_port_flat = tf.keras.layers.Flatten()(loading_embedding_port)

destination_country_flat = tf.keras.layers.Flatten()(destination_embedding_country)
destination_port_flat = tf.keras.layers.Flatten()(destination_embedding_port)

concatenated = tf.keras.layers.Concatenate()([loading_country_flat,
loading_port_flat,
destination_country_flat,
destination_port_flat,
input_numerical
])

x = tf.keras.layers.Dense(256, activation='relu')(concatenated)

output = tf.keras.layers.Dense(lookup_shipping_company.vocabulary_size(), activation='softmax')(x)

Assemble the model

model = tf.keras.Model(inputs=[
input_loading_country,
input_loading_port,
input_destination_country,
input_destination_port,
input_numerical
], outputs=output)

#from tensorflow.keras.callbacks import ModelCheckpoint

model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
'model_ocean_{epoch:02d}.h5', # Saves the model with the epoch number
save_best_only=False, # Saves all models
verbose=1 # Print out messages when saving the model
)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

Fit the model with the callback

history = model.fit(
train_dataset,
epochs=50,
validation_data=val_dataset,
callbacks=[model_checkpoint]
)

tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

def predict_top_companies(loading_port,
loading_country,
destination_port,
destination_country,
legs):

loading_port_encoded = lookup_loading_port(tf.constant([loading_port]))
loading_country_encoded = lookup_loading_country(tf.constant([loading_country]))

destination_port_encoded = lookup_destination_port(tf.constant([destination_port]))
destination_country_encoded = lookup_destination_country(tf.constant([destination_country]))

numerical_features = np.zeros((1, 17))  # Assuming you have 25 other numerical features
numerical_features[0, 16] = legs  # Set the last element to 'legs'

predictions = model_selected.predict([
    loading_port_encoded,
    loading_country_encoded,
    destination_port_encoded,
    destination_country_encoded,
    numerical_features
    ])

# Find the indices of the top 3 predictions
top_indices = np.argsort(predictions[0])[-1:][::-1]
top_confidences = [predictions[0][i] for i in top_indices]

# Adjust confidences based on model's overall accuracy
model_accuracy =  0.9112
adjusted_confidences = [conf * model_accuracy for conf in top_confidences]

# Get the shipping companies names
top_companies = [lookup_shipping_company.get_vocabulary()[i] for i in top_indices]

return top_companies[0]

# Create a results table
#results_table = pd.DataFrame({
#    'Shipping Company': top_companies,
#    'Confidence (%)': [f"{conf * 100:.2f}%" for conf in adjusted_confidences]
#})
#return results_table

Example usage of the function

#results = predict_top_companies(loading_port='Tel-Aviv',

loading_country='Israel',

destination_port='Dallas',

destination_country="USA",

legs=5)

#print("Top 1 Shipping Companies Ocean:")

#print(results)

explainer = shap.DeepExplainer(model, (model.layers[0].input, model.layers[-1].output))
shap_values - explainer.shap_values(val_dataset)?

This is a model and function to predict top shipping companies based on user's input.
Everything works, up to the predict_top_companies functions.

The only thing that I haven't got it to work, not even disabling tf eager mode, is the SHAP DeepExplainer on my model.
Any thoughts on how to accomplish this?
deepexplainer_tf.txt
I have uploaded the code as a txt file if someone who is eager to help wants to play with it.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

DeepExplainer for TensorFlow >= 2.15 #3644

{{title}}

Replies: 0 comments

Select a reply

DeepExplainer for TensorFlow >= 2.15 #3644

malamdatacloud May 7, 2024

-- coding: utf-8 --

Make sure all data is in float32 format for TensorFlow compatibility

Import requirements

Get some insights about the shape of ocean DataFrame:

Initialize and adapt StringLookUp layers for categorical columns

Create the TensorFlow dataset

Apply conversion to dataset

Shuffle and batch the dataset

Calculate the number of batches to split into training and validation

Define the model with functional API to handle multiple inputs

Check for best output_dim based on input_dim:

Embeddings for categorical inputs

Flatten embeddings and concatenate with numerical inputs

Assemble the model

Fit the model with the callback

Example usage of the function

loading_country='Israel',

destination_port='Dallas',

destination_country="USA",

legs=5)

Replies: 0 comments

malamdatacloud
May 7, 2024