📈 Rail Break Prediction AI


Introduction

The Rail Break Prediction AI project builds a data pipeline on the Insight Factory platform to extract, enrich, and analyse real-world rail data, and applies machine learning models to predict rail breaks within the coming 30 days.
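
For context on the modelling target, below is a minimal sketch of how a 30-day label might be derived in PySpark. The table and column names (demo.daily_features, demo.break_events, section_id, obs_date, break_date) are assumptions for illustration, not the project's actual schema.

from pyspark.sql import functions as F

# Hypothetical inputs (names are assumptions): per-day features for each track
# section, and a table of recorded break events.
features = spark.table("demo.daily_features")
breaks = spark.table("demo.break_events")

# target = 1 when the same section records a break within the next 30 days
in_window = (
    (features["section_id"] == breaks["section_id"])
    & (breaks["break_date"] > features["obs_date"])
    & (breaks["break_date"] <= F.date_add(features["obs_date"], 30))
)
labelled = (
    features.join(breaks, in_window, "left")
    .select(*[features[c] for c in features.columns],
            F.when(breaks["break_date"].isNotNull(), 1).otherwise(0).alias("target"))
    .dropDuplicates()
)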

Software Architecture

Images

Code - XAI of Decision Tree

0 - Data and Train

df = spark.sql("""select * from demo.training_table""")
df = df.fillna(0)
# select data
feature_columns = ['BrakeCylinder', 'IntrainForce', 'SND']
target_column = 'target'
# extract X and y from the Spark DataFrame into pandas
X = df.select(feature_columns).toPandas()
y = df.select(target_column).toPandas()

print(X.columns)
print(y.columns)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. create a tree classifier
clf = DecisionTreeClassifier()

# 3. train
clf.fit(X_train, y_train)

# 4. predict
y_pred = clf.predict(X_test)

# 5. get accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# F1 score
f1 = f1_score(y_test, y_pred, average='binary')  # 'micro', 'macro', or 'weighted' also work depending on the need
print(f"F1 Score: {f1}")

true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0
for i in range(len(y_pred)):
    pred = int(y_pred[i] != 0)
    real = int(y_test.iloc[i, 0] != 0)
    if pred == 0 and real == 0:
        true_negative += 1
    elif pred == 1 and real == 1:
        true_positive += 1
    elif pred == 0 and real == 1:
        false_negative += 1
    elif pred == 1 and real == 0:
        false_positive += 1
print("true positive = ", true_positive)
print("true negative = ", true_negative)
print("false positive = ", false_positive)
print("false negative = ", false_positive)
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
print("precision = ", precision)
print("recall = ", recall)
print("f1 = ", precision * recall * 2 / (precision + recall))
print("my percision = ", true_positive / (true_positive + false_positive + false_negative))
# sanity checks on the shapes and indexing of the evaluation arrays
print(type(y_test))
print(y_test.shape)
print(y_pred.shape)
print(y_test.iloc[0])
print(y_test.iloc[0, 0])
# _______________________________
print(clf)
print(clf.classes_)
print(clf.n_features_in_)
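
The hand-rolled counts above can be cross-checked against scikit-learn's built-in metrics; a minimal sketch:

from sklearn.metrics import confusion_matrix, classification_report

# ravel() on the 2x2 matrix returns tn, fp, fn, tp for labels {0, 1}
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"TP={tp} TN={tn} FP={fp} FN={fn}")
print(classification_report(y_test, y_pred, digits=4))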

0 - Control Center

# control center
class_number = clf.n_classes_
feature_number = clf.n_features_in_
print(f"class number   = {class_number} -> {clf.classes_}")
print(f"feature number = {feature_number}")

# decision tree model = clf
class_names = [f"class{i}" for i in range(class_number)]  # e.g. ['class0', 'class1'] for the binary target
# feature_names = [str(i) for i in range(feature_number)]  # 3 feature
feature_names = ['BrakeCylinder', 'IntrainForce', 'SND']
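
To keep this list from drifting out of sync with the columns selected earlier, the names can also be taken straight from the training frame:

# equivalent to the hard-coded list as long as feature_columns is unchanged
feature_names = list(X.columns)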

1 - Visualise Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

# visualise decision tree
fig = plt.figure(figsize=(100, 30))
# assign to _ so the notebook does not echo the list of returned annotations
_ = tree.plot_tree(clf, feature_names=feature_names, class_names=class_names, filled=True, max_depth=5, fontsize=12)
plt.show()
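
When the rendered tree is too large to read comfortably, scikit-learn's export_text prints the same structure as indented text; a short sketch:

from sklearn.tree import export_text

# text view of the upper levels of the trained tree
print(export_text(clf, feature_names=feature_names, max_depth=3))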

2 - Feature Importance

import numpy as np

# Get Feature Importance
importances = clf.feature_importances_

# Match feature name and Importance
# feature_names = [str(i) for i in range(3)]
feature_importances = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

# Print the top 10 most important features
print(f"Feature        | Importance")
for feature, importance in feature_importances[:10]:
    print(f"{feature:15}| {importance:.6f}")
import matplotlib.pyplot as plt

# Select the top 20 most important features for visualization
top_n = 20
top_features = [f[0] for f in feature_importances[:top_n]]
top_importances = [f[1] for f in feature_importances[:top_n]]

# Plot figure
plt.figure(figsize=(10, 4))
plt.barh(top_features, top_importances, color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title(f'Top {len(top_features)} Feature Importances')
plt.gca().invert_yaxis()  # Features are listed from highest to lowest
plt.show()
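
Impurity-based importances are computed on the training data and can favour features with many possible split points; a common cross-check (not part of the original notebook) is permutation importance on the held-out split, sketched below.

from sklearn.inspection import permutation_importance
import numpy as np

# shuffle each feature in the test split and measure the drop in accuracy
perm = permutation_importance(clf, X_test, np.ravel(y_test), n_repeats=10, random_state=42)
for name, mean, std in sorted(zip(feature_names, perm.importances_mean, perm.importances_std),
                              key=lambda x: x[1], reverse=True):
    print(f"{name:15}| {mean:.6f} +/- {std:.6f}")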

3 - Decision Path

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

# Choose a sample by index
sample_index = 0
X_sample = X_test.iloc[sample_index].to_numpy().reshape(1, -1)

# get decision path
decision_path = clf.decision_path(X_sample)

# Indicator row of the nodes this sample passes through, and the leaf it lands in
node_indicator = decision_path[0]
leaf_id = clf.apply(X_sample)

# print decision path
print(f"Decision path for sample {sample_index}:")
for node_id in node_indicator.indices:
    if node_id == leaf_id[0]:
        # the leaf has no split rule, so skip it inside the loop
        continue
    feature_id = clf.tree_.feature[node_id]
    threshold = clf.tree_.threshold[node_id]
    # compare the sample's value with the node threshold to see which branch was taken
    if X_sample[0, feature_id] <= threshold:
        threshold_sign = "<="
    else:
        threshold_sign = ">"
    print(f"Node {node_id}: {feature_names[feature_id]} = {X_sample[0, feature_id]} "
          f"{threshold_sign} threshold {threshold:.4f}")

# The last node on the path is the leaf node
print(f"Leaf node: {leaf_id[0]}")

4 - LIME

import lime
import lime.lime_tabular
import numpy as np

# Ensure X_train and X_test are numpy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)

# model is clf, background (training) data is X_train
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=feature_names, class_names=class_names, discretize_continuous=True)

# choose a sample
i = 0
exp = explainer.explain_instance(X_test[i], clf.predict_proba, num_features=10)

# show result
exp.show_in_notebook(show_all=False)
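
If the interactive widget does not render (for example outside Jupyter), the explanation can also be read as plain data or exported using LIME's Explanation API:

# feature/weight pairs for the predicted class, and a standalone HTML report
print(exp.as_list())
exp.save_to_file('lime_explanation.html')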