Model explainability remains a hurdle to the widespread adoption and understanding of machine learning. In this notebook, we train and visualize a neural net that predicts credit card defaults based on credit usage and payment history, plus some demographic information. The goal is to explore how we can use VIP to visualize the output of complex machine learning models, and then explain the results in terms of the input features.
In the final portion of the notebook, we also visualize the results of a grid search to determine which hyperparameter combinations optimize a gradient boosting machine (GBM).
import pandas as pd
import numpy as np
# Import API
from virtualitics import api
vip=api.VIP()
Setting up WebSocket connection to: ws://localhost:12345/api Connection Successful! Initializing session.
Data from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
This dataset contains information on default payments, demographic factors, credit data, payment history, and bill statements of credit card clients in Taiwan from April 2005 to September 2005. All amounts are given in New Taiwan dollars (NT$).
# Import data from local
# path_to_data = '../../data/default of credit card clients.csv'
# df = pd.read_csv(path_to_data)
# Uncomment previous two lines to import data from local file instead of UCI machine learning repository
link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls'
df = pd.read_excel(link, header=1)
df.drop('ID', axis=1, inplace=True)
# Rename categorical class values with descriptions
sex = {1:'Male', 2:'Female'}
education = {1:'graduate', 2:'university', 3:'high school', 4:'other', 5:'other', 6:'other', 0:'other'}
marriage = {1:'married', 2:'single', 3:'other', 0:'other'}
df['SEX'] = [sex[x] for x in df['SEX']]
df['EDUCATION'] = [education[x] for x in df['EDUCATION']]
df['MARRIAGE'] = [marriage[x] for x in df['MARRIAGE']]
# Define target and categoricals
target = 'default payment next month'
features = list(df.columns)[:-1]
categoricals = ['SEX', 'EDUCATION', 'MARRIAGE']
df.head()
| | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | PAY_5 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20000 | Female | university | married | 24 | 2 | 2 | -1 | -1 | -2 | ... | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 1 | 120000 | Female | university | single | 26 | -1 | 2 | 0 | 0 | 0 | ... | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
| 2 | 90000 | Female | university | single | 34 | 0 | 0 | 0 | 0 | 0 | ... | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 |
| 3 | 50000 | Female | university | married | 37 | 0 | 0 | 0 | 0 | 0 | ... | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 |
| 4 | 50000 | Male | university | married | 57 | -1 | 0 | -1 | 0 | 0 | ... | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 |
5 rows × 24 columns
See Kaggle for further explanation of the dataset: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset/
vip.load_data(df, 'credit_card_defaults')
Data set loaded with name: 'credit_card_defaults (2)'
'credit_card_defaults (2)'
vip.smart_mapping(target, features)
vip.normalize()
| | SmartMapping Rank | Feature | Correlated Group |
|---|---|---|---|
| 0 | 1 | PAY_3 | None |
| 1 | 2 | PAY_2 | None |
| 2 | 3 | PAY_0 | None |
| 3 | 4 | PAY_4 | None |
| 4 | 5 | PAY_5 | None |
vip.insights()
Insight |
---|
The 1 category makes up a relatively high distribution of points when LIMIT_BAL is between 50000 and 70000, PAY_AMT2 is between 10 and 60994, and PAY_AMT1 is between 28 and 4448. Out of 3,303 points in this region, 23.46% are 1 vs. only 21.95% overall. |
The 1 category makes up a relatively high distribution of points when LIMIT_BAL is between 10000 and 40000, PAY_AMT2 is between 12 and 28883, and PAY_AMT1 is between 100 and 36147. Out of 2,682 points in this region, 28.82% are 1 vs. only 21.46% overall. |
The 1 category makes up a relatively high distribution of points when LIMIT_BAL is between 80000 and 710000, PAY_AMT2 is between 0 and 316, and PAY_AMT1 is between 0 and 18. Out of 2,171 points in this region, 29.53% are 1 vs. only 21.54% overall. |
The 1 category makes up a relatively high distribution of points when LIMIT_BAL is between 10000 and 70000, PAY_AMT2 is between 0 and 50000, and PAY_AMT1 is between 0 and 16. Out of 1,714 points in this region, 46.97% are 1 vs. only 20.61% overall. |
The 1 category makes up a relatively high distribution of points when LIMIT_BAL is between 10000 and 140000, PAY_AMT2 is between 0 and 9, and PAY_AMT1 is between 39 and 125000. Out of 1,685 points in this region, 40.12% are 1 vs. only 21.05% overall. |
The 1 category makes up a relatively high distribution of points when LIMIT_BAL is between 80000 and 680000, PAY_AMT2 is between 322 and 149654, and PAY_AMT1 is between 0 and 21. Out of 1,495 points in this region, 33.18% are 1 vs. only 21.54% overall. |
The 0 category makes up a relatively high distribution of points when LIMIT_BAL is between 150000 and 300000, PAY_AMT2 is between 876 and 4003, and PAY_AMT1 is between 29 and 164163. Out of 2,764 points in this region, 88.49% are 0 vs. only 76.8% overall. |
The 0 category makes up a relatively high distribution of points when LIMIT_BAL is between 150000 and 300000, PAY_AMT2 is between 4004 and 8858, and PAY_AMT1 is between 22 and 304815. Out of 2,375 points in this region, 85.6% are 0 vs. only 77.22% overall. |
The 0 category makes up a relatively high distribution of points when LIMIT_BAL is between 310000 and 1000000, PAY_AMT2 is between 7524 and 1684259, and PAY_AMT1 is between 33 and 873552. Out of 1,680 points in this region, 90.6% are 0 vs. only 77.13% overall. |
The 0 category makes up a relatively high distribution of points when LIMIT_BAL is between 50000 and 140000, PAY_AMT2 is between 10 and 140000, and PAY_AMT1 is between 4560 and 143713. Out of 1,663 points in this region, 84.97% are 0 vs. only 77.46% overall. |
The 0 category makes up a relatively high distribution of points when LIMIT_BAL is between 310000 and 800000, PAY_AMT2 is between 884 and 7507, and PAY_AMT1 is between 31 and 368199. Out of 1,627 points in this region, 92.5% are 0 vs. only 77.04% overall. |
The 0 category makes up a relatively high distribution of points when LIMIT_BAL is between 150000 and 300000, PAY_AMT2 is between 8866 and 237648, and PAY_AMT1 is between 22 and 273844. Out of 1,520 points in this region, 89.21% are 0 vs. only 77.28% overall. |
When LIMIT_BAL is low and the PAY_AMT values are low, the risk of default is higher. These are likely customers with little or risky credit history, which is what led to the low credit limit in the first place.
When LIMIT_BAL is high, the risk of default is low regardless of payment amounts. This may be because obtaining a high credit limit requires either demonstrated fiscal responsibility or existing wealth.
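As a rough numeric cross-check of these insights, here is a minimal pandas sketch; the credit-limit bands below are illustrative, not the exact regions VIP identified.
# Default rate by credit-limit band (bin edges chosen for illustration only)
limit_bands = pd.cut(df['LIMIT_BAL'], bins=[0, 50000, 150000, 300000, 1000000])
print(df.groupby(limit_bands)[target].mean())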
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, roc_auc_score, make_scorer
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras.layers import Dense
import keras.backend as K
import tensorflow
# from tensorflow.random import set_seed
# from tensorflow import set_random_seed
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(1)
# One-hot encode categoricals
data = pd.get_dummies(df, columns=categoricals)
# Engineered features: credit utilization, and a flag for no reported delay on the most recent payment (PAY_0 < 1)
data['UTILIZATION'] = data['BILL_AMT1'] / data['LIMIT_BAL']
data['ON_TIME'] = (data['PAY_0'] < 1).astype(int)
train, test = train_test_split(data, test_size=.66666, random_state=100)
test_indices = test.index
# Normalize
scl=StandardScaler()
X = scl.fit_transform(train.drop(target, axis=1).astype(np.float64))
y = train[target]
# Simple feed-forward network: two ReLU hidden layers and a sigmoid output for default probability
NeuralNet = Sequential()
NeuralNet.add(Dense(25, input_dim=train.shape[1]-1, activation='relu'))
NeuralNet.add(Dense(8, activation='relu'))
NeuralNet.add(Dense(1, activation='sigmoid'))
NeuralNet.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
NeuralNet.fit(X, y, epochs=20, batch_size=32)
Epoch 1/20 313/313 [==============================] - 1s 471us/step - loss: 0.5039 - accuracy: 0.7919 Epoch 2/20 313/313 [==============================] - 0s 465us/step - loss: 0.4521 - accuracy: 0.8134 Epoch 3/20 313/313 [==============================] - 0s 463us/step - loss: 0.4429 - accuracy: 0.8143 Epoch 4/20 313/313 [==============================] - 0s 475us/step - loss: 0.4381 - accuracy: 0.8159 Epoch 5/20 313/313 [==============================] - 0s 461us/step - loss: 0.4349 - accuracy: 0.8158 Epoch 6/20 313/313 [==============================] - 0s 455us/step - loss: 0.4320 - accuracy: 0.8167 Epoch 7/20 313/313 [==============================] - 0s 468us/step - loss: 0.4299 - accuracy: 0.8180 Epoch 8/20 313/313 [==============================] - 0s 461us/step - loss: 0.4275 - accuracy: 0.8179 Epoch 9/20 313/313 [==============================] - 0s 460us/step - loss: 0.4260 - accuracy: 0.8197 Epoch 10/20 313/313 [==============================] - 0s 542us/step - loss: 0.4244 - accuracy: 0.8201 Epoch 11/20 313/313 [==============================] - 0s 502us/step - loss: 0.4224 - accuracy: 0.8209 Epoch 12/20 313/313 [==============================] - 0s 477us/step - loss: 0.4214 - accuracy: 0.8211 Epoch 13/20 313/313 [==============================] - 0s 488us/step - loss: 0.4210 - accuracy: 0.8222 Epoch 14/20 313/313 [==============================] - 0s 481us/step - loss: 0.4198 - accuracy: 0.8218 Epoch 15/20 313/313 [==============================] - 0s 495us/step - loss: 0.4179 - accuracy: 0.8239 Epoch 16/20 313/313 [==============================] - 0s 470us/step - loss: 0.4175 - accuracy: 0.8242 Epoch 17/20 313/313 [==============================] - 0s 461us/step - loss: 0.4167 - accuracy: 0.8254 Epoch 18/20 313/313 [==============================] - 0s 498us/step - loss: 0.4153 - accuracy: 0.8266 Epoch 19/20 313/313 [==============================] - 0s 481us/step - loss: 0.4147 - accuracy: 0.8247 Epoch 20/20 313/313 [==============================] - 0s 468us/step - loss: 0.4138 - accuracy: 0.8251
<keras.callbacks.History at 0x20496c12550>
NeuralNet.evaluate(scl.transform(test.drop(target, axis=1).astype(np.float64)), test[target])
625/625 [==============================] - 0s 340us/step - loss: 0.4434 - accuracy: 0.8149
[0.4433560073375702, 0.8148999810218811]
vip.load_data(df.loc[test_indices])
# Compute prediction probabilities on the hold-out set and add them to VIP as a new column
preds = pd.Series(NeuralNet.predict(scl.transform(test.drop(target, axis=1).astype(np.float64))).ravel(),
                  name='pred_probs', index=test_indices)
vip.add_column(preds)
Data set loaded with name: 'user_dataset_2 (2)' Note: 'New column, pred_probs, successfully added to the current dataset.'
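Accuracy alone can be misleading on an imbalanced target (only about 22% of customers default), so as a quick sketch we can also check ranking quality with the roc_auc_score imported earlier.
# ROC AUC of the predicted probabilities on the hold-out set (sanity-check sketch)
print('Test ROC AUC:', roc_auc_score(test[target], preds))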
# Add the engineered features to the VIP dataset
for feature in ['UTILIZATION', 'ON_TIME']:
    vip.add_column(test[feature])
Note: 'New column, UTILIZATION, successfully added to the current dataset.' Note: 'New column, ON_TIME, successfully added to the current dataset.'
vip.smart_mapping('pred_probs', features + ['UTILIZATION', 'ON_TIME'])
vip.normalize('softmax')
smart_mapping_plot = vip.local_history[-1]
smart_mapping_plot.transparency = 'pred_probs'
smart_mapping_plot.color_type = "gradient"
vip.show(smart_mapping_plot)
| | SmartMapping Rank | Feature | Correlated Group |
|---|---|---|---|
| 0 | 1 | ON_TIME | None |
| 1 | 2 | PAY_0 | None |
| 2 | 3 | PAY_2 | None |
| 3 | 4 | PAY_3 | None |
| 4 | 5 | PAY_4 | None |
Note: 'Trend Lines are not currently supported.'
Note the distribution of risk in the color legend on the right (it may help to switch to percent view). This gives a breakdown of how the model might take education into account when making predictions. The same can be done for marital status and gender.
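For a numeric counterpart to that legend breakdown, here is a minimal pandas sketch; it simply averages the predicted probability per category over the un-encoded hold-out rows.
# Mean predicted default probability per demographic category (hold-out rows)
demo = df.loc[test_indices].copy()
demo['pred_probs'] = preds
for col in ['EDUCATION', 'MARRIAGE', 'SEX']:
    print(demo.groupby(col)['pred_probs'].mean().round(3))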
Using a gradient boosting machine, we now search for the optimal hyperparameters by maximizing the F1 score.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
# Hyperparameter range
tuned_parameters = [{'n_estimators': range(50, 150, 25), 'max_depth': range(2, 5, 1),
                     'min_samples_split': range(5, 25, 5)}]
X_train = train.drop(target, axis=1)
y_train = train[target]
X_test = test.drop(target, axis=1)
y_test = test[target]
from time import time
score='f1'
s = time()
print("# Tuning hyper-parameters for f1 score")
clf = GridSearchCV(GradientBoostingClassifier(subsample=.2, random_state=100), tuned_parameters, cv=2,
                   scoring='f1', return_train_score=True)
clf.fit(X_train, y_train)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
# Tuning hyper-parameters for f1 score Best parameters set found on development set: {'max_depth': 2, 'min_samples_split': 5, 'n_estimators': 100}
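As a quick sanity-check sketch: GridSearchCV refits the best parameter combination on the full training split by default (refit=True), so the tuned model can be scored on the hold-out set directly.
# Hold-out F1 of the refit best estimator
print('Hold-out F1:', f1_score(y_test, clf.best_estimator_.predict(X_test)))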
Note how hard it is to spot the overall trends in a long list of printed text.
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()
0.454 (+/-0.021) for {'max_depth': 2, 'min_samples_split': 5, 'n_estimators': 50} 0.454 (+/-0.028) for {'max_depth': 2, 'min_samples_split': 5, 'n_estimators': 75} 0.458 (+/-0.025) for {'max_depth': 2, 'min_samples_split': 5, 'n_estimators': 100} 0.452 (+/-0.015) for {'max_depth': 2, 'min_samples_split': 5, 'n_estimators': 125} 0.453 (+/-0.022) for {'max_depth': 2, 'min_samples_split': 10, 'n_estimators': 50} 0.454 (+/-0.029) for {'max_depth': 2, 'min_samples_split': 10, 'n_estimators': 75} 0.456 (+/-0.020) for {'max_depth': 2, 'min_samples_split': 10, 'n_estimators': 100} 0.455 (+/-0.014) for {'max_depth': 2, 'min_samples_split': 10, 'n_estimators': 125} 0.453 (+/-0.019) for {'max_depth': 2, 'min_samples_split': 15, 'n_estimators': 50} 0.453 (+/-0.030) for {'max_depth': 2, 'min_samples_split': 15, 'n_estimators': 75} 0.454 (+/-0.024) for {'max_depth': 2, 'min_samples_split': 15, 'n_estimators': 100} 0.449 (+/-0.022) for {'max_depth': 2, 'min_samples_split': 15, 'n_estimators': 125} 0.453 (+/-0.018) for {'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 50} 0.454 (+/-0.030) for {'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 75} 0.453 (+/-0.028) for {'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 100} 0.448 (+/-0.021) for {'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 125} 0.444 (+/-0.025) for {'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 50} 0.457 (+/-0.046) for {'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 75} 0.454 (+/-0.036) for {'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 100} 0.451 (+/-0.026) for {'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 125} 0.449 (+/-0.044) for {'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 50} 0.453 (+/-0.054) for {'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 75} 0.450 (+/-0.051) for {'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 100} 0.450 (+/-0.036) for {'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 125} 0.451 (+/-0.031) for {'max_depth': 3, 'min_samples_split': 15, 'n_estimators': 50} 0.447 (+/-0.060) for {'max_depth': 3, 'min_samples_split': 15, 'n_estimators': 75} 0.445 (+/-0.051) for {'max_depth': 3, 'min_samples_split': 15, 'n_estimators': 100} 0.451 (+/-0.064) for {'max_depth': 3, 'min_samples_split': 15, 'n_estimators': 125} 0.448 (+/-0.031) for {'max_depth': 3, 'min_samples_split': 20, 'n_estimators': 50} 0.443 (+/-0.045) for {'max_depth': 3, 'min_samples_split': 20, 'n_estimators': 75} 0.443 (+/-0.017) for {'max_depth': 3, 'min_samples_split': 20, 'n_estimators': 100} 0.447 (+/-0.023) for {'max_depth': 3, 'min_samples_split': 20, 'n_estimators': 125} 0.449 (+/-0.026) for {'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 50} 0.442 (+/-0.022) for {'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 75} 0.446 (+/-0.020) for {'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 100} 0.443 (+/-0.013) for {'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 125} 0.444 (+/-0.024) for {'max_depth': 4, 'min_samples_split': 10, 'n_estimators': 50} 0.443 (+/-0.033) for {'max_depth': 4, 'min_samples_split': 10, 'n_estimators': 75} 0.432 (+/-0.013) for {'max_depth': 4, 'min_samples_split': 10, 'n_estimators': 100} 0.439 (+/-0.008) for {'max_depth': 4, 'min_samples_split': 10, 'n_estimators': 125} 0.443 (+/-0.031) for {'max_depth': 4, 'min_samples_split': 15, 'n_estimators': 50} 0.440 (+/-0.015) for {'max_depth': 4, 'min_samples_split': 15, 'n_estimators': 75} 0.439 (+/-0.015) for {'max_depth': 4, 'min_samples_split': 
15, 'n_estimators': 100} 0.439 (+/-0.002) for {'max_depth': 4, 'min_samples_split': 15, 'n_estimators': 125} 0.440 (+/-0.051) for {'max_depth': 4, 'min_samples_split': 20, 'n_estimators': 50} 0.439 (+/-0.040) for {'max_depth': 4, 'min_samples_split': 20, 'n_estimators': 75} 0.444 (+/-0.038) for {'max_depth': 4, 'min_samples_split': 20, 'n_estimators': 100} 0.438 (+/-0.027) for {'max_depth': 4, 'min_samples_split': 20, 'n_estimators': 125}
cv_grid = pd.DataFrame(clf.cv_results_['params'])
cv_grid['train_f1'] = clf.cv_results_['mean_train_score']
cv_grid['test_f1'] = clf.cv_results_['mean_test_score']
vip.load_data(cv_grid, 'gridsearch')
vip.plot(x='max_depth', y='min_samples_split', z='n_estimators',
         size='train_f1', color='test_f1', size_scale=5.0, color_type='gradient',
         color_normalization='softmax')
Data set loaded with name: 'gridsearch (2)'
Drag the slider in the color legend to reveal where the optimal hyperparameters are located.
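The same optimum can be cross-checked numerically with a short pandas sketch on the cv_grid frame built above.
# Top hyperparameter combinations by mean cross-validated F1, for comparison with the plot
print(cv_grid.sort_values('test_f1', ascending=False).head())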