Group 19: Fernando Trias, Roger Iliffe, HyounJun Park, Siyuan Yin
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
Before loading this data, you must create the "bz2" files by running the Data_cleansing notebook.
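For reference, a sketch of how Data_cleansing presumably writes these files (pandas infers bz2 compression from the file extension; the frame names here are hypothetical):
# hypothetical final step of the Data_cleansing notebook
cleaned_predictors.to_pickle("analysis-predictors.bz2")
cleaned_target.to_pickle("analysis-target.bz2")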
public_data_cat = pd.read_pickle("analysis-predictors.bz2")
public_data_y = pd.read_pickle("analysis-target.bz2")
public_data_cat = public_data_cat.drop('issue_date',axis=1)
public_data_y.mean()
We created two design matrices: one for training and testing on the full data set, and another for training and testing on a random sample so that our experiments could run faster.
X_sub, y_sub = resample(public_data_cat, public_data_y, n_samples=10000)
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
X_sub, y_sub, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
public_data_cat, public_data_y, test_size=0.25, random_state=42)
We scaled the predictors with StandardScaler.
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit(X_train)
X_train_scale = pd.DataFrame(data=scale.transform(X_train), columns=X_train.columns.tolist())
X_test_scale = pd.DataFrame(data=scale.transform(X_test), columns=X_test.columns.tolist())
# scale the subsampled train and test data with the same fitted scaler
X_train_sub_scale = pd.DataFrame(data=scale.transform(X_train_sub), columns=X_train.columns.tolist())
X_test_sub_scale = pd.DataFrame(data=scale.transform(X_test_sub), columns=X_test.columns.tolist())
y_train.describe()
X_test_scale.describe()
y_test.describe()
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train_scale, y_train)
model.score(X_train_scale, y_train)
model.score(X_test_scale, y_test)
model.coef_
#
# Draw a bar chart showing each coefficient's magnitude and direction
#
def graph_coef(model, columns):
    # pair each predictor with its coefficient, sorted by magnitude
    cf = dict(zip(columns, model.coef_.flatten()))
    data = sorted(cf.items(), key=lambda x: abs(x[1]))
    data_order, data_values = zip(*data)
    data_order = np.array(data_order)
    data_values = np.array(data_values)
    index = np.arange(len(data_order))
    fig, ax = plt.subplots(1, 1, figsize=(10, 30))
    # red for negative coefficients, blue for positive
    colors = np.array([(1, 0, 0)] * len(data_values), dtype=float)
    colors[data_values >= 0] = (0, 0, 1)
    ax.barh(index, abs(data_values), color=colors, alpha=0.8)
    # set the tick positions before the labels so they line up
    ax.set_yticks(index)
    ax.set_yticklabels(data_order)
    plt.tick_params(axis='both', which='major', labelsize=20)
    ax.set_title("Magnitude of coefficient", fontsize=20)
    ax.set_ylabel("predictors")
    ax.set_xlabel("magnitude");
Drawing a graph of each coefficient's magnitude and direction helps us understand which factors the model considers significant. Red bars indicate negative coefficients; blue bars indicate positive ones.
graph_coef(model, X_test_scale.columns)
from sklearn.linear_model import LassoCV
LassoCVobject = LassoCV()
LassoCVobject.fit(X_train_scale, y_train)
LassoCVobject.score(X_train_scale, y_train)
LassoCVobject.score(X_test_scale, y_test)
np.mean(LassoCVobject.predict(X_test_scale))
graph_coef(LassoCVobject, X_test_scale.columns)
For simplicity, we want to predict all the loans that satisfy a minimum return.
y_train_cat = y_train > 0.05  # minimum-return threshold; this value is our choice
y_test_cat = y_test > 0.05
X_train_scale
from sklearn.linear_model import LogisticRegression
model_logreg = LogisticRegression().fit(X_train_scale, y_train_cat)
model_logreg.score(X_train_scale, y_train_cat)
model_logreg.score(X_test_scale, y_test_cat)
graph_coef(model_logreg, X_test_scale.columns)
The accuracy score of the model is not sufficient to determine its quality. This is because a loan's loss can be up to 100%, whereas the gains are usually limited by the interest rate to 10-20%. Thus, a few bad loans can completely wipe out the profits from good loans. For example, if we choose 10 loans and 9 of them make 10% but the last one is a total loss, the overall performance will be negative. However, the accuracy will be 90%.
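A quick back-of-the-envelope check of that example (a sketch of the arithmetic in the paragraph above):
# 9 loans at +10% and one total loss: accuracy is 90%, yet the portfolio loses money
returns = np.array([0.10] * 9 + [-1.00])
print(returns.mean())  # -0.01, a 1% overall loss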
#
# For a model, evaluate it with the given "df" data and calculate the total profit/loss APY using
# the profit data.
#
# In the case of regression models, use "cutoff" to determine the investment criteria.
#
def evaluate_strategy(model, df, profit, cutoff=None, showprint=True):
    # see where our model says to invest
    inv = model.predict(df)
    if cutoff is not None:
        inv = inv > cutoff
    inv = inv.reshape(-1)
    # report and return the mean returns
    if showprint:
        print("Positive result on %d out of %d (%0.2f%%)" % (inv.sum(), inv.shape[0], inv.sum() / inv.shape[0] * 100.0))
        print("Mean return of strategy:", profit[inv].mean())
        print("Overall return:", profit.mean())
        print("Mean return of remaining loans:", profit[~inv].mean())
    return profit[inv].mean(), profit.mean()
def evaluate_score(estimator, X, y):
    # custom scorer with the sklearn (estimator, X, y) signature: here y holds the
    # per-loan returns, and the score is the mean return of the high-confidence picks
    positive = estimator.predict_proba(X)[:, 1] >= 0.9
    return y[positive].mean()
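A quick usage sketch, using the logistic model fitted above (the score is the mean return of its highest-confidence picks):
evaluate_score(model_logreg, X_test_scale, y_test)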
#
# Evaluate the strategy based on the probability estimates rather than the classification
# results.
#
def evaluate_proba(model, df, profit, cutoff):
    # invest only where the predicted probability of a good loan meets the cutoff
    positive = model.predict_proba(df)[:, 1] >= cutoff
    print("Percent positive investments %0.1f%%" % ((positive.sum() / profit.count()) * 100))
    print("Average return", profit[positive].mean())
evaluate_strategy(model_logreg, X_test_scale, y_test)
# sanity-check evaluate_strategy() by recomputing the strategy return by hand
x = model_logreg.predict(X_test_scale)
y_test[x]
ss = pd.DataFrame({'invest': x, 'ret': y_test.values})
ss[ss['invest'] == True].ret.mean()
ss
# repeat the check on a small 1,000-row subset
newy = y_train_cat[:1000]
newy1 = y_train[:1000]
ma = LogisticRegression().fit(X_train[:1000], newy)
newy1[ma.predict(X_train[:1000])]
# mean return of 100 randomly chosen loans that the model rejected
np.random.choice(y_test[~x], 100).mean()
print(accuracy_score(y_train_cat, model_logreg.predict(X_train_scale)))
# returns of the training loans the model selects
index = np.where(model_logreg.predict(X_train_scale) == True)[0]
print(y_train.iloc[index].mean())
y_train.iloc[index].nsmallest(100)
# fraction of training loans with |return| under 5%, and how many selected loans lose over 30%
print(np.sum(np.abs(y_train) < 0.05) / len(y_train))
print(np.sum(y_train.iloc[index] < -0.30))
print(len(y_train))
y_train.dtype
We will use this evaluate_strategy() function to determine how well a strategy works. In the example above, the strategy returned -0.09, or about a 9% loss, so it was not good. The overall return is the return from all loans and represents the "random" strategy; it returned -0.02, or a 2% loss, which is better than the model but still not good.
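As a sanity check on that "random" interpretation, one could draw a random portfolio the same size as the model's picks (a sketch, not part of the original analysis):
# mean return of a randomly selected portfolio of the same size
n_picks = int(model_logreg.predict(X_test_scale).sum())
random_picks = np.random.choice(len(y_test), size=n_picks, replace=False)
print("Random baseline return:", y_test.iloc[random_picks].mean())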
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(max_depth=10, max_leaf_nodes=10)
model_rf.fit(X_train_scale, y_train>0.01)
model_rf.score(X_train_scale, y_train>0.01)
model_rf.score(X_test_scale, y_test > 0.01)
evaluate_strategy(model_rf, X_test_scale, y_test)
y_test.iloc[model_rf.predict(X_test_scale)==False]
(model_rf.predict(X_test_scale)==False).sum()
For this case, we evaluated the strategy using probability estimates. The code below creates a Random Forest and then chooses loans where the confidence in the prediction is very high. This results in a positive return over the test data set. This is a promising result.
model_rf_p = RandomForestClassifier(max_depth=10, max_leaf_nodes=10)
model_rf_p.fit(X_train_scale, y_train > 0.0)
evaluate_proba(model_rf_p, X_test_scale, y_test, 0.9)
One thing to consider is how the choice of our "target" return affects the model's accuracy. In other words, is the model better at predicting higher-quality loans? The results below show that this is generally not the case. The graphs show that the best region of prediction is around a "target" return just above 0. AdaBoost shows a peak at the high end, which requires more investigation but is probably spurious. Other than that, the models are unable to predict which loans will yield the highest returns, yet that is the very question we want to answer.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
x = np.linspace(-0.50, 0.20, 40)
modelnames = ["RF", "AdaBoost", "QDA"]
models = [RandomForestClassifier(), AdaBoostClassifier(), QuadraticDiscriminantAnalysis()]
for name, model in zip(modelnames, models):
    y = []
    s = []
    for cutoff in x:
        # label loans as "good" relative to the current target return
        positive = y_train_sub > cutoff
        model.fit(X_train_sub_scale, positive)
        score = accuracy_score(positive, model.predict(X_train_sub_scale))
        s.append(score)
        # these are classification models, so no prediction cutoff is passed
        apy, mapy = evaluate_strategy(model, X_test_sub_scale, y_test_sub, showprint=False)
        y.append(apy - mapy)
    plt.plot(x, y)
    # plt.plot(x, s)
    plt.xlabel("Return cutoff")
    plt.ylabel("Return diff (negative is worse)")
    plt.title("Strategy return per cutoff for %s" % name)
    plt.show();
Another area to consider is regression models that predict the actual return; so far, we've worked with classification models that distinguish good loans from bad loans. We will use a subsample of the data to keep runtimes manageable.
from sklearn.ensemble import RandomForestRegressor
model_rf_reg = RandomForestRegressor()
model_rf_reg.fit(X_train_sub_scale, y_train_sub)
evaluate_strategy(model_rf_reg, X_test_scale, y_test, 0.01)
evaluate_strategy(model_rf_reg, X_test_scale, y_test, 0.10)
evaluate_strategy(model_rf_reg, X_test_scale, y_test, 0.20)
As shown below, the AdaBoost regressor is unable to make any accurate predictions.
from sklearn.ensemble import AdaBoostRegressor
model_adaboost = AdaBoostRegressor()
model_adaboost.fit(X_train_sub_scale, y_train_sub)
evaluate_strategy(model_adaboost, X_test_scale, y_test, 0.01)
The graph above suggested that AdaBoost might have good results for high returns. However, running it over the entire train/test set shows abysmal performance.
from sklearn.ensemble import AdaBoostClassifier
model_adaboost2 = AdaBoostClassifier()
model_adaboost2.fit(X_train_sub_scale, y_train_sub > 0.20)
evaluate_strategy(model_adaboost2, X_test_scale, y_test)
AdaBoost with a Random Forest as the base classifier takes a long time to run, so we try it on the small data set. The results are not promising.
from sklearn.ensemble import AdaBoostClassifier
model_adaboost3 = AdaBoostClassifier(
    learning_rate=1,
    n_estimators=20,
    base_estimator=RandomForestClassifier(max_depth=15, max_leaf_nodes=15))
model_adaboost3.fit(X_train_sub_scale, y_train_sub > 0.10)
evaluate_strategy(model_adaboost3, X_test_sub_scale, y_test_sub)
As with the Random Forest, we will also consider selecting loan investments based on the predicted probability; only the highest-probability results will be considered.
evaluate_proba(model_adaboost3, X_test_scale, y_test, 0.6)
So far, all the models have used the scaled data. Now we consider using polynomial features. For these tests, in order to keep runtimes reasonable, we will use a subset of the most promising features.
small_columns = ['loan_amnt','term_months','int_rate','credit_score',
'emp_length_years','annual_inc','dti','revol_util_perc']
X_train_small = X_train[small_columns]
X_test_small = X_test[small_columns]
from sklearn.preprocessing import PolynomialFeatures
scale.fit(X_train_small)
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(scale.transform(X_train_small))
# transform (not fit_transform) the test set with the already-fitted objects
X_test_poly = poly.transform(scale.transform(X_test_small))
y_train_target = y_train > 0.05
y_test_target = y_test > 0.05
from sklearn.linear_model import LogisticRegression
poly_logreg = LogisticRegression().fit(X_train_poly, y_train_target)
poly_logreg.score(X_train_poly, y_train_target)
poly_logreg.score(X_test_poly, y_test_target)
evaluate_strategy(poly_logreg, X_train_poly, y_train)
evaluate_strategy(poly_logreg, X_test_poly, y_test)
from sklearn.tree import DecisionTreeClassifier
decisionmodel = DecisionTreeClassifier(max_depth=3).fit(X_train_scale, y_train_cat)
evaluate_strategy(decisionmodel, X_test_scale, y_test)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_model = LinearDiscriminantAnalysis().fit(X_train_scale, y_train > 0.10)
evaluate_strategy(lda_model, X_test_scale, y_test)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train_scale, y_train > 0.10)
evaluate_strategy(qda_model, X_test_scale, y_test)
evaluate_proba(qda_model, X_test_scale, y_test, 0.9)
The neural network takes a long time to train, so we created a simple model to see if it is worth tweaking to improve performance.
y_nn = y_train > 0.05
from keras.models import Sequential
from keras.layers import Dense
model_nn = Sequential()
model_nn.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model_nn.add(Dense(64, activation='relu'))
model_nn.add(Dense(32, activation='relu'))
model_nn.add(Dense(8, activation='relu'))
model_nn.add(Dense(1, activation='sigmoid'))
model_nn.compile(loss='binary_crossentropy', optimizer='adam')
model_nn.summary()
model_nn.fit(X_train_scale, y_nn, epochs=10, batch_size=128, validation_split=0.2)
The val_loss does not seem to be moving much, so the model is not learning well.
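One cheap way to check whether more training would help (a sketch: model_nn2 is a hypothetical smaller variant, and EarlyStopping caps the extra epochs once val_loss stops improving):
from keras.callbacks import EarlyStopping
model_nn2 = Sequential()
model_nn2.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model_nn2.add(Dense(1, activation='sigmoid'))
model_nn2.compile(loss='binary_crossentropy', optimizer='adam')
stop = EarlyStopping(monitor='val_loss', patience=3)
model_nn2.fit(X_train_scale, y_nn, epochs=50, batch_size=128, validation_split=0.2, callbacks=[stop])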
evaluate_strategy(model_nn, X_test_scale, y_test, 0.5)
p = model_nn.predict_proba(X_test_scale)
p.min(), p.max()
Even though the model does not seem to work well, if we only consider the loans where the probability score is higher than 0.9, the return is 0.05, which is much higher than most of the alternative models.
positive = model_nn.predict_proba(X_test_scale) >= 0.9
print("Percent positive investments %0.1f%%" % ((positive.sum() / y_test.count()) * 100))
print("Average return", y_test[positive.reshape(-1)].mean())
Next, we considered stacking models. First, we create a simple stack with three base models: logistic regression, Random Forest, and the neural network.
modellist = ['Logistic', 'RF', "NN"]
modelobj = [model_logreg, model_rf, model_nn]
# create empty results that we will fill in
z = np.zeros((X_train.shape[0], len(modellist)))
stack_train = pd.DataFrame(data=z, columns=modellist)
z = np.zeros((X_test.shape[0], len(modellist)))
stack_test = pd.DataFrame(data=z, columns=modellist)
# make a dataset of each model's predicted probability of a good loan
for name, model in zip(modellist, modelobj):
    # sklearn's predict_proba returns (n, 2) and Keras returns (n, 1);
    # the last column is the positive-class probability in both cases
    stack_train[name] = model.predict_proba(X_train_scale)[:, -1]
    stack_test[name] = model.predict_proba(X_test_scale)[:, -1]
stack_train and stack_test have one row for each sample. The columns are the probability estimates of the models.
stack_train.head()
stack_train.describe()
stack_test.describe()
We will aggregate these with the "min" function; thus we are interested in the loans that all models agree are good. As shown below, that is 9% of all test loans, yielding a return of 0.02.
# positive = stack_test.mean(axis=1) > 0.5
# positive = (stack_test > 0.75).sum(axis=1) >= 3
positive = stack_test.min(axis=1) > 0.8
print("Percent positive investments %0.1f%%" % ((positive.sum() / y_test.count()) * 100))
print("Average return", y_test[positive.values].mean())
For the sake of completeness, we also run a KNN classifier, but the results are not promising.
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=2).fit(X_train_sub_scale, y_train_sub > 0.10)
evaluate_strategy(KNN, X_test_sub_scale, y_test_sub)
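Finally, a small sketch that puts the cross_val_score import at the top to use: the cross-validated accuracy of the same KNN configuration on the subsample (accuracy is sklearn's default scoring for classifiers):
scores = cross_val_score(KNeighborsClassifier(n_neighbors=2), X_train_sub_scale, y_train_sub > 0.10, cv=5)
print(scores.mean())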