Group 19: Fernando Trias, Roger Iliffe, HyounJun Park, Siyuan Yin
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
Before loading this data, you must create the "bz2" files by running the Data_cleansing notebook.
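For reference, a sketch of how Data_cleansing presumably writes these files (pandas infers bz2 compression from the file extension; the frame names here are hypothetical):
# hypothetical final step of the Data_cleansing notebook
cleaned_predictors.to_pickle("analysis-predictors.bz2")
cleaned_target.to_pickle("analysis-target.bz2")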
public_data_cat = pd.read_pickle("analysis-predictors.bz2")
public_data_y = pd.read_pickle("analysis-target.bz2")
public_data_cat = public_data_cat.drop('issue_date',axis=1)
public_data_y.mean()
We created two design matrices: one for training and testing on the full data set, and another for training and testing on a random sample so that our experiments could run faster.
X_sub, y_sub = resample(public_data_cat, public_data_y, n_samples=10000)
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
X_sub, y_sub, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
public_data_cat, public_data_y, test_size=0.25, random_state=42)
We scaled the predictors with StandardScaler.
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit(X_train)
X_train_scale = pd.DataFrame(data=scale.transform(X_train), columns=X_train.columns.tolist())
X_test_scale = pd.DataFrame(data=scale.transform(X_test), columns=X_test.columns.tolist())
# scale the subsampled train and test data with the same fitted scaler
X_train_sub_scale = pd.DataFrame(data=scale.transform(X_train_sub), columns=X_train.columns.tolist())
X_test_sub_scale = pd.DataFrame(data=scale.transform(X_test_sub), columns=X_test.columns.tolist())
y_train.describe()
X_test_scale.describe()
y_test.describe()
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train_scale, y_train)
model.score(X_train_scale, y_train)
model.score(X_test_scale, y_test)
model.coef_
#
# Draw a bar chart showing each coefficient's magnitude and direction
#
def graph_coef(model, columns):
    # pair each predictor with its coefficient, sorted by magnitude
    cf = dict(zip(columns, model.coef_.flatten()))
    data = sorted(cf.items(), key=lambda x: abs(x[1]))
    data_order, data_values = zip(*data)
    data_order = np.array(data_order)
    data_values = np.array(data_values)
    index = np.arange(len(data_order))
    fig, ax = plt.subplots(1, 1, figsize=(10, 30))
    # red for negative coefficients, blue for positive
    colors = np.array([(1, 0, 0)] * len(data_values), dtype=float)
    colors[data_values >= 0] = (0, 0, 1)
    ax.barh(index, abs(data_values), color=colors, alpha=0.8)
    # set the tick positions before the labels so they line up
    ax.set_yticks(index)
    ax.set_yticklabels(data_order)
    plt.tick_params(axis='both', which='major', labelsize=20)
    ax.set_title("Magnitude of coefficient", fontsize=20)
    ax.set_ylabel("predictors")
    ax.set_xlabel("magnitude");
Drawing a graph of each coefficient's magnitude and direction helps us understand which factors the model considers significant. Red bars indicate negative coefficients; blue bars indicate positive ones.
graph_coef(model, X_test_scale.columns)
from sklearn.linear_model import LassoCV
LassoCVobject = LassoCV()
LassoCVobject.fit(X_train_scale, y_train)
LassoCVobject.score(X_train_scale, y_train)
LassoCVobject.score(X_test_scale, y_test)
np.mean(LassoCVobject.predict(X_test_scale))
graph_coef(LassoCVobject, X_test_scale.columns)
For simplicity, we want to predict all the loans that satisfy a minimum return.
y_train_cat = y_train > 0.05  # minimum-return threshold; this value is our choice
y_test_cat = y_test > 0.05
X_train_scale
from sklearn.linear_model import LogisticRegression
model_logreg = LogisticRegression().fit(X_train_scale, y_train_cat)
model_logreg.score(X_train_scale, y_train_cat)
model_logreg.score(X_test_scale, y_test_cat)
graph_coef(model_logreg, X_test_scale.columns)
The accuracy score of the model is not sufficient to determine its quality. This is because a loan's loss can be up to 100%, whereas the gains are usually limited by the interest rate to 10-20%. Thus, a few bad loans can completely wipe out the profits from good loans. For example, if we choose 10 loans and 9 of them make 10% but the last one is a total loss, the overall performance will be negative. However, the accuracy will be 90%.
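A quick back-of-the-envelope check of that example (a sketch of the arithmetic in the paragraph above):
# 9 loans at +10% and one total loss: accuracy is 90%, yet the portfolio loses money
returns = np.array([0.10] * 9 + [-1.00])
print(returns.mean())  # -0.01, a 1% overall loss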
#
# For a model, evaluate it with the given "df" data and calculate the total profit/loss APY using
# the profit data.
#
# In the case of regression models, use "cutoff" to determine the investment criteria.
#
def evaluate_strategy(model, df, profit, cutoff=None, showprint=True):
    # see where our model says to invest
    inv = model.predict(df)
    if cutoff is not None:
        inv = inv > cutoff
    inv = inv.reshape(-1)
    # report and return the mean returns
    if showprint:
        print("Positive result on %d out of %d (%0.2f%%)" % (inv.sum(), inv.shape[0], inv.sum() / inv.shape[0] * 100.0))
        print("Mean return of strategy:", profit[inv].mean())
        print("Overall return:", profit.mean())
        print("Mean return of remaining loans:", profit[~inv].mean())
    return profit[inv].mean(), profit.mean()
def evaluate_score(estimator, X, y):
    # custom scorer with the sklearn (estimator, X, y) signature: here y holds the
    # per-loan returns, and the score is the mean return of the high-confidence picks
    positive = estimator.predict_proba(X)[:, 1] >= 0.9
    return y[positive].mean()
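A quick usage sketch, using the logistic model fitted above (the score is the mean return of its highest-confidence picks):
evaluate_score(model_logreg, X_test_scale, y_test)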
#
# Evaluate the strategy based on the probability estimates rather than the classification
# results.
#
def evaluate_proba(model, df, profit, cutoff):
    # invest only where the predicted probability of a good loan meets the cutoff
    positive = model.predict_proba(df)[:, 1] >= cutoff
    print("Percent positive investments %0.1f%%" % ((positive.sum() / profit.count()) * 100))
    print("Average return", profit[positive].mean())
evaluate_strategy(model_logreg, X_test_scale, y_test)
# sanity-check evaluate_strategy() by recomputing the strategy return by hand
x = model_logreg.predict(X_test_scale)
y_test[x]
ss = pd.DataFrame({'invest': x, 'ret': y_test.values})
ss[ss['invest'] == True].ret.mean()
ss
# repeat the check on a small 1,000-row subset
newy = y_train_cat[:1000]
newy1 = y_train[:1000]
ma = LogisticRegression().fit(X_train[:1000], newy)
newy1[ma.predict(X_train[:1000])]
# mean return of 100 randomly chosen loans that the model rejected
np.random.choice(y_test[~x], 100).mean()
print(accuracy_score(y_train_cat, model_logreg.predict(X_train_scale)))
# returns of the training loans the model selects
index = np.where(model_logreg.predict(X_train_scale) == True)[0]
print(y_train.iloc[index].mean())
y_train.iloc[index].nsmallest(100)
# fraction of training loans with |return| under 5%, and how many selected loans lose over 30%
print(np.sum(np.abs(y_train) < 0.05) / len(y_train))
print(np.sum(y_train.iloc[index] < -0.30))
print(len(y_train))
y_train.dtype
We will use this evaluate_strategy() function to determine how well a strategy works. In the example above, the strategy returned -0.09, or about a 9% loss, so it was not good. The overall return is the return from all loans and represents the "random" strategy; it returned -0.02, or a 2% loss, which is better than the model but still not good.
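As a sanity check on that "random" interpretation, one could draw a random portfolio the same size as the model's picks (a sketch, not part of the original analysis):
# mean return of a randomly selected portfolio of the same size
n_picks = int(model_logreg.predict(X_test_scale).sum())
random_picks = np.random.choice(len(y_test), size=n_picks, replace=False)
print("Random baseline return:", y_test.iloc[random_picks].mean())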
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(max_depth=10, max_leaf_nodes=10)
model_rf.fit(X_train_scale, y_train>0.01)
model_rf.score(X_train_scale, y_train>0.01)
model_rf.score(X_test_scale, y_test > 0.01)
evaluate_strategy(model_rf, X_test_scale, y_test)
y_test.iloc[model_rf.predict(X_test_scale)==False]
(model_rf.predict(X_test_scale)==False).sum()
For this case, we evaluated the strategy using probability estimates. The code below creates a Random Forest and then chooses loans where the confidence in the prediction is very high. This results in a positive return over the test data set. This is a promising result.
model_rf_p = RandomForestClassifier(max_depth=10, max_leaf_nodes=10)
model_rf_p.fit(X_train_scale, y_train > 0.0)
evaluate_proba(model_rf_p, X_test_scale, y_test, 0.9)
One thing to consider is how the choice of our "target" return affects the model's accuracy. In other words, is the model better at predicting higher-quality loans? The results below show that this is generally not the case. The graphs show that the best region of prediction is around a "target" return just above 0. AdaBoost shows a peak at the high end, which requires more investigation but is probably spurious. Other than that, the models are unable to predict which loans will yield the highest returns, yet that is the very question we want to answer.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
x = np.linspace(-0.50, 0.20, 40)
modelnames = ["RF", "AdaBoost", "QDA"]
models = [RandomForestClassifier(), AdaBoostClassifier(), QuadraticDiscriminantAnalysis()]
for name, model in zip(modelnames, models):
    y = []
    s = []
    for cutoff in x:
        # label loans as "good" relative to the current target return
        positive = y_train_sub > cutoff
        model.fit(X_train_sub_scale, positive)
        score = accuracy_score(positive, model.predict(X_train_sub_scale))
        s.append(score)
        # these are classification models, so no prediction cutoff is passed
        apy, mapy = evaluate_strategy(model, X_test_sub_scale, y_test_sub, showprint=False)
        y.append(apy - mapy)
    plt.plot(x, y)
    # plt.plot(x, s)
    plt.xlabel("Return cutoff")
    plt.ylabel("Return diff (negative is worse)")
    plt.title("Strategy return per cutoff for %s" % name)
    plt.show();
Another area to consider is regression models that predict the actual return; so far, we've worked with classification models that distinguish good loans from bad loans. We will use a subsample of the data to keep runtimes manageable.
from sklearn.ensemble import RandomForestRegressor
model_rf_reg = RandomForestRegressor()
model_rf_reg.fit(X_train_sub_scale, y_train_sub)
evaluate_strategy(model_rf_reg, X_test_scale, y_test, 0.01)
evaluate_strategy(model_rf_reg, X_test_scale, y_test, 0.10)
evaluate_strategy(model_rf_reg, X_test_scale, y_test, 0.20)
As shown below, the AdaBoost regressor is unable to make any accurate predictions.
from sklearn.ensemble import AdaBoostRegressor
model_adaboost = AdaBoostRegressor()
model_adaboost.fit(X_train_sub_scale, y_train_sub)
evaluate_strategy(model_adaboost, X_test_scale, y_test, 0.01)
The graph above suggested that AdaBoost might have good results for high returns. However, running it over the entire train/test set shows abysmal performance.
from sklearn.ensemble import AdaBoostClassifier
model_adaboost2 = AdaBoostClassifier()
model_adaboost2.fit(X_train_sub_scale, y_train_sub > 0.20)
evaluate_strategy(model_adaboost2, X_test_scale, y_test)
AdaBoost with a Random Forest as the base classifier takes a long time to run, so we try it on the small data set. The results are not promising.
from sklearn.ensemble import AdaBoostClassifier
model_adaboost3 = AdaBoostClassifier(
    learning_rate=1,
    n_estimators=20,
    base_estimator=RandomForestClassifier(max_depth=15, max_leaf_nodes=15))
model_adaboost3.fit(X_train_sub_scale, y_train_sub > 0.10)
evaluate_strategy(model_adaboost3, X_test_sub_scale, y_test_sub)
As with the Random Forest, we will also consider selecting loan investments based on the predicted probability; only the highest-probability results will be considered.
evaluate_proba(model_adaboost3, X_test_scale, y_test, 0.6)
So far, all the models have used the scaled data. Now we consider using polynomial features. For these tests, in order to keep runtimes reasonable, we will use a subset of the most promising features.
small_columns = ['loan_amnt','term_months','int_rate','credit_score',
'emp_length_years','annual_inc','dti','revol_util_perc']
X_train_small = X_train[small_columns]
X_test_small = X_test[small_columns]
from sklearn.preprocessing import PolynomialFeatures
scale.fit(X_train_small)
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(scale.transform(X_train_small))
# transform (not fit_transform) the test set with the already-fitted objects
X_test_poly = poly.transform(scale.transform(X_test_small))
y_train_target = y_train > 0.05
y_test_target = y_test > 0.05
from sklearn.linear_model import LogisticRegression
poly_logreg = LogisticRegression().fit(X_train_poly, y_train_target)
poly_logreg.score(X_train_poly, y_train_target)
poly_logreg.score(X_test_poly, y_test_target)
evaluate_strategy(poly_logreg, X_train_poly, y_train)
evaluate_strategy(poly_logreg, X_test_poly, y_test)
from sklearn.tree import DecisionTreeClassifier
decisionmodel = DecisionTreeClassifier(max_depth=3).fit(X_train_scale, y_train_cat)
evaluate_strategy(decisionmodel, X_test_scale, y_test)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_model = LinearDiscriminantAnalysis().fit(X_train_scale, y_train > 0.10)
evaluate_strategy(lda_model, X_test_scale, y_test)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train_scale, y_train > 0.10)
evaluate_strategy(qda_model, X_test_scale, y_test)
evaluate_proba(qda_model, X_test_scale, y_test, 0.9)
The neural network takes a long time to train, so we created a simple model to see if it is worth tweaking to improve performance.
y_nn = y_train > 0.05
from keras.models import Sequential
from keras.layers import Dense
model_nn = Sequential()
model_nn.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model_nn.add(Dense(64, activation='relu'))
model_nn.add(Dense(32, activation='relu'))
model_nn.add(Dense(8, activation='relu'))
model_nn.add(Dense(1, activation='sigmoid'))
model_nn.compile(loss='binary_crossentropy', optimizer='adam')
model_nn.summary()
model_nn.fit(X_train_scale, y_nn, epochs=10, batch_size=128, validation_split=0.2)
The val_loss does not seem to be moving much, so the model is not learning well.
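One cheap way to check whether more training would help (a sketch: model_nn2 is a hypothetical smaller variant, and EarlyStopping caps the extra epochs once val_loss stops improving):
from keras.callbacks import EarlyStopping
model_nn2 = Sequential()
model_nn2.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model_nn2.add(Dense(1, activation='sigmoid'))
model_nn2.compile(loss='binary_crossentropy', optimizer='adam')
stop = EarlyStopping(monitor='val_loss', patience=3)
model_nn2.fit(X_train_scale, y_nn, epochs=50, batch_size=128, validation_split=0.2, callbacks=[stop])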
evaluate_strategy(model_nn, X_test_scale, y_test, 0.5)
p = model_nn.predict_proba(X_test_scale)
p.min(), p.max()
Even though the model does not seem to work well, if we only consider the loans where the probability score is higher than 0.9, the return is 0.05, which is much higher than most of the alternative models.
positive = model_nn.predict_proba(X_test_scale) >= 0.9
print("Percent positive investments %0.1f%%" % ((positive.sum() / y_test.count()) * 100))
print("Average return", y_test[positive.reshape(-1)].mean())
Next, we considered stacking models. First, we create a simple stack with three base models: logistic regression, Random Forest, and the neural network.
modellist = ['Logistic', 'RF', "NN"]
modelobj = [model_logreg, model_rf, model_nn]
# create empty results that we will fill in
z = np.zeros((X_train.shape[0], len(modellist)))
stack_train = pd.DataFrame(data=z, columns=modellist)
z = np.zeros((X_test.shape[0], len(modellist)))
stack_test = pd.DataFrame(data=z, columns=modellist)
# make a dataset of each model's predicted probability of a good loan
for name, model in zip(modellist, modelobj):
    # sklearn's predict_proba returns (n, 2) and Keras returns (n, 1);
    # the last column is the positive-class probability in both cases
    stack_train[name] = model.predict_proba(X_train_scale)[:, -1]
    stack_test[name] = model.predict_proba(X_test_scale)[:, -1]
stack_train and stack_test have one row for each sample. The columns are the probability estimates of the models.
stack_train.head()
stack_train.describe()
stack_test.describe()
We will aggregate these with the "min" function; thus we are interested in the loans that all models agree are good. As shown below, that is 9% of all test loans, yielding a return of 0.02.
# positive = stack_test.mean(axis=1) > 0.5
# positive = (stack_test > 0.75).sum(axis=1) >= 3
positive = stack_test.min(axis=1) > 0.8
print("Percent positive investments %0.1f%%" % ((positive.sum() / y_test.count()) * 100))
print("Average return", y_test[positive.values].mean())
For the sake of completeness, we also run a KNN classifier, but the results are not promising.
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=2).fit(X_train_sub_scale, y_train_sub > 0.10)
evaluate_strategy(KNN, X_test_sub_scale, y_test_sub)
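Finally, a small sketch that puts the cross_val_score import at the top to use: the cross-validated accuracy of the same KNN configuration on the subsample (accuracy is sklearn's default scoring for classifiers):
scores = cross_val_score(KNeighborsClassifier(n_neighbors=2), X_train_sub_scale, y_train_sub > 0.10, cv=5)
print(scores.mean())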