Group 19: Fernando Trias, Roger Iliffe, HyounJun Park, Siyuan Yin
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default theme to the figures created below.
sns.set()
# Load the three pickled inputs used throughout the analysis.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
complete = pd.read_pickle("complete.bz2")
public_data_cat = pd.read_pickle("analysis-predictors.bz2")
public_data_y = pd.read_pickle("analysis-target.bz2")
# Bare expression: only has a visible effect as the last line of a notebook
# cell, where it displays the predictor column labels.
public_data_cat.columns
# Total payments received and total amount funded, summed per issue year.
sum1 = complete[["issue_date_year","total_pymnt", "funded_amnt"]].groupby("issue_date_year").sum()
fig, ax = plt.subplots(1,1, figsize=(12,8))
# One group of bars per issue year, one bar per summed column.
sum1.plot(kind="bar", ax=ax)
ax.set_title("Completed Loans: Repaid vs Total Funded", fontsize=14)
# Legend labels follow the column order selected above:
# total_pymnt -> "Repaid", funded_amnt -> "Total Funded".
ax.legend(["Repaid", "Total Funded"])
fig.savefig("Net_CashFlow_By_Year.png")
# Histogram of the target values, restricted to [-1, 1] and split into 100 bins.
fig, ax = plt.subplots(1,1, figsize=(10,10))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes", so the title/label set below and the figure that
# gets saved are guaranteed to be the ones holding the histogram.
public_data_y.hist(range=(-1,1), bins=100, ax=ax)
ax.set_title("APY histogram",fontsize=15)
ax.set_xlabel("APY",fontsize=12)
fig.savefig("APY_histogram.png")
# Build one frame holding the predictors, the target ("apy") and the year
# extracted from the issue date, then draw one APY box per issue year.
cleaned_complete = public_data_cat.copy()
cleaned_complete["apy"] = public_data_y
cleaned_complete["issue_date_year"] = cleaned_complete.issue_date.dt.year
fig, axes = plt.subplots(1, 1, figsize=(12, 10))
# x/y are passed by keyword (newer seaborn releases reject them positionally)
# and the target axes is given explicitly so the plot lands on this figure.
sns.boxplot(x="issue_date_year", y="apy", data=cleaned_complete, ax=axes)
axes.set_title("Distribution of APY by year", fontsize=24)
# Label the axes of THIS figure; the previous code used `ax`, which still
# pointed at the axes of an earlier plot, so these labels never reached the
# boxplot.
axes.set_xlabel("Issue year", fontsize=15)
axes.set_ylabel("APY", fontsize=15)
fig.savefig("Distribution_of_APY_by_year.png")
# Tick labels for the sub-grades A1..G5 (7 grades x 5 levels = 35 labels).
# They are generated instead of typed out: the hand-written list contained
# "B3" twice, which produced 36 labels and shifted every label after B3 by
# one bar.
Credit = [grade + str(level) for grade in "ABCDEFG" for level in range(1, 6)]
fig, axes = plt.subplots(1, 1, figsize=(12, 10))
# x/y are passed by keyword (newer seaborn releases reject them positionally)
# and the target axes is given explicitly so the bars land on this figure.
ax = sns.barplot(x="credit_score", y="apy", data=cleaned_complete, ax=axes)
ax.set_title("Profit Loss by Credit Score", fontsize=24)
# NOTE(review): this relabels the bars purely by position, so it assumes
# credit_score has exactly 35 distinct values ordered from A1 to G5 -- confirm
# against the data.
ax.set_xticklabels(Credit)
ax.set_xlabel("Credit Score", fontsize=15)
ax.set_ylabel("APY", fontsize=15)
fig.savefig("Profit_loss_by_creditscore.png")
# Total funded amount per state.
df = complete[["addr_state", "funded_amnt"]].groupby("addr_state").sum()
# Bare expressions such as this one (and the .head() calls below) only display
# something when they end a notebook cell.
df.index
# Tab-separated population table, indexed by its first column.
# NOTE(review): the join below only lines up if that first column holds the
# same state codes as addr_state -- confirm against pop.csv.
population = pd.read_csv("pop.csv", sep="\t", index_col=0)
population.head()
df.head()
# Join on the index so each state's funded total sits next to its population.
df_pop = df.join(population)
# Funded amount per capita for each state.
z_data = df_pop['funded_amnt'].astype(float) / df_pop.Population.astype(float)
z_data.head()
# Flag loans whose good_loan value is 0 as bad loans (1.0 / 0.0); this adds a
# column to `complete` in place.
complete["bad_loan"] = (complete.good_loan == 0).astype(float)
# `df` is rebound here: from now on it holds per-state sums of the good_loan
# and bad_loan columns rather than the funded totals.
df = complete[["addr_state", "good_loan", "bad_loan"]].groupby("addr_state").sum()
df["prop_bad"] = df.bad_loan / (df.good_loan + df.bad_loan)
# `z_data` is rebound as well; the per-capita series computed above is
# discarded at this point.
z_data = df.prop_bad
df.head()
import plotly
import plotly.plotly as py
import pandas as pd
# SECURITY: the plotly username and API key used to be hard-coded on this
# line. Secrets must not live in source control -- the previously committed
# key should be treated as leaked and regenerated. Credentials are now taken
# from the PLOTLY_USERNAME / PLOTLY_API_KEY environment variables.
import os

_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
# When the variables are not set nothing is written, and plotly falls back to
# whatever credentials file an earlier set_credentials_file call left behind.
def plot_by_state(state, z_data, title_text="States", colorbar_title="USD"):
    """Draw a choropleth map of the USA with one value per state.

    The figure is saved as a static image through ``py.image.save_as`` and
    then rendered with ``py.iplot``.

    Parameters
    ----------
    state
        Location codes, one per value; passed to plotly as ``locations``
        with ``locationmode='USA-states'``.
    z_data
        Values to colour the states by, aligned with ``state``.
    title_text
        Figure title. It is also used verbatim as the filename of the saved
        image.
    colorbar_title
        Caption of the colour bar. Defaults to ``"USD"``, the caption that
        used to be hard-coded.

    Returns
    -------
    Whatever ``py.iplot`` returns.
    """
    trace = dict(
        type='choropleth',
        # Use the locations supplied by the caller. The previous code ignored
        # the `state` argument and read the index of the global `df` instead,
        # which silently mismatched locations and values whenever the caller
        # passed data built from a different frame.
        locations=state,
        z=z_data,
        locationmode='USA-states',
        # White outlines between states.
        marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
        colorbar=dict(title=colorbar_title),
    )
    layout = dict(
        title=title_text,
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showlakes=True,
            lakecolor='rgb(255, 255, 255)',
        ),
    )
    fig = dict(data=[trace], layout=layout)
    py.image.save_as(fig, filename=title_text)
    return py.iplot(fig, filename='d3-cloropleth-map')
# Map the per-state proportion of bad loans. The title doubles as the name of
# the image file that plot_by_state saves. Note that this rebinds `fig`.
fig = plot_by_state(df.index, df.prop_bad, "Proportion of Bad Loans Per State.png")
# Recompute funded amount per capita: `z_data` was rebound to the bad-loan
# proportion earlier in the file.
z_data = df_pop['funded_amnt'].astype(float) / df_pop.Population.astype(float)
plot_by_state(df_pop.index, z_data, "Funded Loans per Capita by State.png")
def plot_state_mean(column, dfx, title):
    """Map the per-state mean of one column.

    ``dfx`` is grouped on ``addr_state``, ``column`` is averaged within each
    group, and the resulting states and means are handed to ``plot_by_state``
    together with ``title``. Returns whatever ``plot_by_state`` returns.
    """
    per_state = dfx[["addr_state", column]].groupby("addr_state").mean()
    states = per_state.index
    means = per_state[column]
    return plot_by_state(states, means, title)
def plot_state_sum(column, dfx, title):
    """Map the per-state total of one column.

    ``dfx`` is grouped on ``addr_state``, ``column`` is summed within each
    group, and the resulting states and totals are handed to ``plot_by_state``
    together with ``title``. Returns whatever ``plot_by_state`` returns.
    """
    per_state = dfx[["addr_state", column]].groupby("addr_state").sum()
    states = per_state.index
    totals = per_state[column]
    return plot_by_state(states, totals, title)
# Map the mean of the "apy" column per state; the title doubles as the name of
# the saved image.
plot_state_mean("apy", complete, "Mean APY by State.png")
# Load the spreadsheet and average every column within each "AB" group.
# NOTE(review): "AB" is presumably the state abbreviation -- confirm against
# the spreadsheet. Note that `data` and `group` are module-level names.
data = pd.read_excel('Percent White.xls')
group = data.groupby("AB").mean()
def plot_by_state(state, z_data, title_text="States", colorbar_title="Proportion squared"):
    """Draw a choropleth map of the USA with one value per state.

    NOTE: this redefines the ``plot_by_state`` declared earlier in the file;
    every call made after this point uses this version.

    The figure is saved as a static image through ``py.image.save_as`` and
    then rendered with ``py.iplot``.

    Parameters
    ----------
    state
        Location codes, one per value; passed to plotly as ``locations``
        with ``locationmode='USA-states'``.
    z_data
        Values to colour the states by, aligned with ``state``.
    title_text
        Figure title. It is also used verbatim as the filename of the saved
        image.
    colorbar_title
        Caption of the colour bar. Defaults to ``"Proportion squared"``, the
        caption that used to be hard-coded.

    Returns
    -------
    Whatever ``py.iplot`` returns.
    """
    trace = dict(
        type='choropleth',
        # Use the locations supplied by the caller. The previous code ignored
        # the `state` argument and read the index of the global `group`
        # instead, so the function only worked for that one frame.
        locations=state,
        z=z_data,
        locationmode='USA-states',
        # White outlines between states.
        marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
        colorbar=dict(title=colorbar_title),
    )
    layout = dict(
        title=title_text,
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showlakes=True,
            lakecolor='rgb(255, 255, 255)',
        ),
    )
    fig = dict(data=[trace], layout=layout)
    py.image.save_as(fig, filename=title_text)
    return py.iplot(fig, filename='d3-cloropleth-map')
# Map 100 minus the group mean of the "Value" column.
# NOTE(review): presumably Value is the percentage of white residents, making
# this the non-white percentage -- confirm against the spreadsheet.
plot_by_state(group.index, (100-group.Value), "Proportion of Non White By State.png")
# Pairwise scatter plots of APY and a handful of predictors, drawn from a
# random sample of loans to keep the figure readable.
# The sample size is capped at the number of available rows: DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement).
s = complete.sample(min(1000, len(complete)))[['apy','term_months','credit_score', 'emp_length_years'
, 'annual_inc', 'dti', 'revol_util_perc']]
a = pd.plotting.scatter_matrix(s, alpha=0.2, figsize=(15,15))
# Remove the tick marks from every panel of the grid. A plain loop is used
# because the calls are made purely for their side effect.
for panel in a.reshape(-1):
    panel.set_xticks(())
    panel.set_yticks(())
plt.savefig("Predictors_Correlations.png")