import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from pandas.plotting import scatter_matrix
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Lasso
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import r2_score
from statsmodels.api import OLS
import statsmodels.api as sm
%matplotlib inline
import seaborn as sns
# Apply seaborn's default theme to the figures drawn below.
sns.set()

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
stockdf = pd.read_pickle("035720.bz2")

# The target drops its last row and the features drop their first row, so
# each feature row is paired (by position) with the target of the row that
# sits directly above it in the table.
y_data = stockdf['등락률'].iloc[:-1]
x_data = stockdf.loc[:, stockdf.columns != '등락률'].iloc[1:]
x_data = x_data.drop(columns='날짜')

X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.25,
    random_state=42,
)

# Quick look at the data: correlations, the target series and its mean.
stockdf.corr()
plt.plot(y_data)
y_data.mean()
# Fit the standardiser on the training split only and apply it to both
# splits; the results are wrapped back into DataFrames with the original
# column names (and a fresh 0..n-1 index).
scale = StandardScaler().fit(X_train)
X_train_scale = pd.DataFrame(scale.transform(X_train), columns=list(X_train.columns))
X_test_scale = pd.DataFrame(scale.transform(X_test), columns=list(X_test.columns))

# Copies of the scaled features with a constant column, used by the
# statsmodels OLS fit.
X_train_scale_add = sm.add_constant(X_train_scale)
X_test_scale_add = sm.add_constant(X_test_scale)

# Targets as (n, 1) column vectors.
y_train_re = y_train.to_numpy().reshape(-1, 1)
y_test_re = y_test.to_numpy().reshape(-1, 1)
# statsmodels OLS on the scaled features plus constant column.
OLSmodel = OLS(y_train_re, X_train_scale_add).fit()
OLSmodel.summary()

# Predict once on the test split and reuse the result for both reports.
yhat = OLSmodel.predict(X_test_scale_add)
print("rscore {0}".format(r2_score(y_test, yhat)))
print("mean of profit : {0}".format(yhat.mean()))
# Plain linear regression on the scaled features.
linearmodel = LinearRegression()
linearmodel.fit(X_train_scale, y_train)
print("rscore {0}".format(linearmodel.score(X_test_scale, y_test)))
print("mean of profit : {0}".format(linearmodel.predict(X_test_scale).mean()))
# Coefficient per feature name.
dict(zip(X_train_scale.columns, linearmodel.coef_.ravel()))
# RidgeCV on the scaled features.
Ridge = RidgeCV()
Ridge.fit(X_train_scale, y_train)
print("rscore {0}".format(Ridge.score(X_test_scale, y_test)))
print("mean of profit : {0}".format(Ridge.predict(X_test_scale).mean()))
# Coefficient per feature name.
dict(zip(X_train_scale.columns, Ridge.coef_.ravel()))
# LassoCV on the scaled features.
Lassomodel = LassoCV()
Lassomodel.fit(X_train_scale, y_train)
print("rscore {0}".format(Lassomodel.score(X_test_scale, y_test)))
print("mean of profit : {0}".format(Lassomodel.predict(X_test_scale).mean()))
# Random-forest regressor with default settings (no fixed random seed).
RandomRegression = RandomForestRegressor()
RandomRegression.fit(X_train_scale, y_train)
print("rscore {0}".format(RandomRegression.score(X_test_scale, y_test)))
print("mean of profit : {0}".format(RandomRegression.predict(X_test_scale).mean()))
# AdaBoost regressor with default settings (no fixed random seed).
Adaregressor = AdaBoostRegressor().fit(X_train_scale, y_train)
# Report the AdaBoost model itself; the previous version printed the
# LinearRegression score and mean here by mistake.
print("rscore {0}".format(Adaregressor.score(X_test_scale, y_test)))
print("mean of profit : {0}".format(Adaregressor.predict(X_test_scale).mean()))
def evaluate_strategy(model, df, profit, cutoff=None, showprint=True):
    """Compare the mean return of the rows a model selects with the overall mean.

    Args:
        model: Fitted estimator exposing ``predict``.
        df: Feature matrix handed to ``model.predict``.
        profit: Per-row returns, positionally aligned with ``df``; must
            support boolean-mask indexing and ``.mean()``.
        cutoff: Optional threshold. When given, a row is selected if its
            prediction is strictly greater than ``cutoff``; otherwise the
            predictions themselves are read as truthy/falsy flags.
        showprint: When true, print a short report.

    Returns:
        Tuple ``(mean return of the selected rows, mean return of all rows)``.
        The first element is NaN when no row is selected.
    """
    # See where the model says to invest.
    inv = np.asarray(model.predict(df))
    if cutoff is not None:
        inv = inv > cutoff
    # Flatten (n, 1) outputs and force a boolean mask, so that 0/1 integer
    # predictions select rows instead of being interpreted as index labels.
    inv = inv.reshape(-1).astype(bool)

    n_selected = int(inv.sum())
    n_total = inv.shape[0]
    # Guard the percentage against an empty prediction array.
    pct_selected = n_selected / n_total * 100.0 if n_total else 0.0

    strategy_mean = profit[inv].mean()
    overall_mean = profit.mean()

    if showprint:
        print("Positive result on %d out of %d (%0.2f%%)" % (n_selected, n_total, pct_selected))
        print("Mean return of strategy:", strategy_mean)
        print("Overall return:", overall_mean)
    return strategy_mean, overall_mean
# Binary labels: True when the return is strictly positive.
y_train_cat = y_train.gt(0)
y_test_cat = y_test.gt(0)

# Logistic regression (LogisticRegressionCV) on the scaled features.
Logist = LogisticRegressionCV()
Logist.fit(X_train_scale, y_train_cat)
print(accuracy_score(y_test_cat, Logist.predict(X_test_scale)))
evaluate_strategy(Logist, X_test_scale, y_test)
# Single decision tree with default settings.
Decision = DecisionTreeClassifier()
Decision.fit(X_train_scale, y_train_cat)
print(accuracy_score(y_test_cat, Decision.predict(X_test_scale)))
evaluate_strategy(Decision, X_test_scale, y_test)

# Random forest with default settings.
RandomForest = RandomForestClassifier()
RandomForest.fit(X_train_scale, y_train_cat)
print(accuracy_score(y_test_cat, RandomForest.predict(X_test_scale)))
evaluate_strategy(RandomForest, X_test_scale, y_test)

# AdaBoost with 300 estimators and a learning rate of 0.05.
AdaCL = AdaBoostClassifier(n_estimators=300, learning_rate=0.05)
AdaCL.fit(X_train_scale, y_train_cat)
print(accuracy_score(y_test_cat, AdaCL.predict(X_test_scale)))
evaluate_strategy(AdaCL, X_test_scale, y_test)
# Degree-4 polynomial expansion of the scaled features.
poly = PolynomialFeatures(4)
X_train_poly = poly.fit_transform(X_train_scale)
# Only transform the test split: the expansion is fitted on the training
# data and must not be re-fitted on held-out data.
X_test_poly = poly.transform(X_test_scale)

# Re-standardise the expanded features with statistics from the training
# split only.
newscaler = StandardScaler().fit(X_train_poly)
X_train_poly = newscaler.transform(X_train_poly)
X_test_poly = newscaler.transform(X_test_poly)
# Logistic regression on the polynomial features.
polylog = LogisticRegressionCV()
polylog.fit(X_train_poly, y_train_cat)
print(accuracy_score(y_test_cat, polylog.predict(X_test_poly)))
evaluate_strategy(polylog, X_test_poly, y_test)

# Ridge regression on the polynomial features.
polyridge = RidgeCV()
polyridge.fit(X_train_poly, y_train)
print("rscore {0}".format(polyridge.score(X_test_poly, y_test)))
print("mean of profit : {0}".format(polyridge.predict(X_test_poly).mean()))
# Support vector classifier with default settings.
SVM = SVC()
SVM.fit(X_train_scale, y_train_cat)
print(accuracy_score(y_test_cat, SVM.predict(X_test_scale)))
evaluate_strategy(SVM, X_test_scale, y_test)

# Linear discriminant analysis.
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train_scale, y_train_cat)
print(accuracy_score(y_test_cat, LDA.predict(X_test_scale)))
evaluate_strategy(LDA, X_test_scale, y_test)

# Quadratic discriminant analysis.
QDA = QuadraticDiscriminantAnalysis()
QDA.fit(X_train_scale, y_train_cat)
print(accuracy_score(y_test_cat, QDA.predict(X_test_scale)))
evaluate_strategy(QDA, X_test_scale, y_test)

# k-nearest neighbours with k = 5.
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(X_train_scale, y_train_cat)
print(accuracy_score(y_test_cat, KNN.predict(X_test_scale)))
evaluate_strategy(KNN, X_test_scale, y_test)
# Fully connected regression network on the polynomial features: eight ReLU
# hidden layers of decreasing width and a single linear output unit.
model_nn = Sequential()
model_nn.add(Dense(715, input_dim=X_train_poly.shape[1], activation='relu'))
model_nn.add(Dense(500, activation='relu'))
model_nn.add(Dense(300, activation='relu'))
model_nn.add(Dense(150, activation='relu'))
model_nn.add(Dense(100, activation='relu'))
model_nn.add(Dense(64, activation='relu'))
model_nn.add(Dense(32, activation='relu'))
model_nn.add(Dense(8, activation='relu'))
model_nn.add(Dense(1, activation='linear'))
model_nn.compile(loss='mse', optimizer='adam')
model_nn.summary()

# Train on the continuous returns (y_train), not on the boolean up/down
# labels: the network has a linear output with MSE loss and is scored below
# with R^2 against the continuous y_test.
model_nn.fit(X_train_poly, y_train.values, epochs=50, batch_size=715, validation_split=0.2)
r2_score(y_test, model_nn.predict(X_test_poly))
# Display names and fitted base classifiers (parallel lists) that feed the
# stacked meta-model.
list_model = ["Singletree", 'RF', 'Ada', 'Logit', 'QDA', 'LDA', 'KNN']
model_dict = [Decision, RandomForest, AdaCL, Logist, QDA, LDA, KNN]

# Human-readable, 1-based row labels for the two meta-feature tables.
tune_index_name = ['train row {0}'.format(row) for row in range(1, len(X_train_scale) + 1)]
ensemble_test_index_name = ['test row {0}'.format(row) for row in range(1, len(X_test_scale) + 1)]

# Start from empty frames that carry only those row labels.
ensemble_tune = pd.DataFrame(index=tune_index_name)
ensemble_test = pd.DataFrame(index=ensemble_test_index_name)

# One column per base model, holding the second column of predict_proba.
for label, base_model in zip(list_model, model_dict):
    column = "{0}'s model prediction".format(label)
    ensemble_tune[column] = base_model.predict_proba(X_train_scale)[:, 1]
    ensemble_test[column] = base_model.predict_proba(X_test_scale)[:, 1]

# Meta-model: logistic regression on the base models' probabilities.
metamodel = LogisticRegressionCV()
metamodel.fit(ensemble_tune, y_train_cat)
print(accuracy_score(y_test_cat, metamodel.predict(ensemble_test)))
evaluate_strategy(metamodel, ensemble_test, y_test)
# Features of the ten top rows of the table, row 0 included. x_data was built
# with its first row removed, so slicing x_data[0:10] would return rows 1..10
# and shift every prediction by one row relative to the dates used as the
# index below.
X_tommorow = pd.DataFrame(
    data=scale.transform(stockdf[X_train.columns.tolist()].iloc[0:10]),
    columns=X_train.columns.tolist(),
)

# Reuse the already fitted polynomial expansion and its scaler instead of
# fitting a new expansion on these ten rows.
X_tommorow_poly = poly.transform(X_tommorow)
X_tommorow_poly = newscaler.transform(X_tommorow_poly)
X_tommorow_poly

# Meta-features for the same ten rows: one probability column per base model
# (second column of predict_proba), indexed by date.
today_meta = pd.DataFrame(index=stockdf.날짜[0:10])
for label, base_model in zip(list_model, model_dict):
    today_meta["{0}'s model prediction".format(label)] = base_model.predict_proba(X_tommorow)[:, 1]
metamodel.predict(today_meta)
# Names and fitted objects of the models that predict a continuous return.
Regression = [
    'linearmodel', 'Lassomodel', 'Ridge', 'polyridge',
    'RandomRegression', 'Adaregressor', 'model_nn',
]
Regression_model = [
    linearmodel, Lassomodel, Ridge, polyridge,
    RandomRegression, Adaregressor, model_nn,
]

# Names of the models that predict the up/down label.
Logistic = [
    'Logist', 'polylog', 'Decision', 'RandomForest', 'AdaCL',
    'SVM', 'LDA', 'QDA', 'KNN', 'metamodel',
]
# NOTE(review): shorter than `Logistic` (no polylog, SVM or metamodel);
# confirm whether that is intentional.
Logistic_model = [Logist, Decision, RandomForest, AdaCL, LDA, QDA, KNN]

# Models that expect the polynomial feature matrix as input.
Polynomial = [polyridge, model_nn, polylog]

# Every model: regressors first, then classifiers (names and objects are
# parallel lists).
all_model = Regression + Logistic
all_dict = Regression_model + [
    Logist, polylog, Decision, RandomForest, AdaCL,
    SVM, LDA, QDA, KNN, metamodel,
]
# Row labels: one per date among the first ten rows of the table.
index = ["{0}'s next day".format(stockdf.날짜[row]) for row in range(10)]

# One column per model, filled with that model's predictions.
result = pd.DataFrame(index=index, columns=all_model)
for name, model in zip(all_model, all_dict):
    # Pick the input matrix each model was trained on.
    if model in Polynomial:
        features = X_tommorow_poly
    elif model == metamodel:
        features = today_meta
    else:
        features = X_tommorow
    result[name] = model.predict(features)

# Row-wise summaries: average predicted return of the regressors and the
# share of classifiers that predict True.
result["mean"] = result[Regression].mean(axis=1)
result["percentage of true"] = result[Logistic].mean(axis=1)
# First row gets a placeholder 0; the rest are the first nine target values.
result["true"] = [0] + list(y_data[0:9])
result
# Values are percentages relative to today's stock price.
# True  = tomorrow is predicted to have a positive return.
# False = tomorrow is predicted to have a negative return.
# The 'true' column holds the actual result.
# The most recent 'true' value defaults to zero because tomorrow's actual data is not available yet.