This notebook applies several machine learning models to a house sale price data set. It covers data loading and cleaning, feature engineering, feature scaling and selection, model training with cross-validation, and learning and validation curves.
Once the data was prepared, two questions were addressed via machine learning: (1) can a house's sale price be predicted from its listing features (regression), and (2) can we predict whether a house will take more than a set number of days on the market to sell, here 120 days (classification)?
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import fbeta_score, make_scorer, mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import linear_model
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
warnings.filterwarnings('ignore')
%matplotlib inline
df = pd.read_csv("../../input/housing2.csv", parse_dates=True, sep=',')
print("Number of columns in dataframe {}".format(len(df.columns)))
# variables
np.set_printoptions(precision=3)
k_features = 7 # used for feature selection
k = 10 # used for k-fold validation
cv_iter = 5 # number of iterations for cv
[train_percent, test_percent] = [.7, .3]
classification_day_threshold = 120
#missing data
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent Missing'])
missing_data.head(10)
# remove features that have too much missing data
del df['Rented']
del df['Showing Service']
del df['Easements']
del df['Common Land Acres']
del df['Water Frontage Length']
del df['Water Body Type']
del df['Development / Subdivision']
del df['Assessment Year']
del df['Current Use']
del df['Assessment Amount']
del df['Short Sale']
del df['Garage Type']
del df['Basement Access Type']
del df['Foreclosed/Bank-Owned/REO']
# remove because data is irrelevant or duplicate
del df['MLS #']
del df['Status']
del df['Price - Closed'] # same as feature 'Price'
del df['Price - List']
del df[' Price - Original ']
del df[' Tax - Gross Amount '] # the sale price determines taxes, not the other way around
del df['Seasonal'] # whether or not a property is seasonal (summer home) should not change the outlook of a buyer
# remove because encoding these high-cardinality features would make the feature set too large
del df['Listing Office - Office Name']
del df['List Agent - Agent Name']
# remove due to redundancy
del df['Address']
del df['State']
del df['County']
# Force types for certain fields
df['Price'] = df['Price'].str.replace(',', '')
df['Price'] = df['Price'].str.replace('$', '', regex=False) # literal '$'; in regex mode '$' is an end-of-string anchor and would not be stripped
df['Price'] = df['Price'].astype(float)
df['SqFtTotFn'] = df['SqFtTotFn'].str.replace(',', '')
df['SqFtTotFn'] = df['SqFtTotFn'].astype(float)
# Enrich garage and garage capacity features
# 1. If garage == no, then add 0 to garage capacity, i.e. make sure garage capacity = 0
df['Garage Capacity'].replace('', np.nan, inplace=True)
df['Garage'].replace('', np.nan, inplace=True)
df.loc[df['Garage'] == 'No', 'Garage Capacity'] = 0
del df['Garage']
# Change Basement from Yes/No to 1/0
df.Basement = pd.Series(np.where(df.Basement.values == 'Yes', 1, 0), df.index)
# Total Stories contains '4+' entries, which we map to 4
stories = lambda x: '4' if x == '4+' else x
df['Total Stories'] = df['Total Stories'].apply(stories)
df['Total Stories'] = df['Total Stories'].astype(float)
# Create the days-on-market metric: date of closing minus date of MLS listing
from datetime import date
closed = df.columns.get_loc("Date - Closed")
listed = df.columns.get_loc("Date - MLS List")
df['DaysMkt'] = (pd.to_datetime(df[df.columns[closed]]) -
pd.to_datetime(df[df.columns[listed]])).astype('timedelta64[D]')
df['xdayplus'] = df['DaysMkt'] > classification_day_threshold
df = df.drop(['Date - Closed', 'Date - MLS List'], axis=1)
# Inspect missing values before dropping rows (uncomment to check)
#print(df.isnull().sum().sum())
#print(len(df))
#df.isnull().sum()
# Drop every sample (row) that still has a missing value in any feature
df.dropna(how='any', inplace = True)
print(df.isnull().sum().sum())
print(len(df))
df.isnull().sum()
print("Number of columns in dataframe {}".format(len(df.columns)))
# One hot encode some features.
df = pd.concat([df,
pd.get_dummies(df[['Property Type']]),
pd.get_dummies(df[['Flood Zone']]),
pd.get_dummies(df[['Surveyed']]),
pd.get_dummies(df[['Covenants']]),
pd.get_dummies(df[['City']])], axis=1)
df.drop(['City','Property Type','Flood Zone','Surveyed','Covenants'], axis=1, inplace=True)
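# A minimal sketch (on a toy frame, so it does not alter df): pd.get_dummies with the
# `columns=` argument performs the encode-and-drop done above in a single call.
toy = pd.DataFrame({'City': ['Burlington', 'Stowe'], 'Price': [1.0, 2.0]}) # hypothetical values for illustration only
print(pd.get_dummies(toy, columns=['City']))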
# Set up dataframes for regression and classification inputs and labels. The two dataframes are not the same.
df.dropna(axis=1, how='any', inplace=True) # drop any column that still has missing values (inplace so the call actually takes effect)
# For classification
df_class = df.copy(deep=True)
labels_class = df_class['xdayplus']
del df_class['DaysMkt'] # 100% correlated with the xdayplus label (DaysMkt defines it)
del df_class['xdayplus']
del df_class['DOM'] # highly correlated with xdayplus
print(df_class.columns)
# For regression
labels = df.Price
del df['Price']
del df['xdayplus']
print(df.columns)
print("Number of columns in dataframe {}".format(len(df.columns)))
print("Number of columns in dataframe {}".format(len(df_class.columns)))
# Perform the pre-train, train, validation and test split
# For Classification
xc, xc_pre, labelsc, yc_pre = train_test_split(df_class, labels_class, test_size=500, shuffle=True)
xc_train, xc_test, yc_train, yc_test = train_test_split(xc, labelsc,
train_size=train_percent, test_size=test_percent, shuffle=True)
print("classf'n: sample size of pre={0}, train={1} and test={2}.".format(len(xc_pre), len(xc_train), len(xc_test)))
print("classf'n: label size of pre={0}, train={1} and test={2}.".format(len(yc_pre), len(yc_train), len(yc_test)))
# For regression
x, x_pre, labels, y_pre = train_test_split(df, labels, test_size=500, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(x, labels,
train_size=train_percent, test_size=test_percent, shuffle=True)
print("reg: sample size of pre={0}, train={1} and test={2}.".format(len(x_pre), len(x_train), len(x_test)))
print("reg: label size of pre={0}, train={1} and test={2}.".format(len(y_pre), len(y_train), len(y_test)))
# Need to scale features
# Must retain indices
from sklearn.preprocessing import StandardScaler, MinMaxScaler
reg_columns_to_scale = ['Bedrooms - Total', 'Baths - Total', 'SqFtTotFn', 'DOM', 'PicCount',
'Lot - Acres', 'Year Built', 'Total Stories', 'Rooms - Total',
'Garage Capacity', 'Basement', 'DaysMkt']
class_columns_to_scale = ['Price', 'Bedrooms - Total', 'Baths - Total', 'SqFtTotFn', 'PicCount',
'Lot - Acres', 'Year Built', 'Total Stories', 'Rooms - Total',
'Garage Capacity', 'Basement']
# For Regression
reg_std = StandardScaler().fit(x_train[reg_columns_to_scale])
reg_train_df = pd.DataFrame(
reg_std.transform(x_train[reg_columns_to_scale]), columns=reg_columns_to_scale, index=x_train.index)
x_train_std = x_train.copy(deep=True)
x_train_std[reg_columns_to_scale] = reg_train_df[reg_columns_to_scale]
print(x_train[reg_columns_to_scale].head(5)) # see the original data before standardization
print(x_train_std[reg_columns_to_scale].head(5)) # make sure that the df is correctly updated
#print(x_test[reg_columns_to_scale].head(5)) # see the original test data before standardization
reg_test_std = pd.DataFrame(
reg_std.transform(x_test[reg_columns_to_scale]), columns=reg_columns_to_scale, index=x_test.index)
x_test_std = x_test.copy(deep=True)
x_test_std[reg_columns_to_scale] = reg_test_std[reg_columns_to_scale]
# For Classification
reg_c_std = StandardScaler().fit(xc_train[class_columns_to_scale])
reg_c_train_df = pd.DataFrame(
reg_c_std.transform(xc_train[class_columns_to_scale]), columns=class_columns_to_scale, index=xc_train.index)
xc_train_std = xc_train.copy(deep=True)
xc_train_std[class_columns_to_scale] = reg_c_train_df[class_columns_to_scale]
print(xc_train[class_columns_to_scale].head(5)) # see the original data before standardization
print(xc_train_std[class_columns_to_scale].head(5)) # make sure that the df is correctly updated
#print(xc_test[class_columns_to_scale].head(5)) # see the original test data before standardization
reg_c_test_df = pd.DataFrame(
reg_c_std.transform(xc_test[class_columns_to_scale]), columns=class_columns_to_scale, index=xc_test.index)
xc_test_std = xc_test.copy(deep=True)
xc_test_std[class_columns_to_scale] = reg_c_test_df[class_columns_to_scale]
# Now we have two sets, x_train and x_train_std, and can create more like x_train_max
# Will set up x_train_scaled and copy to it whatever scaling we want
x_train_scaled = x_train_std
x_test_scaled = x_test_std
xc_train_scaled = xc_train_std
xc_test_scaled = xc_test_std
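# A hedged alternative sketch (assumes scikit-learn >= 0.20, which provides ColumnTransformer; it is
# not used below): scale only the numeric columns and pass the one-hot columns through unchanged.
# Note the output is a plain array, so the DataFrame indices preserved by the code above are lost.
from sklearn.compose import ColumnTransformer
ct_scaler = ColumnTransformer([('num', StandardScaler(), reg_columns_to_scale)], remainder='passthrough')
x_train_ct = ct_scaler.fit_transform(x_train) # fit the scaler on the training split only
x_test_ct = ct_scaler.transform(x_test) # reuse the training statistics on the test split
print(x_train_ct.shape, x_test_ct.shape)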
#Feature Selection
# return last k elements in the df column list since feature scoring functions order from least to most important
def top_k_in_list(df, ordering, k):
length = len(df.columns.tolist())
return np.array([df.columns.tolist()[x] for x in ordering])[length-k:length]
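# A tiny illustrative check of top_k_in_list on toy inputs (names are hypothetical): with columns
# ['a', 'b', 'c'] and scores [0.1, 5.0, 2.0], np.argsort orders the feature indices from least to
# most important, so asking for the top 2 returns ['c' 'b'] (most important last).
toy_df = pd.DataFrame(columns=['a', 'b', 'c'])
toy_scores = np.array([0.1, 5.0, 2.0])
print(top_k_in_list(toy_df, np.argsort(toy_scores), 2)) # expected: ['c' 'b']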
#K-Best (more stable than ExtraTreeRegressor)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
kbest = SelectKBest(score_func=f_regression, k=k_features)
clf = kbest.fit(x_train_scaled, y_train)
indices = np.argsort(clf.scores_) # indices of the features, ordered from least to most important
print(top_k_in_list(x_train_scaled, indices, k_features))
#features = fit.transform(x_train_scaled)
plt.figure(figsize=(20,10))
plt.title("K-Best important features")
plt.barh(range(x_train_scaled.shape[1]), clf.scores_[indices],
color="r", align="center")
# If you want to define your own labels,
# change indices to a list of labels on the following line.
plt.yticks(range(x_train.shape[1]), [x_train_scaled.columns.tolist()[x] for x in indices])
plt.ylim([-1, x_train.shape[1]])
plt.show()
# Select features for regression
print("Regression: These are the top {0} feature indices {1}".format(k_features,
indices[len(indices)-k_features:len(indices)]))
x_train_new = x_train_scaled[top_k_in_list(x_train_scaled, indices, k_features)]
x_test_new = x_test_scaled[top_k_in_list(x_test_scaled, indices, k_features)]
# Select features for classification
clf = kbest.fit(xc_train_scaled, yc_train)
indices = np.argsort(clf.scores_) # indices of the features, ordered from least to most important
print(top_k_in_list(xc_train_scaled, indices, k_features))
print("Classsification: These are the top {0} feature indices {1}".format(k_features,
indices[len(indices)-k_features:len(indices)]))
xc_train_new = xc_train_scaled[top_k_in_list(xc_train_scaled, indices, k_features)]
xc_test_new = xc_test_scaled[top_k_in_list(xc_test_scaled, indices, k_features)]
x = np.arange(len(y_test)) # set the x-axis for plotting
def mpe_scorer(ground_truth, predictions):
mpe = ((predictions - ground_truth)/ground_truth)
return mpe.mean()
my_mpe_scorer = make_scorer(mpe_scorer, greater_is_better=False)
def error_scorer(ground_truth, predictions):
denom = len(ground_truth)
return (((ground_truth != predictions).sum())/denom)
my_error_scorer = make_scorer(error_scorer, greater_is_better=False)
def rmse_error (labels, predictions):
return np.sqrt(mean_squared_error(labels, predictions))
def rmse2_error (labels, predictions):
return np.sqrt(mean_squared_error(labels, predictions))
# Note: with greater_is_better=False, make_scorer flips the sign of the score
# (see https://github.com/scikit-learn/scikit-learn/issues/2439), hence the two RMSE wrappers below
rmse = make_scorer(rmse_error, greater_is_better=False)
rmse2 = make_scorer(rmse2_error, greater_is_better=True)
mse = make_scorer(mean_squared_error, greater_is_better=False) # error metrics: lower is better, so flip the sign
mae = make_scorer(mean_absolute_error, greater_is_better=False)
accuracy = make_scorer(accuracy_score, greater_is_better=True)
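# A quick sanity check of the custom metrics on toy numbers (illustration only): with ground truth
# [100, 200] and predictions [110, 180], MPE = ((10/100) + (-20/200)) / 2 = 0.0 and
# RMSE = sqrt((10^2 + 20^2) / 2) ~= 15.81.
toy_true = np.array([100.0, 200.0])
toy_pred = np.array([110.0, 180.0])
print(mpe_scorer(toy_true, toy_pred)) # expect 0.0
print(rmse_error(toy_true, toy_pred)) # expect ~15.81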
#from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), scoring='r2', y_label=''):
"""
Generate a simple plot of the test and training learning curve.
Parameters
----------
estimator : object type that implements the "fit" and "predict" methods
An object of that type which is cloned for each validation.
title : string
Title for the chart.
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples) or (n_samples, n_features), optional
Target relative to X for classification or regression;
None for unsupervised learning.
ylim : tuple, shape (ymin, ymax), optional
Defines minimum and maximum yvalues plotted.
cv : int, cross-validation generator or an iterable, optional
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- An object to be used as a cross-validation generator.
- An iterable yielding train/test splits.
For integer/None inputs, if ``y`` is binary or multiclass,
:class:`StratifiedKFold` used. If the estimator is not a classifier
or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
Refer :ref:`User Guide <cross_validation>` for the various
cross-validators that can be used here.
n_jobs : integer, optional
Number of jobs to run in parallel (default 1).
"""
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score ({})".format(y_label))
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation")
plt.legend(loc="best")
return plt
def plot_validation_curves (train_scores, test_scores, param='alpha', param_range=np.logspace(-3, 3, 13), scoring=rmse2):
# train_scores, test_scores = validation_curve(linear_model.Lasso(), train_scores, test_scores,
# param_name=param, param_range=param_range,
# cv=10, scoring=scoring, n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve")
plt.xlabel("$\{}$".format(param))
plt.ylabel("Score")
plt.ylim(1.5e5, 3.5e5)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2,
color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.2,
color="navy", lw=lw)
plt.legend(loc="best")
plt.show()
# Learning Curves for Linear Regression
# Make sure that the learning curve code has the best parameters for the models in question
t0_r_lr_learning_curve = t0 = time.time()
#from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
title = "Learning Curves Linear Regression"
# Cross-validation with k ShuffleSplit iterations to get smoother mean test and train
# score curves, each time with 20% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
estimator = linear_model.LinearRegression()
plot_learning_curve(estimator, title, x_pre, y_pre, ylim=(0, 5e5),
cv=cv, n_jobs=4, y_label='RMSE', scoring=rmse2)
plt.figure(figsize=(10,10))
plt.show()
t1_r_lr_learning_curve = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
# This histogram shows why a low threshold such as <30 days would yield such high accuracy:
# roughly 80% of the sales take more than 30 days.
#x_pre.DOM.describe()
plt.title('Number of days it took to sell house')
plt.xlabel('Days')
plt.ylabel('Occurrences (mean = {})'.format(x_pre.DOM.mean()))
binwidth=30
plt.axvline(x_pre.DOM.mean(), color='b', linestyle='dashed', linewidth=2)
plt.hist(x_pre.DOM, bins=np.arange(min(x_pre.DOM), max(x_pre.DOM) + binwidth, binwidth))
from sklearn.linear_model import LogisticRegression
t0_c_log_learning_curve = t0 = time.time()
title = "Learning Curves Logistic Regression"
# Cross-validation with k ShuffleSplit iterations to get smoother mean test and train
# score curves, each time with 30% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=k, test_size=0.3, random_state=0)
estimator = LogisticRegression(penalty='l1', C=10)
plot_learning_curve(estimator, title, xc_pre, yc_pre, ylim=(.5, .9), cv=cv, n_jobs=1,
scoring='accuracy', y_label='Accuracy')
t1_c_log_learning_curve = t1 = time.time()
scores = cross_val_score(estimator, xc_pre, yc_pre, cv=k)
#plt.axhline(y=scores.mean(), linewidth=1, color='black')
#print("Long term cross validation score = {}".format(scores.mean()))
print("Execution time: {0}".format(t1-t0))
cv_iter = 10 # number of iterations for cv
k = 10 # used for k-fold validation
t0_r_lr = t0 = time.time()
# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels] = [x_train_scaled, y_train, x_test_scaled, y_test]
# No need for parameter search
# Cross Validation to report the training error
model = linear_model.LinearRegression(fit_intercept=False)
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
training_results_mpe = []
training_results_rmse = []
test_results = []
test_results_mpe = []
test_results_rmse = []
a = []
for i in range(cv_iter): # run the full cv_iter iterations reported below
for train_indices, test_indices in k_fold.split(train_samples):
try:
model.fit(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices])
training_results.append(model.score(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices]))
training_results_mpe.append(mpe_scorer(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
training_results_rmse.append(rmse_error(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
test_results.append(model.score(train_samples.iloc[test_indices,:], train_labels.iloc[test_indices]))
test_results_mpe.append(mpe_scorer(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
test_results_rmse.append(rmse_error(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
except Exception as e:
print("Problem in cv. {0}".format(e))
a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
#print("CV Train set accuracy (R^2) = {0:.3f}".format(np.mean(training_results)))
print("CV Train set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(training_results_mpe)*100))
print("CV Train set accuracy (RMSE) = ${0:.2f}".format(np.mean(training_results_rmse)))
print("CV Train set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(training_results_rmse)))
#print("CV Validation set accuracy (R^2) = {0:.3f}".format(np.mean(test_results)))
print("CV Validation set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(test_results_mpe)*100))
print("CV Validation set accuracy (RMSE) = ${0:.2f}".format(np.mean(test_results_rmse)))
print("CV Validation set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(test_results_rmse)))
print(model.coef_)
lr_coef_full = model.coef_
# Report the test error
model.fit(test_samples, test_labels) # re-fit using all of the test data
print('Number of coefficients full feature set: \n', len(model.coef_))
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
linear_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
lr_test_rmse_full = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(lr_test_rmse_full))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))
# Using the reduced feature set, predict the output for graphing
print("")
print("Results using the reduced feature set:")
[train_samples, test_samples] = [x_train_new, x_test_new] # feature selected sets = training and test samples
# May want to run GridSearchCV again with reduced set
model.fit(train_samples, train_labels)
linear_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
print('Number of coefficients reduced feature set: \n', len(model.coef_))
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
lr_test_rmse_lean = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(lr_test_rmse_lean))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))
temp = test_labels.sort_values()
plt.figure(figsize=(20,10))
plt.plot(x, temp, label='actual')
plt.plot(x, linear_predicted_full_df.sort_values(by=['Price']), 'r', label='predicted (full features)')
plt.plot(x, linear_predicted_lean_df.sort_values(by=['Price']), 'g', label='predicted (reduced features)')
plt.xlabel('sample')
plt.ylabel('Sale price')
plt.legend(loc='best')
print(model.coef_)
lr_coef_lean = model.coef_
t1_r_lr = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
t0_r_l = t0 = time.time()
# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels] = [x_train_scaled, y_train, x_test_scaled, y_test]
# Parameter search
tuned_parameters = [{'alpha': np.logspace(-3, 3, 20)}]
n_folds = k
clf = GridSearchCV(estimator=linear_model.Lasso(),
param_grid=tuned_parameters, cv=n_folds, refit=False, scoring=rmse)
clf.fit(x_train, y_train)
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")
lasso_best_params=clf.best_params_
print(lasso_best_params)
# Cross Validation to report the training error
model = linear_model.Lasso(alpha = clf.best_params_['alpha'])
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
training_results_mpe = []
training_results_rmse = []
test_results = []
test_results_mpe = []
test_results_rmse = []
a = []
for i in range(cv_iter): # run the full cv_iter iterations reported below
for train_indices, test_indices in k_fold.split(train_samples):
try:
model.fit(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices])
training_results.append(model.score(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices]))
training_results_mpe.append(mpe_scorer(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
training_results_rmse.append(rmse_error(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
test_results.append(model.score(train_samples.iloc[test_indices,:], train_labels.iloc[test_indices]))
test_results_mpe.append(mpe_scorer(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
test_results_rmse.append(rmse_error(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
except Exception as e:
print("Problem in cv. {0}".format(e))
a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
#print("CV Train set accuracy (R^2) = {0:.3f}".format(np.mean(training_results)))
print("CV Train set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(training_results_mpe)*100))
print("CV Train set accuracy (RMSE) = ${0:.2f}".format(np.mean(training_results_rmse)))
print("CV Train set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(training_results_rmse)))
#print("CV Validation set accuracy (R^2) = {0:.3f}".format(np.mean(test_results)))
print("CV Validation set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(test_results_mpe)*100))
print("CV Validation set accuracy (RMSE) = ${0:.2f}".format(np.mean(test_results_rmse)))
print("CV Validation set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(test_results_rmse)))
print(model.coef_)
l_coef_full = model.coef_
# Report the test error
model.fit(test_samples, test_labels) # re-fit using all of the test data
print('Number of coefficients full feature set: \n', len(model.coef_))
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
lasso_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
l_test_rmse_full = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(l_test_rmse_full))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))
# Using the reduced feature set, predict the output for graphing
print("")
print("Results using the reduced feature set:")
[train_samples, test_samples] = [x_train_new, x_test_new] # feature selected sets = training and test samples
# May want to run GridSearchCV again with reduced set
model.fit(train_samples, train_labels)
lasso_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
print('Number of coefficients reduced feature set: \n', len(model.coef_))
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
l_test_rmse_lean = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(l_test_rmse_lean))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))
temp = test_labels.sort_values()
plt.figure(figsize=(20,10))
plt.plot(x, temp, label='actual')
plt.plot(x, lasso_predicted_full_df.sort_values(by=['Price']), 'r', label='predicted (full features)')
plt.plot(x, lasso_predicted_lean_df.sort_values(by=['Price']), 'g', label='predicted (reduced features)')
plt.xlabel('sample')
plt.ylabel('Sale price')
plt.legend(loc='best')
print(model.coef_)
l_coef_lean = model.coef_
t1_r_l = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
from sklearn.model_selection import validation_curve
[train_scores, valid_scores] = validation_curve(
linear_model.Lasso(),
x_train,
y_train,
param_name="alpha",
param_range=np.logspace(-3, 5, 20),
scoring=rmse2)
plot_validation_curves(
train_scores,
valid_scores,
param='alpha',
param_range=np.logspace(-3, 5, 20),
scoring=rmse2)
t0_r_rf = t0 = time.time()
from sklearn.ensemble import RandomForestRegressor
# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels] = [x_train_scaled, y_train, x_test_scaled, y_test]
#k = cv_iter = 2
# Parameter search
# careful about np.linspace since in some cases you want whole numbers
tuned_parameters = [{'criterion': ['mae', 'mse'],
'max_depth': np.arange(1, 25, 2),
'n_estimators': np.arange(1, 11, 2)}]
n_folds = k
clf = GridSearchCV(estimator=RandomForestRegressor(),
param_grid=tuned_parameters, cv=n_folds, refit=False, scoring=mse)
clf.fit(x_train, y_train)
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")
rfr_best_params=clf.best_params_
print(rfr_best_params)
# Cross Validation to report the training error
model = RandomForestRegressor(criterion=clf.best_params_['criterion'],
max_depth=clf.best_params_['max_depth'],
n_estimators=clf.best_params_['n_estimators'])
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
training_results_mpe = []
training_results_rmse = []
test_results = []
test_results_mpe = []
test_results_rmse = []
a = []
for i in range(cv_iter): # run the full cv_iter iterations reported below
for train_indices, test_indices in k_fold.split(train_samples):
try:
model.fit(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices])
training_results.append(model.score(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices]))
training_results_mpe.append(mpe_scorer(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
training_results_rmse.append(rmse_error(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
test_results.append(model.score(train_samples.iloc[test_indices,:], train_labels.iloc[test_indices]))
test_results_mpe.append(mpe_scorer(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
test_results_rmse.append(rmse_error(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
except Exception as e:
print("Problem in cv. {0}".format(e))
a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
#print("CV Train set accuracy (R^2) = {0:.3f}".format(np.mean(training_results)))
print("CV Train set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(training_results_mpe)*100))
print("CV Train set accuracy (RMSE) = ${0:.2f}".format(np.mean(training_results_rmse)))
print("CV Train set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(training_results_rmse)))
#print("CV Validation set accuracy (R^2) = {0:.3f}".format(np.mean(test_results)))
print("CV Validation set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(test_results_mpe)*100))
print("CV Validation set accuracy (RMSE) = ${0:.2f}".format(np.mean(test_results_rmse)))
print("CV Validation set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(test_results_rmse)))
# Report the test error
model.fit(test_samples, test_labels) # re-fit using all of the test data
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
rf_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
rf_r_test_rmse_full = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(rf_r_test_rmse_full))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))
# Using the reduced feature set, predict the output for graphing
print("")
print("Results using the reduced feature set:")
[train_samples, test_samples] = [x_train_new, x_test_new] # feature selected sets = training and test samples
model.fit(train_samples, train_labels)
rf_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
rf_r_test_rmse_lean = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(rf_r_test_rmse_lean))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))
temp = test_labels.sort_values()
plt.figure(figsize=(20,10))
plt.plot(x, temp, label='actual')
plt.plot(x, rf_predicted_full_df.sort_values(by=['Price']), 'r', label='predicted (full features)')
plt.plot(x, rf_predicted_lean_df.sort_values(by=['Price']), 'g', label='predicted (reduced features)')
plt.legend(loc='best')
t1_r_rf = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
# If you want, use a lower polynomial degree or fewer selected features for the polynomial regression below
#K-Best
from sklearn.feature_selection import f_regression
#k_poly = k_features
k_poly = 5 # use a separate name so the global k (number of CV folds) is not overwritten
kbest = SelectKBest(score_func=f_regression, k=k_poly)
clf = kbest.fit(x_train_scaled, y_train)
indices = np.argsort(clf.scores_) # indices of the features, ordered from least to most important
print(top_k_in_list(x_train_scaled, indices, k_poly))
# Select features for the polynomial regression
print("Regression: These are the top {0} feature indices {1}".format(k_poly, indices[len(indices)-k_poly:len(indices)]))
xp_train_new = x_train_scaled[top_k_in_list(x_train_scaled, indices, k_poly)]
xp_test_new = x_test_scaled[top_k_in_list(x_test_scaled, indices, k_poly)]
print(xp_train_new.shape)
# Polynomial regression of 2nd degree
# Use the scaled feature set, otherwise the polynomial terms become very large
# setup
from sklearn.preprocessing import PolynomialFeatures
[xp_train_new, xp_test_new] = [x_train_new, x_test_new] # note: this reuses the earlier k_features selection, overriding the k_poly selection above
# Try polynomial regression
print("Polynomial with all interactions (x_i^2 included)")
linear = linear_model.LinearRegression()
poly = PolynomialFeatures(degree=2, interaction_only=False)
x_poly_train = poly.fit_transform(xp_train_new)
linear.fit(x_poly_train, y_train)
print(poly.get_feature_names())
print('Number of coefficients: \n', len(linear.coef_))
print('Coefficients: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
print("Train set accuracy (R^2) = {0:.3f}".format(linear.score(x_poly_train, y_train)))
scores = mpe_scorer(y_train, linear.predict(x_poly_train))
print("Train set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
print('Train RMSE ${0:.2f}'.format(np.sqrt(mean_squared_error(y_train, linear.predict(x_poly_train)))))
#Predict Output
predict_ = poly.transform(xp_test_new) # transform only; the expansion was already fitted on the training data
poly_predicted_lean_df = pd.DataFrame(data=linear.predict(predict_), columns=['Price'], index=xp_test_new.index)
print("Test set accuracy (R^2) = {0:.3f}".format(linear.score(predict_, y_test)))
scores = mpe_scorer(y_test, linear.predict(predict_))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
print('Test RMSE ${0:.2f}'.format(np.sqrt(mean_squared_error(y_test, linear.predict(predict_)))))
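# A hedged alternative sketch (not part of the results above): a Pipeline keeps the polynomial
# expansion and the linear model together, so the expansion is fitted on the training split only
# and the test split is merely transformed.
from sklearn.pipeline import make_pipeline
poly_pipeline = make_pipeline(PolynomialFeatures(degree=2, interaction_only=False),
                              linear_model.LinearRegression())
poly_pipeline.fit(xp_train_new, y_train)
print('Pipeline test RMSE ${0:.2f}'.format(np.sqrt(mean_squared_error(y_test, poly_pipeline.predict(xp_test_new)))))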
np.set_printoptions(precision=3)
print("Full feature results:")
reg_full_df = pd.concat([
y_test,
pd.DataFrame(linear_predicted_full_df),
pd.DataFrame(lasso_predicted_full_df),
pd.DataFrame(rf_predicted_full_df)],
axis=1)
reg_full_df.columns = ['test','linear','lasso','random_forest']
reg_full_df['lin_mpe'] = ((reg_full_df.linear - reg_full_df.test)/reg_full_df.test)*100
reg_full_df['lasso_mpe'] = ((reg_full_df.lasso - reg_full_df.test)/reg_full_df.test)*100
reg_full_df['random_forest_mpe'] = ((reg_full_df.random_forest - reg_full_df.test)/reg_full_df.test)*100
print("{}".format(reg_full_df.mean()))
print("")
print("Selected features result:")
reg_lean_df = pd.concat([
y_test,
pd.DataFrame(poly_predicted_lean_df),
pd.DataFrame(linear_predicted_lean_df),
pd.DataFrame(lasso_predicted_lean_df),
pd.DataFrame(rf_predicted_lean_df)],
axis=1)
reg_lean_df.columns = ['test','poly','linear','lasso','rf']
reg_lean_df['lin_mpe'] = ((reg_lean_df.linear - reg_lean_df.test)/reg_lean_df.test)*100
reg_lean_df['poly_mpe'] = ((reg_lean_df.poly - reg_lean_df.test)/reg_lean_df.test)*100
reg_lean_df['lasso_mpe'] = ((reg_lean_df.lasso - reg_lean_df.test)/reg_lean_df.test)*100
reg_lean_df['rf_mpe'] = ((reg_lean_df.rf - reg_lean_df.test)/reg_lean_df.test)*100
print("{}".format(reg_lean_df.mean()))
# Make sure that the learning curve code has the best parameters for the models in question
t0_r_lr_learning_curve = t0 = time.time()
#from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
title = "Learning Curves Linear Regression"
# Cross-validation with k ShuffleSplit iterations to get smoother mean test and train
# score curves, each time with 20% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
estimator = linear_model.LinearRegression()
plot_learning_curve(estimator, title, x_train[0:1200], y_train[0:1200], ylim=(0, 5e5),
cv=cv, n_jobs=4, y_label='RMSE', scoring=rmse2)
plt.figure(figsize=(10,10))
plt.show()
t1_r_lr_learning_curve = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
print(lasso_best_params)
# Make sure that the learning curve code has the best parameters for the models in question
t0_r_l_learning_curve = t0 = time.time()
title = "Learning Curves Lasso Regression"
# Cross-validation with k ShuffleSplit iterations to get smoother mean test and train
# score curves, each time with 20% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
estimator = linear_model.Lasso(alpha=1000)
plot_learning_curve(estimator, title, x_train[0:1500], y_train[0:1500], ylim=(0, 5e5),
cv=cv, n_jobs=4, y_label='RMSE', scoring=rmse2)
plt.figure(figsize=(10,10))
t1_r_l_learning_curve = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
print(rfr_best_params)
# Make sure that the learning curve code has the best parameters for the models in question
from sklearn.ensemble import RandomForestRegressor
t0_r_rf_learning_curve = t0 = time.time()
title = "Learning Curves Random Forest"
# Cross-validation with k ShuffleSplit iterations to get smoother mean test and train
# score curves, each time with 20% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
estimator = RandomForestRegressor(criterion='mse', max_depth=30, n_estimators=50)
plot_learning_curve(estimator, title, x_train[0:3000], y_train[0:3000], ylim=(0, 5e5),
cv=cv, n_jobs=4, y_label='RMSE', scoring=rmse2)
scores = cross_val_score(estimator, x_train, y_train, cv=10, scoring=rmse2)
plt.axhline(y=-1*scores.mean(), linewidth=1, color='black')
print("Long term cross validation score = {}".format(scores.mean()))
plt.figure(figsize=(10,10))
t1_r_rf_learning_curve = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
from sklearn.kernel_ridge import KernelRidge
clf = KernelRidge(alpha=1.0)
clf.fit(x_train, y_train)
print(clf.score(x_train, y_train))
# Logistic regression
t0_c_log = t0 = time.time()
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels] = [xc_train_scaled, yc_train, xc_test_scaled, yc_test]
# Parameter search
tuned_parameters = [{'C': np.logspace(-5, 3, 29), 'penalty': ['l1', 'l2']}]
n_folds = k
clf = GridSearchCV(estimator=linear_model.LogisticRegression(),
param_grid=tuned_parameters, cv=n_folds, refit=False)
clf.fit(xc_train, yc_train)
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")
log_c_best_params=clf.best_params_
print(clf.best_params_)
# Cross Validation to report the training error
model = linear_model.LogisticRegression(C=clf.best_params_['C'],
random_state=111,
penalty=clf.best_params_['penalty'])
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
test_results = []
a = []
for i in range(cv_iter): # run the full cv_iter iterations reported below
for train_indices, test_indices in k_fold.split(train_samples):
try:
model.fit(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices])
training_results.append(accuracy_score(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
test_results.append(accuracy_score(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
except Exception as e:
print("Problem in cv. {0}".format(e))
a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
print("CV Train set accuracy (% Correct) = {0:.4f}".format(np.mean(training_results)))
print("CV Validation set accuracy (% Correct) = {0:.4f}".format(np.mean(test_results)))
# Report the test error
model.fit(test_samples, test_labels) # re-fit using all of the test data
scores = accuracy_score(test_labels, model.predict(test_samples))
print("Test set accuracy (% Correct) = {0:.4f}".format(scores))
log_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)
# Using the reduced feature set, predict the output for graphing
print("")
print("Results using the reduced feature set:")
[train_samples, test_samples] = [xc_train_new, xc_test_new] # feature selected sets = training and test samples
model.fit(train_samples, train_labels)
log_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)
scores = accuracy_score(test_labels, model.predict(test_samples))
print("Test set accuracy (% Correct) = {0:.4f}".format(scores))
t1_c_log = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
t0_c_ada = t0 = time.time()
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels] = [xc_train, yc_train, xc_test, yc_test]
# Parameter search
tuned_parameters = [{'n_estimators': np.arange(5, 12, 1), 'learning_rate' : np.arange(.5, 5, .5)}]
n_folds = k
clf = GridSearchCV(estimator=AdaBoostClassifier(),
param_grid=tuned_parameters, cv=n_folds, refit=False)
clf.fit(xc_train, yc_train)
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")
ada_c_best_params=clf.best_params_
print(clf.best_params_)
# Cross Validation to report the training error
model = AdaBoostClassifier(n_estimators=clf.best_params_['n_estimators'], learning_rate = clf.best_params_['learning_rate'])
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
test_results = []
a = []
for i in range(cv_iter): # run the full cv_iter iterations reported below
for train_indices, test_indices in k_fold.split(train_samples):
try:
model.fit(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices])
training_results.append(accuracy_score(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
test_results.append(accuracy_score(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
except Exception as e:
print("Problem in cv. {0}".format(e))
a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
print("CV Train set accuracy (% Correct) = {0:.4f}".format(np.mean(training_results)))
print("CV Validation set accuracy (% Correct) = {0:.4f}".format(np.mean(test_results)))
# Report the test error
model.fit(test_samples, test_labels) # re-fit using all of the test data
print("Test set accuracy (% Correct) = {0:.4f}".format(model.score(test_samples, test_labels)))
scores = accuracy_score(test_labels, model.predict(test_samples))
#print("Test set accuracy (Mean % Error) = {0:.4f}".format(scores))
ada_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)
# Using the reduced feature set, predict the output for graphing
print("")
print("Results using the reduced feature set:")
[train_samples, test_samples] = [xc_train_new, xc_test_new] # feature selected sets = training and test samples
model.fit(train_samples, train_labels)
ada_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)
print("Test set accuracy (% Correct) = {0:.4f}".format(model.score(test_samples, test_labels)))
scores = accuracy_score(test_labels, model.predict(test_samples))
#print("Test set accuracy (Mean % Error) = {0:.4f}".format(scores))
t1_c_ada = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
# Import the random forest model.
t0_c_rf = t0 = time.time()
from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import KFold, cross_val_score
# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels] = [xc_train, yc_train, xc_test, yc_test]
# Parameter search
tuned_parameters = [{'max_depth': np.arange(1, 50, 2), 'n_estimators': np.arange(1, 30, 2),
'criterion': ['entropy', 'gini'] }]
n_folds = k
clf = GridSearchCV(estimator=RandomForestClassifier(),
param_grid=tuned_parameters, cv=n_folds, refit=False)
clf.fit(xc_train, yc_train)
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")
rf_c_best_params=clf.best_params_
print(clf.best_params_)
# Cross Validation to report the training error
model = RandomForestClassifier(max_depth=clf.best_params_['max_depth'],
n_estimators=clf.best_params_['n_estimators'],
criterion=clf.best_params_['criterion'], )
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
test_results = []
a = []
for i in range(cv_iter): # run the full cv_iter iterations reported below
for train_indices, test_indices in k_fold.split(train_samples):
try:
model.fit(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices])
training_results.append(accuracy_score(train_labels.iloc[train_indices],
model.predict(train_samples.iloc[train_indices,:])))
test_results.append(accuracy_score(train_labels.iloc[test_indices],
model.predict(train_samples.iloc[test_indices,:])))
except Exception as e:
print("Problem in cv. {0}".format(e))
a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
print("CV Train set accuracy (% Correct) = {0:.4f}".format(np.mean(training_results)))
print("CV Test set accuracy (% Correct) = {0:.4f}".format(np.mean(test_results)))
# Report the test error
model.fit(test_samples, test_labels) # re-fit using all of the test data
print("Test set accuracy (% Correct) = {0:.4f}".format(model.score(test_samples, test_labels)))
scores = accuracy_score(test_labels, model.predict(test_samples))
print("Test set accuracy (% Correct, via accuracy_score) = {0:.4f}".format(scores))
rf_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)
# Using the reduced feature set, predict the output for graphing
print("")
print("Results using the reduced feature set:")
[train_samples, test_samples] = [xc_train_new, xc_test_new] # feature selected sets = training and test samples
model.fit(train_samples, train_labels)
rf_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)
print("Test set accuracy (% Correct) = {0:.4f}".format(model.score(test_samples, test_labels)))
scores = accuracy_score(test_labels, model.predict(test_samples))
print("Test set accuracy (% Correct, via accuracy_score) = {0:.4f}".format(scores))
t1_c_rf = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
class_full_df = pd.concat([
yc_test,
pd.DataFrame(log_predicted_full_df),
pd.DataFrame(ada_predicted_full_df),
pd.DataFrame(rf_predicted_full_df)],
axis=1)
class_full_df.columns = ['test','log','ada','rf',]
denom = len(class_full_df)
print("Logistic Regression accuracy {:.4f}".format(((class_full_df.test == class_full_df.log).sum())/denom))
print("AdaBoosT classification accuracy {:.4f}".format(((class_full_df.ada == class_full_df.test).sum())/denom))
print("Random Forest classification accuracy {:.4f}".format(((class_full_df.rf == class_full_df.test).sum())/denom))
# The accuracies above can equivalently be computed with sklearn's accuracy_score:
# from sklearn.metrics import accuracy_score
# print(accuracy_score(yc_test, log_predicted_full_df))
# print(accuracy_score(yc_test, ada_predicted_full_df))
# print(accuracy_score(yc_test, rf_predicted_full_df))
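# An illustrative addition (a sketch; uses the prediction frames built above): for this binary task,
# the confusion matrix and per-class precision/recall complement plain accuracy.
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(class_full_df.test, class_full_df.rf))
print(classification_report(class_full_df.test, class_full_df.rf))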
print(log_c_best_params)
print(ada_c_best_params)
print(rf_c_best_params)
from sklearn.linear_model import LogisticRegression
print(log_c_best_params)
t0_c_log_learning_curve = t0 = time.time()
iterations = 50
title = "Learning Curves Logistic Regression"
# Cross-validation with 50 ShuffleSplit iterations (the `iterations` variable above) to get smoother
# mean test and train score curves, each time with 30% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=iterations, test_size=0.3, random_state=0)
#estimator = LogisticRegression(penalty='l1', C=.5)
estimator = LogisticRegression(C=0.3728, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l1', random_state=111,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
plot_learning_curve(estimator, title, xc_train, yc_train, ylim=(.55, .7), cv=cv, n_jobs=1, scoring='accuracy', y_label='Accuracy')
t1_c_log_learning_curve = t1 = time.time()
scores = cross_val_score(estimator, xc_train, yc_train, cv=iterations)
plt.axhline(y=scores.mean(), linewidth=1, color='black')
print("Long term cross validation score = {}".format(scores.mean()))
print("Execution time: {0}".format(t1-t0))
print(ada_c_best_params)
t0_c_ada_learning_curve = t0 = time.time()
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
title = "Learning Curves AdaBoost"
cv = ShuffleSplit(n_splits=10, test_size=0.8, random_state=0)
estimator = AdaBoostClassifier(learning_rate=.5, n_estimators=10, random_state=None)
#estimator = KNeighborsClassifier(leaf_size=10)
print(estimator)
plot_learning_curve(estimator, title, xc_train, yc_train, ylim=(0, 1), cv=cv, n_jobs=4, scoring='accuracy', y_label='Accuracy')
t1_c_ada_learning_curve = t1 = time.time()
from sklearn.model_selection import cross_val_score
scores = cross_val_score(estimator, xc_train, yc_train, cv=10)
plt.axhline(y=scores.mean(), linewidth=1, color='black')
print("Long term cross validation score = {}".format(scores.mean()))
print("Execution time: {0}".format(t1-t0))
print(rf_c_best_params)
t0_c_rf_learning_curve = t0 = time.time()
from sklearn.ensemble import RandomForestClassifier
title = "Learning Curves Random Forest"
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
#estimator = RandomForestClassifier(max_depth = 5, n_estimators=3, criterion='entropy')
estimator = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=9, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=17, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
plot_learning_curve(estimator, title, xc_train, yc_train, (0.5, 1), cv=cv, n_jobs=4, scoring='accuracy', y_label='Accuracy')
t1_c_rf_learning_curve = t1 = time.time()
#from sklearn.model_selection import cross_val_score
scores = cross_val_score(estimator, xc_train, yc_train, cv=10)
plt.axhline(y=scores.mean(), linewidth=1, color='black')
print("Long term cross validation score = {}".format(scores.mean()))
print("Execution time: {0}".format(t1-t0))