This notebook shows several machine learning models used on a house sale price data set. It covers the following topics:

  • Data pre-processing
    • whenever possible data types were explicitly set
    • features that were missing more than 3% of the time were removed
    • features that were irrelevant, such as MLS id, were removed
    • features that were duplicate of others were removed
    • features that were redundant, such as ‘State’, ‘County’ were removed. I kept ‘City’
    • categorical features that had high cardinality, such as ‘list agent’, where the cardinality was 456, were removed
  • Feature Extraction
  • Removed any samples missing data
  • One hot encoded the categorical features
  • Standardized the features by removing the mean and scaling to unit variance
  • Selected a subset of the features using the SelectKBest algorithm (the k highest-scoring features).

Once the data was prepared the following two questions were addressed via machine learning:

  • What is the likely sale price given the unlabeled data?
    • used regression
    • used lasso regression
    • used a random forest
  • What is the likelihood (0|1) and the probability that a certain house would sell in less than 90 days?
    • used logistic regression
    • used AdaBoost
    • used a random forest
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import fbeta_score, make_scorer, mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import linear_model
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

%matplotlib inline

# Load the raw listings export into a single dataframe.
# NOTE(review): parse_dates=True only asks pandas to try parsing the index as dates;
# the two date columns are converted explicitly with pd.to_datetime further down.
df = pd.read_csv("../../input/housing2.csv", parse_dates=True, sep=',')
print("Number of columns in dataframe {}".format(len(df.columns)))

# variables
k_features = 7 # used for feature selection
k = 10 # used for k-fold validation
cv_iter = 5 # number of iterations for cv 
# fractions passed to train_test_split as train_size / test_size
[train_percent, test_percent] = [.7, .3]
# a sale is labelled 'xdayplus' when its days-on-market exceeds this many days
classification_day_threshold = 120

# Missing-data summary: per-column null count and null fraction, worst columns first.
null_mask = df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent Missing'])

# Columns are grouped by the reason they are discarded, then dropped in one call.

# too many missing values in the feature
too_sparse = [
    'Rented',
    'Showing Service',
    'Easements',
    'Common Land Acres',
    'Water Frontage Length',
    'Water Body Type',
    'Development / Subdivision',
    'Assessment Year',
    'Current Use',
    'Assessment Amount',
    'Short Sale',
    'Garage Type',
    'Basement Access Type',
    'Foreclosed/Bank-Owned/REO',
]

# irrelevant or duplicate data
irrelevant_or_duplicate = [
    'MLS #',
    'Status',
    'Price - Closed',  # same as feature 'Price'
    'Price - List',
    ' Price - Original ',
    ' Tax - Gross Amount ',  # sale price determines taxes, not the other way around
    'Seasonal',  # whether a property is seasonal (summer home) should not change a buyer's outlook
]

# feature set size (cardinality) is too big
high_cardinality = [
    'Listing Office - Office Name',
    'List Agent - Agent Name',
]

# redundant location information
redundant = [
    'Address',
    'State',
    'County',
]

df.drop(too_sparse + irrelevant_or_duplicate + high_cardinality + redundant,
        axis=1, inplace=True)

# Force types for certain fields
# 'Price' and 'SqFtTotFn' are read as text; strip the thousands separator (and the
# currency symbol for 'Price') before converting to float.
# regex=False makes every replacement a literal one: '$' is a regex metacharacter
# (end-of-string anchor), so relying on the default regex handling is fragile.
for symbol in (',', '$'):
    df['Price'] = df['Price'].str.replace(symbol, '', regex=False)
df['Price'] = df['Price'].astype(float)
df['SqFtTotFn'] = df['SqFtTotFn'].str.replace(',', '', regex=False).astype(float)

# Enrich garage and garage capacity features
# 1. Treat empty strings as missing values.
#    The result is assigned back to the column: calling replace(..., inplace=True) on
#    df[col] operates on a column selection and does not update df under Copy-on-Write.
df['Garage Capacity'] = df['Garage Capacity'].replace('', np.nan)
df['Garage'] = df['Garage'].replace('', np.nan)
# 2. If garage == no, make sure garage capacity = 0
df.loc[df['Garage'] == 'No', 'Garage Capacity'] = 0
# 3. 'Garage' is dropped once its information has been folded into 'Garage Capacity'
del df['Garage']

# Encode Basement as an integer flag: 'Yes' -> 1, anything else (including missing) -> 0
df['Basement'] = (df['Basement'] == 'Yes').astype(int)

# 'Total Stories' uses the open-ended label '4+', which is capped at 4 so the column
# can be converted to float.
def stories(x):
    """Map the '4+' label to '4'; return every other value unchanged."""
    if x == '4+':
        return '4'
    return x

df['Total Stories'] = df['Total Stories'].map(stories).astype(float)

# Create the Days in market metric which is the date of closure - date of MLS listing
from datetime import date
closed = df.columns.get_loc("Date - Closed")
listed = df.columns.get_loc("Date - MLS List")
# Difference of the two parsed dates, expressed as a whole number of days.
# NOTE(review): the original statement was cut off after the first operand; the second
# operand and the conversion to days are restored from the comment above.
df['DaysMkt'] = (pd.to_datetime(df[df.columns[closed]]) -
                 pd.to_datetime(df[df.columns[listed]])).dt.days
# Classification label: True when the sale took longer than the threshold.
df['xdayplus'] = df['DaysMkt'] > classification_day_threshold
# The raw dates are no longer needed. axis is passed by keyword because the positional
# form drop(label, 1) is rejected by recent pandas versions.
df = df.drop(['Date - Closed', 'Date - MLS List'], axis=1)

# Keep only fully populated samples: a row missing any feature is discarded
# (previously only rows missing certain features were removed).
df = df.dropna(how='any')

print("Number of columns in dataframe {}".format(len(df.columns)))

# One hot encode the categorical features, then remove the source columns.
# 'Surveyed' and 'Covenants' are removed without being encoded.
categorical_columns = ['Property Type', 'Flood Zone', 'City']
encoded = pd.get_dummies(df[categorical_columns])
df = pd.concat([df, encoded], axis=1)
df.drop(categorical_columns + ['Surveyed', 'Covenants'], axis=1, inplace=True)

# Set up the dataframes for the regression and classification inputs and labels.
# The two are not the same: classification keeps 'Price' as a feature, regression keeps
# 'DOM' and 'DaysMkt'.

# Drop any column that still contains a missing value. dropna returns a new frame, so
# the result has to be assigned back (the original call discarded it and had no effect).
df = df.dropna(axis=1, how='any')

# For classification
df_class = df.copy(deep=True)
labels_class = df_class.pop('xdayplus')
# 'DaysMkt' determines the xdayplus label directly and 'DOM' is highly correlated with
# it, so neither may be used as a feature.
df_class.drop(['DaysMkt', 'DOM'], axis=1, inplace=True)

# For regression
labels = df.pop('Price')
del df['xdayplus']

print("Number of columns in dataframe {}".format(len(df.columns)))
print("Number of columns in dataframe {}".format(len(df_class.columns)))

# Perform the pre-train, train, validation and test split
# Each pipeline first sets aside 500 samples as a "pre" set, then divides the remainder
# into train/test using train_percent/test_percent.
# NOTE(review): no random_state is given, so the splits differ between runs and the
# classification and regression pipelines do not contain the same rows.

# For Classification
xc, xc_pre, labelsc, yc_pre = train_test_split(df_class, labels_class, test_size=500, shuffle=True)
xc_train, xc_test, yc_train, yc_test = train_test_split(xc, labelsc, 
                                                        train_size=train_percent, test_size=test_percent, shuffle=True)
print("classf'n: sample size of pre={0}, train={1} and test={2}.".format(len(xc_pre), len(xc_train), len(xc_test)))
print("classf'n: label size of pre={0}, train={1} and test={2}.".format(len(yc_pre), len(yc_train), len(yc_test)))

# For regression
# `labels` is rebound here: after this line it no longer holds the pre-set rows.
x, x_pre, labels, y_pre = train_test_split(df, labels, test_size=500, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(x, labels, 
                                                    train_size=train_percent, test_size=test_percent, shuffle=True)
print("reg: sample size of pre={0}, train={1} and test={2}.".format(len(x_pre), len(x_train), len(x_test)))
print("reg: label size of pre={0}, train={1} and test={2}.".format(len(y_pre), len(y_train), len(y_test)))

# Standardize the numeric features. The scaler is fitted on the training split only and
# then applied to both splits; row indices are retained so the scaled values line up
# with the unscaled one-hot columns.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

reg_columns_to_scale = ['Bedrooms - Total', 'Baths - Total', 'SqFtTotFn', 'DOM', 'PicCount',
       'Lot - Acres', 'Year Built', 'Total Stories', 'Rooms - Total',
       'Garage Capacity', 'Basement', 'DaysMkt']
class_columns_to_scale = ['Price', 'Bedrooms - Total', 'Baths - Total', 'SqFtTotFn', 'PicCount',
       'Lot - Acres', 'Year Built', 'Total Stories', 'Rooms - Total',
       'Garage Capacity', 'Basement']


def _apply_scaler(scaler, frame, columns):
    """Return (scaled columns as a DataFrame, deep copy of frame with those columns scaled)."""
    scaled_part = pd.DataFrame(scaler.transform(frame[columns]), columns=columns, index=frame.index)
    merged = frame.copy(deep=True)
    merged[columns] = scaled_part[columns]
    return scaled_part, merged


# For Regression
reg_std = StandardScaler().fit(x_train[reg_columns_to_scale])
reg_train_df, x_train_std = _apply_scaler(reg_std, x_train, reg_columns_to_scale)
print(x_train[reg_columns_to_scale].head(5)) # the original data before standardization
print(x_train_std[reg_columns_to_scale].head(5)) # confirm the copy holds the scaled values
reg_test_std, x_test_std = _apply_scaler(reg_std, x_test, reg_columns_to_scale)

# For Classification
reg_c_std = StandardScaler().fit(xc_train[class_columns_to_scale])
reg_c_train_df, xc_train_std = _apply_scaler(reg_c_std, xc_train, class_columns_to_scale)
print(xc_train[class_columns_to_scale].head(5)) # the original data before standardization
print(xc_train_std[class_columns_to_scale].head(5)) # confirm the copy holds the scaled values
reg_c_test_df, xc_test_std = _apply_scaler(reg_c_std, xc_test, class_columns_to_scale)

# The *_scaled names are what the models consume; point them at whichever scaling is
# wanted (currently the standardized sets).
x_train_scaled, x_test_scaled = x_train_std, x_test_std
xc_train_scaled, xc_test_scaled = xc_train_std, xc_test_std

#Feature Selection
# return last k elements in the df column list since feature scoring functions order from least to most important
def top_k_in_list(df, ordering, k):
    """Return the names of the k most important columns of ``df``.

    df : DataFrame whose column names are looked up.
    ordering : sequence of column positions sorted from least to most important
        (for example the result of ``np.argsort`` on feature scores).
    k : number of column names to return.

    Returns a numpy array holding the last ``k`` names of the reordered column list,
    still in least-to-most-important order. When ``k`` exceeds the number of columns,
    every column is returned.
    """
    columns = df.columns.tolist()  # built once instead of once per element
    ordered = np.array([columns[position] for position in ordering])
    # Clamp the start at 0: a negative start would be read as "count from the end"
    # and silently drop features when k is larger than the number of columns.
    start = max(len(columns) - k, 0)
    return ordered[start:len(columns)]

#K-Best (more steady compared to ExtraTreeRegressor)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

kbest = SelectKBest(score_func=f_regression, k=k_features)
# NOTE(review): the fit calls in this cell were truncated in the source; they are
# restored as fits of `kbest` on the scaled training sets.
clf = kbest.fit(x_train_scaled, y_train)
indices = np.argsort(clf.scores_) # feature positions, from least to most important
print(top_k_in_list(x_train_scaled, indices, k_features))

# Horizontal bar chart of every feature's score, least important at the bottom.
n_features = x_train_scaled.shape[1]
plt.title("K-Best important features")
plt.barh(range(n_features), clf.scores_[indices],
         color="r", align="center")
# If you want to define your own labels,
# change indices to a list of labels on the following line.
plt.yticks(range(n_features), x_train_scaled.columns[indices].tolist())
plt.ylim([-1, n_features])

# Select features for regression
print("Regression: These are the top {0} feature indices {1}".format(
    k_features, indices[len(indices) - k_features:]))
x_train_new = x_train_scaled[top_k_in_list(x_train_scaled, indices, k_features)]
x_test_new = x_test_scaled[top_k_in_list(x_test_scaled, indices, k_features)]

# Select features for classification
# NOTE(review): this reuses the f_regression score function on a boolean label;
# f_classif may be the more appropriate choice - confirm before changing.
clf = kbest.fit(xc_train_scaled, yc_train)
indices = np.argsort(clf.scores_) # feature positions, from least to most important
print(top_k_in_list(xc_train_scaled, indices, k_features))
print("Classsification: These are the top {0} feature indices {1}".format(
    k_features, indices[len(indices) - k_features:]))
xc_train_new = xc_train_scaled[top_k_in_list(xc_train_scaled, indices, k_features)]
xc_test_new = xc_test_scaled[top_k_in_list(xc_test_scaled, indices, k_features)]

x = np.arange(len(y_test)) # set the x-axis for plotting

def mpe_scorer(ground_truth, predictions):
    """Return the mean signed relative error of ``predictions`` against ``ground_truth``.

    Each error is (prediction - truth) / truth, so a positive result means the
    predictions are too high on average. Multiply by 100 for a percentage.
    """
    signed_error = predictions - ground_truth
    relative_error = signed_error / ground_truth
    return relative_error.mean()
# scikit-learn scorer built from mpe_scorer; greater_is_better=False declares it a loss,
# so scikit-learn negates the value when it is used for scoring.
my_mpe_scorer = make_scorer(mpe_scorer, greater_is_better=False)

def error_scorer(ground_truth, predictions):
    """Return the fraction of positions where ``predictions`` differs from ``ground_truth``."""
    mismatches = (ground_truth != predictions).sum()
    return mismatches / len(ground_truth)
# scikit-learn scorer for the misclassification rate. It must wrap error_scorer: the
# previous definition wrapped mpe_scorer by mistake, making it a copy of my_mpe_scorer.
# greater_is_better=False declares it a loss, so scikit-learn negates the value.
my_error_scorer = make_scorer(error_scorer, greater_is_better=False)

def rmse_error(labels, predictions):
    """Return the root mean squared error between ``labels`` and ``predictions``."""
    squared_error = mean_squared_error(labels, predictions)
    return np.sqrt(squared_error)


def rmse2_error(labels, predictions):
    """Same metric as rmse_error, under a second name so it can back a separate scorer."""
    return rmse_error(labels, predictions)
# With greater_is_better=False make_scorer treats the function as a loss and negates its
# value (so that a higher score is always better); this is why scores produced by `rmse`
# come out with a flipped sign.
rmse = make_scorer(rmse_error, greater_is_better=False)
# Same metric without the sign flip; passed as `scoring` when plotting learning curves.
rmse2 = make_scorer(rmse2_error, greater_is_better=True)
mse = make_scorer(mean_squared_error)
mae = make_scorer(mean_absolute_error)
accuracy = make_scorer(accuracy_score, greater_is_better=True)
Number of columns in dataframe 46
Number of columns in dataframe 19
Index(['Price', 'Bedrooms - Total', 'Baths - Total', 'SqFtTotFn', 'PicCount',
       'Lot - Acres', 'Year Built', 'Total Stories', 'Rooms - Total',
       'Garage Capacity', 'Basement', 'Property Type_Condo',
       'Property Type_Single Family', 'Flood Zone_No', 'Flood Zone_Unknown',
       'Flood Zone_Yes', 'Surveyed_No', 'Surveyed_Unknown', 'Surveyed_Yes',
       'Covenants_No', 'Covenants_Unknown', 'Covenants_Yes', 'City_Bethel',
       'City_Canaan', 'City_Cavendish', 'City_Cornish', 'City_Croydon',
       'City_Enfield', 'City_Grantham', 'City_Hanover', 'City_Hartford',
       'City_Hartland', 'City_Lebanon', 'City_Ludlow', 'City_Lyme',
       'City_Norwich', 'City_Orford', 'City_Plainfield', 'City_Plymouth',
       'City_Rochester', 'City_Royalton', 'City_Sharon', 'City_Stockbridge',
       'City_Strafford', 'City_Sunapee', 'City_Weathersfield',
       'City_West Windsor', 'City_Windsor'],
Index(['Bedrooms - Total', 'Baths - Total', 'SqFtTotFn', 'DOM', 'PicCount',
       'Lot - Acres', 'Year Built', 'Total Stories', 'Rooms - Total',
       'Garage Capacity', 'Basement', 'DaysMkt', 'Property Type_Condo',
       'Property Type_Single Family', 'Flood Zone_No', 'Flood Zone_Unknown',
       'Flood Zone_Yes', 'Surveyed_No', 'Surveyed_Unknown', 'Surveyed_Yes',
       'Covenants_No', 'Covenants_Unknown', 'Covenants_Yes', 'City_Bethel',
       'City_Canaan', 'City_Cavendish', 'City_Cornish', 'City_Croydon',
       'City_Enfield', 'City_Grantham', 'City_Hanover', 'City_Hartford',
       'City_Hartland', 'City_Lebanon', 'City_Ludlow', 'City_Lyme',
       'City_Norwich', 'City_Orford', 'City_Plainfield', 'City_Plymouth',
       'City_Rochester', 'City_Royalton', 'City_Sharon', 'City_Stockbridge',
       'City_Strafford', 'City_Sunapee', 'City_Weathersfield',
       'City_West Windsor', 'City_Windsor'],
Number of columns in dataframe 49
Number of columns in dataframe 48
classf'n: sample size of pre=500, train=3814 and test=1635.
classf'n: label size of pre=500, train=3814 and test=1635.
reg: sample size of pre=500, train=3814 and test=1635.
reg: label size of pre=500, train=3814 and test=1635.
      Bedrooms - Total  Baths - Total  SqFtTotFn  DOM  PicCount  Lot - Acres  \
2438                 3              2     1229.0  104        19         0.47   
5040                 4              4     2888.0  190        23        10.00   
2365                 3              1     1028.0   12         7         5.20   
5279                 3              3     3092.0  451        30        11.85   
5733                 4              2     2767.0   72        36         6.05   

      Year Built  Total Stories  Rooms - Total  Garage Capacity  Basement  \
2438      1975.0            2.0              6              2.0         1   
5040      1985.0            2.0             10              2.0         1   
2365      1977.0            1.0              9              2.0         1   
5279      1985.0            2.0              9              4.0         1   
5733      1991.0            2.0             10              4.0         1   

2438    132.0  
5040    293.0  
2365     70.0  
5279    510.0  
5733    165.0  
      Bedrooms - Total  Baths - Total  SqFtTotFn       DOM  PicCount  \
2438         -0.050547      -0.380249  -0.775278 -0.282219  0.058763   
5040          0.969489       1.514291   0.888279  0.092346  0.527395   
2365         -0.050547      -1.327519  -0.976830 -0.682917 -1.347131   
5279         -0.050547       0.567021   1.092839  1.229109  1.347500   
5733          0.969489      -0.380249   0.766947 -0.421592  2.050447   

      Lot - Acres  Year Built  Total Stories  Rooms - Total  Garage Capacity  \
2438    -0.214018    0.218152       0.471005      -0.552922         0.775136   
5040     0.146864    0.426494       0.471005       1.119437         0.775136   
2365    -0.034902    0.259821      -1.331265       0.701347         0.775136   
5279     0.216920    0.426494       0.471005       0.701347         2.648626   
5733    -0.002714    0.551499       0.471005       1.119437         2.648626   

      Basement   DaysMkt  
2438  0.566248 -0.391767  
5040  0.566248  0.302670  
2365  0.566248 -0.659189  
5279  0.566248  1.238649  
5733  0.566248 -0.249429  
         Price  Bedrooms - Total  Baths - Total  SqFtTotFn  PicCount  \
488   380000.0                 3              3     2098.0        16   
5847  137000.0                 3              2     1315.0        19   
2946  695000.0                 2              1     1024.0        12   
5898  330000.0                 3              2     2000.0        14   
4430  189500.0                 3              2     1341.0        16   

      Lot - Acres  Year Built  Total Stories  Rooms - Total  Garage Capacity  \
488          0.60      1997.0            2.0              7              2.0   
5847         0.40      1970.0            2.0              6              0.0   
2946         0.73      1950.0            1.0              4              2.0   
5898         1.38      2004.0            1.0              4              3.0   
4430         3.00      1850.0            1.5              7              1.0   

488          1  
5847         1  
2946         1  
5898         1  
4430         1  
         Price  Bedrooms - Total  Baths - Total  SqFtTotFn  PicCount  \
488   0.252477         -0.030591       0.566579   0.103560 -0.271823   
5847 -0.531563         -0.030591      -0.375825  -0.686841  0.073141   
2946  1.268825         -1.063094      -1.318230  -0.980591 -0.731775   
5898  0.091152         -0.030591      -0.375825   0.004634 -0.501799   
4430 -0.362171         -0.030591      -0.375825  -0.660595 -0.271823   

      Lot - Acres  Year Built  Total Stories  Rooms - Total  Garage Capacity  \
488     -0.228340    0.674769       0.471539      -0.114609         0.805549   
5847    -0.236686    0.108735       0.471539      -0.533305        -1.104531   
2946    -0.222915   -0.310550      -1.344161      -1.370696         0.805549   
5898    -0.195790    0.821519      -1.344161      -1.370696         1.760589   
4430    -0.128186   -2.406975      -0.436311      -0.114609        -0.149491   

488    0.57957  
5847   0.57957  
2946   0.57957  
5898   0.57957  
4430   0.57957  
['PicCount' 'City_Hanover' 'Garage Capacity' 'Bedrooms - Total'
 'Rooms - Total' 'Baths - Total' 'SqFtTotFn']
Regression: These are the top 7 feature indices [ 4 30  9  0  8  1  2]
['SqFtTotFn' 'Lot - Acres' 'City_Ludlow' 'Surveyed_Yes' 'Surveyed_Unknown'
 'City_Lebanon' 'City_Hanover']
Classsification: These are the top 7 feature indices [ 3  5 33 18 17 32 29]
#from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), scoring='r2', y_label=''):
    """
    Generate a simple plot of the test and training learning curve.

    NOTE(review): the docstring delimiters and several statements of this function were
    lost in the source (empty ``if`` body, unterminated plotting calls); they are
    restored here following the scikit-learn learning-curve example this is based on.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy; passed unchanged to
        ``learning_curve``.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes (or fractions) at which the curve is evaluated.

    scoring : string or scorer callable, optional
        Metric passed to ``learning_curve`` (default 'r2').

    y_label : string, optional
        Name of the metric, shown in the y-axis label.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score ({})".format(y_label))
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring)
    # One row per training size, one column per CV split: summarise across the splits.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands: mean +/- one standard deviation.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

def plot_validation_curves(train_scores, test_scores, param='alpha', param_range=np.logspace(-3, 3, 13), scoring=rmse2):
    """Draw training and cross-validation score curves against a log-scaled parameter axis.

    train_scores, test_scores : 2-D score arrays; each is averaged along axis 1 and drawn
        as a line with a band of +/- one standard deviation.
    param_range : x-axis values (one per row of the score arrays).
    param, scoring : accepted for interface compatibility; the body does not use them.

    The y-axis is fixed to the range 1.5e5 .. 3.5e5.
    """
    # Summarise both score arrays before anything is drawn.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), label, colour)
        for scores, label, colour in (
            (train_scores, "Training score", "darkorange"),
            (test_scores, "Cross-validation score", "navy"),
        )
    ]

    plt.title("Validation Curve")
    plt.ylim(1.5e5, 3.5e5)
    line_width = 2
    for centre, spread, label, colour in curves:
        plt.semilogx(param_range, centre, label=label,
                     color=colour, lw=line_width)
        plt.fill_between(param_range, centre - spread,
                         centre + spread, alpha=0.2,
                         color=colour, lw=line_width)


# Learning Curves for Linear Regression
# Make sure that the learning curve code has the best parameters for the models in question 
t0_r_lr_learning_curve = t0 = time.time()
#from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

title = "Learning Curves Linear Regression"
# Shuffle-split cross validation with k random splits, each holding out 20% of the
# data as a validation set; more splits give smoother mean train/validation curves.
cv = ShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
estimator = linear_model.LinearRegression()
# The curve is computed on the held-out "pre" set, scored with the RMSE scorer.
plot_learning_curve(estimator, title, x_pre, y_pre, ylim=(0, 5e5), 
                    cv=cv, n_jobs=4, y_label='RMSE', scoring=rmse2)


t1_r_lr_learning_curve = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
<matplotlib.figure.Figure at 0x11773bdd8>
Execution time: 0.6666350364685059
# this histogram shows why the choice of <30 days yields such high marks. Practically 80% of the 
# time the sale takes > 30 days
# Width of one histogram bin, in days. It was not defined anywhere in this cell; 30
# matches the bin edges (0, 30, 60, ...) of the recorded output.
binwidth = 30
days_on_market = x_pre.DOM
mean_days = days_on_market.mean()
plt.title('Number of days it took to sell house')
plt.ylabel('Occurrences (mean = {})'.format(mean_days))
plt.axvline(mean_days, color='b', linestyle='dashed', linewidth=2)
# Bin edges run from the smallest value up to and including the largest one.
plt.hist(days_on_market,
         bins=np.arange(days_on_market.min(), days_on_market.max() + binwidth, binwidth))
(array([ 145.,   73.,   60.,   36.,   26.,   32.,   18.,   15.,    7.,
          16.,   10.,   10.,    7.,    6.,    3.,    4.,    4.,    2.,
           1.,    2.,    2.,    4.,    2.,    3.,    1.,    0.,    0.,
           3.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    2.,
           0.,    1.,    0.,    0.,    1.,    0.,    0.,    0.,    1.,
           0.,    1.,    0.,    0.,    0.,    0.,    0.,    1.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    1.]),
 array([   0,   30,   60,   90,  120,  150,  180,  210,  240,  270,  300,
         330,  360,  390,  420,  450,  480,  510,  540,  570,  600,  630,
         660,  690,  720,  750,  780,  810,  840,  870,  900,  930,  960,
         990, 1020, 1050, 1080, 1110, 1140, 1170, 1200, 1230, 1260, 1290,
        1320, 1350, 1380, 1410, 1440, 1470, 1500, 1530, 1560, 1590, 1620,
        1650, 1680, 1710, 1740, 1770, 1800, 1830, 1860, 1890, 1920, 1950,
        1980, 2010, 2040, 2070, 2100, 2130]),
 <a list of 71 Patch objects>)
from sklearn.linear_model import LogisticRegression

t0_c_log_learning_curve = t0 = time.time()

title = "Learning Curves Logistic Regression"
# Shuffle-split cross validation with k random splits, each holding out 30% of the
# data as a validation set; more splits give smoother mean train/validation curves.
cv = ShuffleSplit(n_splits=k, test_size=0.3, random_state=0)
# An L1 penalty requires a solver that supports it. 'liblinear' does, and naming it
# explicitly keeps this working on scikit-learn versions whose default solver does not.
estimator = LogisticRegression(penalty='l1', C=10, solver='liblinear')
plot_learning_curve(estimator, title, xc_pre, yc_pre, ylim=(.5, .9), cv=cv, n_jobs=1,
                    scoring='accuracy', y_label='Accuracy')

t1_c_log_learning_curve = t1 = time.time()
# Plain k-fold score on the same data; computed after t1, so it is not part of the
# reported execution time.
scores = cross_val_score(estimator, xc_pre, yc_pre, cv=k)
#plt.axhline(y=scores.mean(), linewidth=1, color='black')
#print("Long term cross validation score = {}".format(scores.mean()))
print("Execution time: {0}".format(t1-t0))
Execution time: 0.5980069637298584


cv_iter = 10 # number of iterations for cv
k = 10 # used for k-fold validation

t0_r_lr = t0 = time.time()

# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels]  = [x_train_scaled, y_train, x_test_scaled, y_test]

# No need for parameter search

# Cross Validation to report the training error
model = linear_model.LinearRegression(fit_intercept=False)
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []       # R^2 on the fitted folds
training_results_mpe = []   # mean % error on the fitted folds
training_results_rmse = []  # RMSE on the fitted folds
test_results = []           # R^2 on the held-out fold
test_results_mpe = []       # mean % error on the held-out fold
test_results_rmse = []      # RMSE on the held-out fold
a = []                      # indices of the last fold that failed, kept for debugging

# NOTE(review): the loop body was truncated in the source (the `try:` line and the start
# of the fit call were missing, and nothing filled the *_mpe / *_rmse lists that are
# reported below); it is restored here.
# range(cv_iter) runs exactly the number of iterations that is reported; the previous
# range(0, cv_iter-1) ran one fewer.
for _ in range(cv_iter):
    for train_indices, test_indices in k_fold.split(train_samples):
        fold_x = train_samples.iloc[train_indices, :]
        fold_y = train_labels.iloc[train_indices]
        held_out_x = train_samples.iloc[test_indices, :]
        held_out_y = train_labels.iloc[test_indices]
        try:
            model.fit(fold_x, fold_y)
            fold_pred = model.predict(fold_x)
            held_out_pred = model.predict(held_out_x)
            training_results.append(model.score(fold_x, fold_y))
            training_results_mpe.append(mpe_scorer(fold_y, fold_pred))
            training_results_rmse.append(rmse_error(fold_y, fold_pred))
            test_results.append(model.score(held_out_x, held_out_y))
            test_results_mpe.append(mpe_scorer(held_out_y, held_out_pred))
            test_results_rmse.append(rmse_error(held_out_y, held_out_pred))
        except Exception as e:  # `except e:` referenced an undefined name
            print("Problem in cv. {0}".format(e))
            a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
#print("CV Train set accuracy (R^2) = {0:.3f}".format(np.mean(training_results)))
print("CV Train set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(training_results_mpe)*100))
print("CV Train set accuracy (RMSE) = ${0:.2f}".format(np.mean(training_results_rmse)))
print("CV Train set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(training_results_rmse)))
#print("CV Validation set accuracy (R^2) = {0:.3f}".format(np.mean(test_results)))
print("CV Validation set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(test_results_mpe)*100))
print("CV Validation set accuracy (RMSE) = ${0:.2f}".format(np.mean(test_results_rmse)))
print("CV Validation set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(test_results_rmse)))
# Coefficients of the model as fitted on the last cross-validation fold.
lr_coef_full = model.coef_

# Report the test error
# Re-fit on all of the training data before scoring the test set. The source line was
# truncated and fitted against the test labels, which would measure the model on the
# very data it was trained on.
model.fit(train_samples, train_labels)
test_predictions = model.predict(test_samples)
print('Number of coefficients full feature set: \n', len(model.coef_))
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, test_predictions)
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
# Predictions keep the test index so they can be lined up with the labels later.
linear_predicted_full_df = pd.DataFrame(data=test_predictions, columns=['Price'], index=test_samples.index)
lr_test_rmse_full = np.sqrt(mean_squared_error(test_labels, test_predictions))
print('Test RMSE ${0:.2f}'.format(lr_test_rmse_full))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(test_predictions)))

# Using the reduced features set predict output for graphing
print("Results using the reduced feature set:")
[train_samples, test_samples] = [x_train_new, x_test_new] # feature selected sets = training and test samples
# May want to run GridSearchCV again with reduced set
# Re-fit on the reduced training set (this call was truncated in the source).
model.fit(train_samples, train_labels)
lean_predictions = model.predict(test_samples)
linear_predicted_lean_df = pd.DataFrame(data=lean_predictions, columns=['Price'], index=test_samples.index)
print('Number of coefficients reduced feature set: \n', len(model.coef_))
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, lean_predictions)
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
lr_test_rmse_lean = np.sqrt(mean_squared_error(test_labels, lean_predictions))
print('Test RMSE ${0:.2f}'.format(lr_test_rmse_lean))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(lean_predictions)))

# Sorted prediction curves: full feature set in red, reduced feature set in green.
temp = test_labels.sort_values()
plt.plot(x,linear_predicted_full_df.sort_values(by=['Price']), 'r')
plt.plot(x,linear_predicted_lean_df.sort_values(by=['Price']), 'g')
plt.ylabel('Sale price')
lr_coef_lean = model.coef_

t1_r_lr = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
Model: LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)
2-fold cross validation with 2 iterations results:
CV Train set accuracy (Mean % Error) = 11.64%
CV Train set accuracy (RMSE) = $185295.00
CV Train set accuracy (RMSE) std dev = $5489.10
CV Validation set accuracy (Mean % Error) = 11.78%
CV Validation set accuracy (RMSE) = $191070.93
CV Validation set accuracy (RMSE) std dev = $5068.31
[ -5.535e+03   4.359e+04   1.295e+05  -3.711e+04   1.356e+04   2.550e+04
   3.910e+03  -1.075e+04   1.384e+04  -5.528e+03  -1.441e+04   3.269e+04
  -1.198e+18  -1.198e+18   1.010e+18   1.010e+18   1.010e+18   7.616e+17
   7.616e+17   7.616e+17  -6.187e+17  -6.187e+17  -6.187e+17   4.464e+16
   4.464e+16   4.464e+16   4.464e+16   4.464e+16   4.464e+16   4.464e+16
   4.464e+16   4.464e+16   4.464e+16   4.464e+16   4.464e+16   4.464e+16
   4.464e+16   4.464e+16   4.464e+16   4.464e+16   4.464e+16   4.464e+16
   4.464e+16   4.464e+16   4.464e+16   4.464e+16   4.464e+16   4.464e+16
Number of coefficients full feature set: 
Test set accuracy (Mean % Error) = 14.16%
Test RMSE $202614.01
Test sample std dev = $293038.36
Std. dev. for predictions on test = $211662.10

Results using the reduced feature set:
Number of coefficients reduced feature set: 
Test set accuracy (Mean % Error) = -130.19%
Test RMSE $350560.55
Test sample std dev = $293038.36
Std. dev. for predictions on test = $233752.55
[  17367.246  479470.578    2598.842   -1230.058   -2498.3     34251.156
Execution time: 0.1254429817199707
t0_r_l = t0 = time.time()

# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels]  = [x_train_scaled, y_train, x_test_scaled, y_test]

# Parameter search: 20 log-spaced alpha values between 1e-3 and 1e3, k-fold CV.
tuned_parameters = [{'alpha': np.logspace(-3, 3, 20)}]
n_folds = k
# refit=False: only the search results are needed; the model is re-created below.
clf = GridSearchCV(estimator=linear_model.Lasso(),
                   param_grid=tuned_parameters, cv=n_folds, refit=False, scoring=rmse)
# The fit call was truncated in the source; restored here.
clf.fit(x_train_scaled, y_train)
# `rmse` is a loss scorer, so the best score is reported with a negative sign.
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")

# Cross Validation to report the training error
model = linear_model.Lasso(alpha = clf.best_params_['alpha'])
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []       # R^2 on the fitted folds
training_results_mpe = []   # mean % error on the fitted folds
training_results_rmse = []  # RMSE on the fitted folds
test_results = []           # R^2 on the held-out fold
test_results_mpe = []       # mean % error on the held-out fold
test_results_rmse = []      # RMSE on the held-out fold
a = []                      # indices of the last fold that failed, kept for debugging

# NOTE(review): the loop body was truncated in the source (the `try:` line and the start
# of the fit call were missing, and nothing filled the *_mpe / *_rmse lists that are
# reported below); it is restored here.
# range(cv_iter) runs exactly the number of iterations that is reported; the previous
# range(0, cv_iter-1) ran one fewer.
for _ in range(cv_iter):
    for train_indices, test_indices in k_fold.split(train_samples):
        fold_x = train_samples.iloc[train_indices, :]
        fold_y = train_labels.iloc[train_indices]
        held_out_x = train_samples.iloc[test_indices, :]
        held_out_y = train_labels.iloc[test_indices]
        try:
            model.fit(fold_x, fold_y)
            fold_pred = model.predict(fold_x)
            held_out_pred = model.predict(held_out_x)
            training_results.append(model.score(fold_x, fold_y))
            training_results_mpe.append(mpe_scorer(fold_y, fold_pred))
            training_results_rmse.append(rmse_error(fold_y, fold_pred))
            test_results.append(model.score(held_out_x, held_out_y))
            test_results_mpe.append(mpe_scorer(held_out_y, held_out_pred))
            test_results_rmse.append(rmse_error(held_out_y, held_out_pred))
        except Exception as e:  # `except e:` referenced an undefined name
            print("Problem in cv. {0}".format(e))
            a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
#print("CV Train set accuracy (R^2) = {0:.3f}".format(np.mean(training_results)))
print("CV Train set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(training_results_mpe)*100))
print("CV Train set accuracy (RMSE) = ${0:.2f}".format(np.mean(training_results_rmse)))
print("CV Train set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(training_results_rmse)))
#print("CV Validation set accuracy (R^2) = {0:.3f}".format(np.mean(test_results)))
print("CV Validation set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(test_results_mpe)*100))
print("CV Validation set accuracy (RMSE) = ${0:.2f}".format(np.mean(test_results_rmse)))
print("CV Validation set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(test_results_rmse)))
# Coefficients of the model as fitted on the last cross-validation fold.
l_coef_full = model.coef_

# Report the test error, test_labels) # re-fit using all of the test data
print('Number of coefficients full feature set: \n', len(model.coef_))
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
lasso_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
l_test_rmse_full = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(l_test_rmse_full))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))

# Using the reduced features set predict output for graphing
print("Results using the reduced feature set:")
[train_samples, test_samples] = [x_train_new, x_test_new] # feature selected sets = training and test samples
# May want to run GridSearchCV again with reduced set, train_labels)
lasso_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
print('Number of coefficients reduced feature set: \n', len(model.coef_))
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
l_test_rmse_lean = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(l_test_rmse_lean))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))

temp = test_labels.sort_values()
# Plot the sorted lasso predictions: red = full feature set, green = reduced set.
# This cell builds lasso_predicted_full_df / lasso_predicted_lean_df, but the plot
# previously drew the linear-regression frames, repeating the linear curves.
# NOTE(review): `x` is not assigned in the visible part of this cell; presumably
# it is the shared x-axis defined earlier in the notebook - confirm.
plt.plot(x,lasso_predicted_full_df.sort_values(by=['Price']), 'r')
plt.plot(x,lasso_predicted_lean_df.sort_values(by=['Price']), 'g')
plt.ylabel('Sale price')
l_coef_lean = model.coef_

t1_r_l = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
Best score -188931.0874
Best parameters
{'alpha': 233.57214690901213}
Model: Lasso(alpha=233.57214690901213, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
2-fold cross validation with 2 iterations results:
CV Train set accuracy (Mean % Error) = 11.62%
CV Train set accuracy (RMSE) = $185895.72
CV Train set accuracy (RMSE) std dev = $109.50
CV Validation set accuracy (Mean % Error) = 11.56%
CV Validation set accuracy (RMSE) = $189895.82
CV Validation set accuracy (RMSE) std dev = $235.41
[   3216.607   45288.331  147026.12   -11994.428   16656.556   35230.683
    6032.423      -0.     -14779.027    1207.599  -22242.791    8414.24
  -31271.213       0.       7348.798      -0.         -0.      18897.74
  -17703.278      -0.       1815.078      -0.     -16291.228  -40576.91
      -0.         -0.      -6533.484      -0.          0.     -56016.634
  223571.974    1881.168   40732.785   36440.397  114581.303  105962.557
  113062.675      -0.          0.         -0.     -49421.946  -31043.236
       0.          0.          0.     241547.413  -41181.391  -16734.308
Number of coefficients full feature set: 
Test set accuracy (Mean % Error) = 14.82%
Test RMSE $202818.14
Test sample std dev = $293038.36
Std. dev. for predictions on test = $209577.78

Results using the reduced feature set:
Number of coefficients reduced feature set: 
Test set accuracy (Mean % Error) = 21.03%
Test RMSE $222793.99
Test sample std dev = $293038.36
Std. dev. for predictions on test = $197306.92
[  13170.548  184661.335    5773.091   -2526.054   -2012.371   38209.962
Execution time: 4.37628698348999
from sklearn.model_selection import validation_curve
[train_scores, valid_scores] = validation_curve(
    param_range=np.logspace(-3, 5, 20), 

    param_range=np.logspace(-3, 5, 20), 
t0_r_rf = t0 = time.time()
from sklearn.ensemble import RandomForestRegressor

# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels]  = [x_train_scaled, y_train, x_test_scaled, y_test]
#k = cv_iter = 2

# Parameter search
# careful about np.linspace since in some cases you want whole numbers
tuned_parameters = [{'criterion': ['mae', 'mse'],
                     'max_depth': np.arange(1, 25, 2),
                     'n_estimators': np.arange(1, 11, 2)}]

n_folds = k
clf = GridSearchCV(estimator=RandomForestRegressor(), 
                   param_grid=tuned_parameters, cv=n_folds, refit=False, scoring=mse), y_train)        
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")

# Cross Validation to report the training error
model = RandomForestRegressor(criterion=clf.best_params_['criterion'],

print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
training_results_mpe = []
training_results_rmse = []
test_results = []
test_results_mpe = []
test_results_rmse = []
a = []

for i in range(0,cv_iter-1):
    for train_indices, test_indices in k_fold.split(train_samples):
  [train_indices,:], train_labels.iloc[train_indices])
            training_results.append(model.score(train_samples.iloc[train_indices,:], train_labels.iloc[train_indices]))
            test_results.append(model.score(train_samples.iloc[test_indices,:], train_labels.iloc[test_indices]))
        except e:
            print("Problem in cv. {0}".format(e))
            a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
#print("CV Train set accuracy (R^2) = {0:.3f}".format(np.mean(training_results)))
print("CV Train set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(training_results_mpe)*100))
print("CV Train set accuracy (RMSE) = ${0:.2f}".format(np.mean(training_results_rmse)))
print("CV Train set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(training_results_rmse)))
#print("CV Validation set accuracy (R^2) = {0:.3f}".format(np.mean(test_results)))
print("CV Validation set accuracy (Mean % Error) = {0:.2f}%".format(np.mean(test_results_mpe)*100))
print("CV Validation set accuracy (RMSE) = ${0:.2f}".format(np.mean(test_results_rmse)))
print("CV Validation set accuracy (RMSE) std dev = ${0:.2f}".format(np.std(test_results_rmse)))

# Report the test error, test_labels) # re-fit using all of the test data
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, model.predict(test_samples))
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
rf_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['Price'], index=test_samples.index)
rf_r_test_rmse_full = np.sqrt(mean_squared_error(test_labels, model.predict(test_samples)))
print('Test RMSE ${0:.2f}'.format(rf_r_test_rmse_full))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(model.predict(test_samples))))

# Using the reduced features set predict output for graphing
print("Results using the reduced feature set:")
[train_samples, test_samples] = [x_train_new, x_test_new] # feature selected sets = training and test samples, train_labels)
# Report random-forest regression quality on the reduced (feature-selected) test set.
# Predict once and reuse the result for the data frame and every metric below.
rf_lean_predictions = model.predict(test_samples)
rf_predicted_lean_df = pd.DataFrame(data=rf_lean_predictions, columns=['Price'], index=test_samples.index)
#print("Test set accuracy (R^2) = {0:.3f}".format(model.score(test_samples, test_labels)))
scores = mpe_scorer(test_labels, rf_lean_predictions)
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
rf_r_test_rmse_lean = np.sqrt(mean_squared_error(test_labels, rf_lean_predictions))
# Print the random-forest RMSE computed just above; this line used to print
# l_test_rmse_lean, i.e. the lasso model's figure.
print('Test RMSE ${0:.2f}'.format(rf_r_test_rmse_lean))
print("Test sample std dev = ${0:.2f}".format(np.std(test_labels)))
print("Std. dev. for predictions on test = ${0:.2f}".format(np.std(rf_lean_predictions)))

temp = test_labels.sort_values()
plt.plot(x,rf_predicted_full_df.sort_values(by=['Price']), 'r')
plt.plot(x,rf_predicted_lean_df.sort_values(by=['Price']), 'g')

t1_r_rf = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
Best score 85111875262.0560
Best parameters
{'criterion': 'mae', 'n_estimators': 1, 'max_depth': 13}
Model: RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=13,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
2-fold cross validation with 2 iterations results:
CV Train set accuracy (Mean % Error) = 8.15%
CV Train set accuracy (RMSE) = $161041.44
CV Train set accuracy (RMSE) std dev = $17399.22
CV Validation set accuracy (Mean % Error) = 19.43%
CV Validation set accuracy (RMSE) = $245520.08
CV Validation set accuracy (RMSE) std dev = $14467.37
Test set accuracy (Mean % Error) = 10.84%
Test RMSE $184675.13
Test sample std dev = $293038.36
Std. dev. for predictions on test = $307762.00

Results using the reduced feature set:
Test set accuracy (Mean % Error) = 22.17%
Test RMSE $222793.99
Test sample std dev = $293038.36
Std. dev. for predictions on test = $244398.04
Execution time: 222.4744849205017
# If you want, change to a lower degree for the polynomial regression
from sklearn.feature_selection import f_regression

#k = k_features
kbest = SelectKBest(score_func=f_regression, k=k)
clf =, y_train)
indices = np.argsort(clf.scores_) # list containing indices, from least to most important, feature
print(top_k_in_list(x_train_scaled, indices, k))

# Select features for regression
print("Regression: These are the top {0} feature indices {1}".format(k, indices[len(indices)-k:len(indices)]))
xp_train_new = x_train_scaled[top_k_in_list(x_train_scaled, indices, k)]
xp_test_new = x_test_scaled[top_k_in_list(x_test_scaled, indices, k)]
# Polynomial of 2nd degree
# Need to use the scaled feature set otherwise the polynomial entries becomes very large

# setup
from sklearn.preprocessing import PolynomialFeatures
[xp_train_new, xp_test_new] = [x_train_new, x_test_new]

# Try polynomial regression
print("Polynomial with all interactions (x_i^2 included)")
linear = linear_model.LinearRegression()
poly = PolynomialFeatures(degree=2, interaction_only=False)
x_poly_train = poly.fit_transform(xp_train_new), y_train)
# Summarise the fitted polynomial regression on the training data:
# coefficient count, the coefficients themselves, the intercept, then R^2 / MPE / RMSE.
print('Number of coefficients: \n', len(linear.coef_))
# This print shows the coefficient values, so it is labelled as such
# (it previously repeated the "Number of coefficients" heading).
print('Coefficients: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)
print("Train set accuracy (R^2) = {0:.3f}".format(linear.score(x_poly_train, y_train)))
# Predict once on the expanded training features and reuse for both error metrics.
poly_train_predictions = linear.predict(x_poly_train)
scores = mpe_scorer(y_train, poly_train_predictions)
print("Train set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
print('Train RMSE ${0:.2f}'.format(np.sqrt(mean_squared_error(y_train, poly_train_predictions))))

# Predict output on the held-out test set with the polynomial model.
# Expand the test features with the PolynomialFeatures object already fitted on the
# training set; the test set must only be transformed, never re-fitted.
predict_ = poly.transform(xp_test_new)
# Predict once and reuse the result for the data frame and both error metrics.
poly_test_predictions = linear.predict(predict_)
poly_predicted_lean_df = pd.DataFrame(data=poly_test_predictions, columns=['Price'], index=xp_test_new.index)
print("Test set accuracy (R^2) = {0:.3f}".format(linear.score(predict_, y_test)))
scores = mpe_scorer(y_test, poly_test_predictions)
print("Test set accuracy (Mean % Error) = {0:.2f}%".format(scores*100))
print('Test RMSE ${0:.2f}'.format(np.sqrt(mean_squared_error(y_test, poly_test_predictions))))
Polynomial with all interactions (x_i^2 included)
['1', 'x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x0^2', 'x0 x1', 'x0 x2', 'x0 x3', 'x0 x4', 'x0 x5', 'x0 x6', 'x1^2', 'x1 x2', 'x1 x3', 'x1 x4', 'x1 x5', 'x1 x6', 'x2^2', 'x2 x3', 'x2 x4', 'x2 x5', 'x2 x6', 'x3^2', 'x3 x4', 'x3 x5', 'x3 x6', 'x4^2', 'x4 x5', 'x4 x6', 'x5^2', 'x5 x6', 'x6^2']
Number of coefficients: 
Number of coefficients: 
 [     0.     17573.961  79816.157  11922.068  10605.12   15848.106
  41616.253  62598.977   6733.045  13309.184  -5449.99   21827.867
   1489.865   7086.885 -15305.561  79816.157  17397.213  49746.762
 -31348.41   33495.647  25976.003  -1462.713  14444.688 -12934.808
  -1050.53     153.808  -6100.692 -11026.865   1307.194  -1522.079
   -971.558  -6252.065  17167.797  -9128.551  34482.604  16698.166]
Train set accuracy (R^2) = 0.587
Train set accuracy (Mean % Error) = 27.10%
Train RMSE $185967.78
Test set accuracy (R^2) = 0.438
Test set accuracy (Mean % Error) = 26.10%
Test RMSE $219648.72

Evaluating the results

print("Full feature results:")
reg_full_df = pd.concat([
reg_full_df.columns = ['test','linear','lasso','random_forest']
reg_full_df['lin_mpe'] = ((reg_full_df.linear - reg_full_df.test)/reg_full_df.test)*100
reg_full_df['lasso_mpe'] = ((reg_full_df.lasso - reg_full_df.test)/reg_full_df.test)*100
reg_full_df['random_forest_mpe'] = ((reg_full_df.random_forest - reg_full_df.test)/reg_full_df.test)*100

print("Selected features result:")
reg_lean_df = pd.concat([
reg_lean_df.columns = ['test','poly','linear','lasso','rf']
reg_lean_df['lin_mpe'] = ((reg_lean_df.linear - reg_lean_df.test)/reg_lean_df.test)*100
reg_lean_df['poly_mpe'] = ((reg_lean_df.poly - reg_lean_df.test)/reg_lean_df.test)*100
reg_lean_df['lasso_mpe'] = ((reg_lean_df.lasso - reg_lean_df.test)/reg_lean_df.test)*100
reg_lean_df['rf_mpe'] = ((reg_lean_df.rf - reg_lean_df.test)/reg_lean_df.test)*100
Full feature results:
test                 299639.224465
linear               299703.423242
lasso                299639.224465
random_forest        302518.897248
lin_mpe                  14.157313
lasso_mpe                14.824947
random_forest_mpe        10.844106
dtype: float64

Selected features result:
test         299639.224465
poly         295359.133831
linear        41072.677330
lasso        298325.899110
rf           291120.781651
lin_mpe        -130.189549
poly_mpe         26.103971
lasso_mpe        21.026387
rf_mpe           22.169141
dtype: float64
# Learning curve for plain linear regression, drawn on the first 1200 training rows.
# Make sure that the learning curve code has the best parameters for the models in question 
t0_r_lr_learning_curve = t0 = time.time()
#from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

title = "Learning Curves Linear Regression"
# Cross validation over k random shuffle splits (k is set earlier in the notebook),
# each time with 20% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
estimator = linear_model.LinearRegression()
# plot_learning_curve and the rmse2 scorer are defined elsewhere in the notebook.
plot_learning_curve(estimator, title, x_train[0:1200], y_train[0:1200], ylim=(0, 5e5), 
                    cv=cv, n_jobs=4, y_label='RMSE', scoring=rmse2)


# Stop the timer and report the wall-clock time for this cell.
t1_r_lr_learning_curve = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
<matplotlib.figure.Figure at 0x11890a780>
Execution time: 0.4840521812438965
# Learning curve for lasso regression, drawn on the first 1500 training rows.
# Make sure that the learning curve code has the best parameters for the models in question
# Start this cell's timer in t0. The start time used to be stored in t1, so the
# elapsed time printed below was measured from whichever cell last set t0.
t0_r_l_learning_curve = t0 = time.time()

title = "Learning Curves Lasso Regression"
# Cross validation over k random shuffle splits (k is set earlier in the notebook),
# each time with 20% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
# NOTE(review): alpha is hard-coded to 1000 here rather than taken from the
# grid search run earlier in the notebook - confirm this is intended.
estimator = linear_model.Lasso(alpha=1000)
# plot_learning_curve and the rmse2 scorer are defined elsewhere in the notebook.
plot_learning_curve(estimator, title, x_train[0:1500], y_train[0:1500], ylim=(0, 5e5),
                    cv=cv, n_jobs=4, y_label='RMSE', scoring=rmse2)


# Stop the timer and report the wall-clock time for this cell.
t1_r_l_learning_curve = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
{'alpha': 233.57214690901213}
Execution time: 15.570173978805542
<matplotlib.figure.Figure at 0x1176450f0>
# Learning curve for the random-forest regressor, drawn on the first 3000 training rows,
# followed by a 10-fold cross-validation score over the whole training set.
# Make sure that the learning curve code has the best parameters for the models in question 
from sklearn.ensemble import RandomForestRegressor
t0_r_rf_learning_curve = t0 = time.time()

title = "Learning Curves Random Forest"
# Cross validation over k random shuffle splits (k is set earlier in the notebook),
# each time with 20% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
# Hyper-parameters are hard-coded here rather than read from a grid-search result.
estimator = RandomForestRegressor(criterion='mse', max_depth=30, n_estimators=50)
# plot_learning_curve and the rmse2 scorer are defined elsewhere in the notebook.
plot_learning_curve(estimator, title, x_train[0:3000], y_train[0:3000], ylim=(0, 5e5), 
                    cv=cv, n_jobs=4, y_label='RMSE', scoring=rmse2)

# 10-fold cross validation on the full training set, drawn as a horizontal reference line.
scores = cross_val_score(estimator, x_train, y_train, cv=10, scoring=rmse2)
# NOTE(review): the line is drawn at the negated mean while the print below shows the
# un-negated mean - confirm the sign convention of rmse2 so both agree.
plt.axhline(y=-1*scores.mean(), linewidth=1, color='black')
print("Long term cross validation score = {}".format(scores.mean()))

# Stop the timer and report the wall-clock time for this cell.
t1_r_rf_learning_curve = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
{'criterion': 'mae', 'n_estimators': 1, 'max_depth': 13}
Long term cross validation score = 167269.83366850213
Execution time: 21.33846688270569
<matplotlib.figure.Figure at 0x117646a58>
from sklearn.kernel_ridge import KernelRidge
clf = KernelRidge(alpha=1.0), y_train) 
print(clf.score(x_train, y_train))


# Logistic regression
t0_c_log = t0 = time.time()

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score

# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels]  = [xc_train_scaled, yc_train, xc_test_scaled, yc_test]

# Parameter search
tuned_parameters = [{'C': np.logspace(-5, 3, 29), 'penalty': ['l1', 'l2']}]
n_folds = k
clf = GridSearchCV(estimator=linear_model.LogisticRegression(), 
                   param_grid=tuned_parameters, cv=n_folds, refit=False), yc_train)        
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")

# Cross Validation to report the training error
model = linear_model.LogisticRegression(C=clf.best_params_['C'], 
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
test_results = []
a = []

for i in range(0,cv_iter-1):
    for train_indices, test_indices in k_fold.split(train_samples):
  [train_indices,:], train_labels.iloc[train_indices])
        except e:
            print("Problem in cv. {0}".format(e))
            a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
print("CV Train set accuracy (% Correct) = {0:.4f}".format(np.mean(training_results)))
print("CV Validation set accuracy (% Correct) = {0:.4f}".format(np.mean(test_results)))

# Report the test error, test_labels) # re-fit using all of the test data
scores = accuracy_score(test_labels, model.predict(test_samples))
print("Test set accuracy (% Correct) = {0:.4f}".format(scores))
log_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)

# Using the reduced features set predict output for graphing
print("Results using the reduced feature set:")
[train_samples, test_samples] = [xc_train_new, xc_test_new] # feature selected sets = training and test samples, train_labels)
log_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)
scores = accuracy_score(test_labels, model.predict(test_samples))
print("Test set accuracy (% Correct) = {0:.4f}".format(scores))

t1_c_log = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
Best score 0.6185
Best parameters
{'penalty': 'l1', 'C': 0.37275937203149379}
Model: LogisticRegression(C=0.37275937203149379, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=111,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
2-fold cross validation with 2 iterations results:
CV Train set accuracy (% Correct) = 0.6172
CV Validation set accuracy (% Correct) = 0.6085
Test set accuracy (% Correct) = 0.6300

Results using the reduced feature set:
Test set accuracy (% Correct) = 0.6239
Execution time: 3.041253089904785
# Start the AdaBoost timer under its own name. The cell ends by setting t1_c_ada,
# and reusing t0_c_log here overwrote the logistic-regression start time.
t0_c_ada = t0 = time.time()

from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels]  = [xc_train, yc_train, xc_test, yc_test]

# Parameter search
tuned_parameters = [{'n_estimators': np.arange(5, 12, 1), 'learning_rate' : np.arange(.5, 5, .5)}]
n_folds = k
clf = GridSearchCV(estimator=AdaBoostClassifier(), 
                   param_grid=tuned_parameters, cv=n_folds, refit=False), yc_train)        
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")

# Cross Validation to report the training error
model = AdaBoostClassifier(n_estimators=clf.best_params_['n_estimators'], learning_rate = clf.best_params_['learning_rate'])
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
test_results = []
a = []

for i in range(0,cv_iter-1):
    for train_indices, test_indices in k_fold.split(train_samples):
  [train_indices,:], train_labels.iloc[train_indices])
        except e:
            print("Problem in cv. {0}".format(e))
            a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
print("CV Train set accuracy (% Correct) = {0:.4f}".format(np.mean(training_results)))
print("CV Validation set accuracy (% Correct) = {0:.4f}".format(np.mean(test_results)))

# Report the test error, test_labels) # re-fit using all of the test data
print("Test set accuracy (% Correct) = {0:.4f}".format(model.score(test_samples, test_labels)))
scores = accuracy_score(test_labels, model.predict(test_samples))
#print("Test set accuracy (Mean % Error) = {0:.4f}".format(scores))
ada_predicted_full_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)

# Using the reduced features set predict output for graphing
print("Results using the reduced feature set:")
[train_samples, test_samples] = [xc_train_new, xc_test_new] # feature selected sets = training and test samples, train_labels)
ada_predicted_lean_df = pd.DataFrame(data=model.predict(test_samples), columns=['xdays'], index=test_samples.index)
print("Test set accuracy (% Correct) = {0:.4f}".format(model.score(test_samples, test_labels)))
scores = accuracy_score(test_labels, model.predict(test_samples))
#print("Test set accuracy (Mean % Error) = {0:.4f}".format(scores))

t1_c_ada = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
Best score 0.6227
Best parameters
{'n_estimators': 10, 'learning_rate': 0.5}
Model: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.5, n_estimators=10, random_state=None)
2-fold cross validation with 2 iterations results:
CV Train set accuracy (% Correct) = 0.6219
CV Validation set accuracy (% Correct) = 0.6064
Test set accuracy (% Correct) = 0.6355

Results using the reduced feature set:
Test set accuracy (% Correct) = 0.6183
Execution time: 5.488809108734131
# Import the random forest model.

t0_c_rf = t0 = time.time()

from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import KFold, cross_val_score
# prepare sets to make code below more readable and robust (easy to mix up x_train with xc_train)
[train_samples, train_labels, test_samples, test_labels]  = [xc_train, yc_train, xc_test, yc_test]

# Parameter search
tuned_parameters = [{'max_depth': np.arange(1, 50, 2), 'n_estimators': np.arange(1, 30, 2), 
                    'criterion': ['entropy', 'gini'] }]
n_folds = k
clf = GridSearchCV(estimator=RandomForestClassifier(), 
                   param_grid=tuned_parameters, cv=n_folds, refit=False), yc_train)        
print("Best score {0:.4f}".format(clf.best_score_))
print("Best parameters")

# Cross Validation to report the training error
model = RandomForestClassifier(max_depth=clf.best_params_['max_depth'], 
                               criterion=clf.best_params_['criterion'], )
print('Model: {}'.format(model))
k_fold = KFold(n_splits=k, shuffle=True)
training_results = []
test_results = []
a = []

for i in range(0,cv_iter-1):
    for train_indices, test_indices in k_fold.split(train_samples):
  [train_indices,:], train_labels.iloc[train_indices])
        except e:
            print("Problem in cv. {0}".format(e))
            a = train_indices
print("{0}-fold cross validation with {1} iterations results:".format(k, cv_iter))
print("CV Train set accuracy (% Correct) = {0:.4f}".format(np.mean(training_results)))
print("CV Test set accuracy (% Correct) = {0:.4f}".format(np.mean(test_results)))

# Report the test error, test_labels) # re-fit using all of the test data
# Report random-forest classifier accuracy on the full-feature test set.
# Predict once and reuse the labels for accuracy_score and the result frame.
rf_c_full_predictions = model.predict(test_samples)
print("Test set accuracy (% Correct) = {0:.4f}".format(model.score(test_samples, test_labels)))
scores = accuracy_score(test_labels, rf_c_full_predictions)
# accuracy_score is the share of correct labels, not an error measure, so the
# former "Mean % Error" label was wrong.
print("Test set accuracy_score (% Correct) = {0:.4f}".format(scores))
rf_predicted_full_df = pd.DataFrame(data=rf_c_full_predictions, columns=['xdays'], index=test_samples.index)

# Using the reduced features set predict output for graphing
print("Results using the reduced feature set:")
[train_samples, test_samples] = [xc_train_new, xc_test_new] # feature selected sets = training and test samples, train_labels)
# Report random-forest classifier accuracy on the reduced (feature-selected) test set.
# Predict once and reuse the labels for the result frame and accuracy_score.
rf_c_lean_predictions = model.predict(test_samples)
rf_predicted_lean_df = pd.DataFrame(data=rf_c_lean_predictions, columns=['xdays'], index=test_samples.index)
print("Test set accuracy (% Correct) = {0:.4f}".format(model.score(test_samples, test_labels)))
scores = accuracy_score(test_labels, rf_c_lean_predictions)
# accuracy_score is the share of correct labels, not an error measure, so the
# former "Mean % Error" label was wrong.
print("Test set accuracy_score (% Correct) = {0:.4f}".format(scores))

t1_c_rf = t1 = time.time()
print("Execution time: {0}".format(t1-t0))
Best score 0.6188
Best parameters
{'criterion': 'entropy', 'n_estimators': 29, 'max_depth': 9}
Model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=29, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
2-fold cross validation with 2 iterations results:
CV Train set accuracy (% Correct) = 0.7782
CV Test set accuracy (% Correct) = 0.6214
Test set accuracy (% Correct) = 0.7859
Test set accuracy (Mean % Error) = 0.7859

Results using the reduced feature set:
Test set accuracy (% Correct) = 0.6104
Test set accuracy (Mean % Error) = 0.6104
Execution time: 109.23916888237
class_full_df = pd.concat([
class_full_df.columns = ['test','log','ada','rf',]
denom = len(class_full_df)
print("Logistic Regression accuracy {:.4f}".format(((class_full_df.test == class_full_df.log).sum())/denom))
print("AdaBoosT classification accuracy {:.4f}".format(((class_full_df.ada == class_full_df.test).sum())/denom))
print("Random Forest classification accuracy {:.4f}".format(((class_full_df.rf == class_full_df.test).sum())/denom))
# The same as below
# from sklearn.metrics import accuracy_score
# print(accuracy_score(yc_test, log_predicted_full_df))
# print(accuracy_score(yc_test, tree_predicted_full_df))
# print(accuracy_score(yc_test, ada_predicted_full_df))
# print(accuracy_score(yc_test, svm_predicted_full_df))
# print(accuracy_score(yc_test, rf_predicted_full_df))
Logistic Regression accuracy 0.6300
AdaBoosT classification accuracy 0.6355
Random Forest classification accuracy 0.8177
{'penalty': 'l1', 'C': 0.37275937203149379}
{'n_estimators': 10, 'learning_rate': 0.5}
{'criterion': 'gini', 'n_estimators': 17, 'max_depth': 9}
from sklearn.linear_model import LogisticRegression
# Learning curve for the logistic-regression classifier, followed by a
# cross-validation score drawn as a horizontal reference line.
t0_c_log_learning_curve = t0 = time.time()
iterations = 50
title = "Learning Curves Logistic Regression"
# Cross validation over `iterations` (50) random shuffle splits to get smoother mean
# test and train score curves, each time with 30% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=iterations, test_size=0.3, random_state=0)
#estimator = LogisticRegression(penalty='l1', C=.5)
# Hyper-parameters are spelled out in full: L1 penalty with C=0.3728 and a fixed
# random_state (presumably copied from the earlier grid-search result - confirm).
estimator = LogisticRegression(C=0.3728, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=111,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# plot_learning_curve is defined elsewhere in the notebook.
plot_learning_curve(estimator, title, xc_train, yc_train, ylim=(.55, .7), cv=cv, n_jobs=1, scoring='accuracy', y_label='Accuracy')

# t1 is captured before the cross_val_score call below, so the reported
# execution time covers only the learning-curve plot.
t1_c_log_learning_curve = t1 = time.time()
# cv=iterations passes the integer 50 (a 50-fold split), not the ShuffleSplit object above.
scores = cross_val_score(estimator, xc_train, yc_train, cv=iterations)
plt.axhline(y=scores.mean(), linewidth=1, color='black')
print("Long term cross validation score = {}".format(scores.mean()))
print("Execution time: {0}".format(t1-t0))
{'penalty': 'l1', 'C': 0.37275937203149379}
Long term cross validation score = 0.6179872408293461
Execution time: 5.511536121368408
# Learning curve for the AdaBoost classifier, followed by a 10-fold
# cross-validation score drawn as a horizontal reference line.
t0_c_ada_learning_curve = t0 = time.time()

from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

title = "Learning Curves AdaBoost"
# Ten random shuffle splits with a fixed seed.
# NOTE(review): test_size=0.8 holds out 80% of the rows for validation, unlike the
# 0.2 / 0.3 used by the other learning-curve cells - confirm this is intended.
cv = ShuffleSplit(n_splits=10, test_size=0.8, random_state=0)
# learning_rate and n_estimators are hard-coded here rather than read from a grid search.
estimator = AdaBoostClassifier(learning_rate=.5, n_estimators=10, random_state=None)
#estimator = KNeighborsClassifier(leaf_size=10)
# plot_learning_curve is defined elsewhere in the notebook.
plot_learning_curve(estimator, title, xc_train, yc_train, ylim=(0, 1), cv=cv, n_jobs=4, scoring='accuracy', y_label='Accuracy')

# t1 is captured before the cross_val_score call below, so the reported
# execution time covers only the learning-curve plot.
t1_c_ada_learning_curve = t1 = time.time()
from sklearn.model_selection import cross_val_score
# cv=10 is a plain 10-fold split, independent of the ShuffleSplit object above.
scores = cross_val_score(estimator, xc_train, yc_train, cv=10)
plt.axhline(y=scores.mean(), linewidth=1, color='black')
print("Long term cross validation score = {}".format(scores.mean()))
print("Execution time: {0}".format(t1-t0))
{'n_estimators': 10, 'learning_rate': 0.5}
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.5, n_estimators=10, random_state=None)
Long term cross validation score = 0.6132794843784248
Execution time: 1.3455469608306885
t0_c_rf_learning_curve = t0 = time.time()

from sklearn.ensemble import RandomForestClassifier
title = "Learning Curves Random Forest"
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
#estimator = RandomForestClassifier(max_depth = 5, n_estimators=3, criterion='entropy')
estimator = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=17, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
plot_learning_curve(estimator, title, xc_train, yc_train, (0.5, 1), cv=cv, n_jobs=4, scoring='accuracy', y_label='Accuracy')

t1_c_rf_learning_curve = t1 = time.time()

#from sklearn.model_selection import cross_val_score
scores = cross_val_score(estimator, xc_train, yc_train, cv=10)
plt.axhline(y=scores.mean(), linewidth=1, color='black')
print("Long term cross validation score = {}".format(scores.mean()))
print("Execution time: {0}".format(t1-t0))
{'criterion': 'gini', 'n_estimators': 17, 'max_depth': 9}
Long term cross validation score = 0.616686733909331
Execution time: 2.3254339694976807