Machine Learning Project: Linear Regression & Beyond

We first create a cell importing all the libraries we will need for our project: pandas to import and manipulate data easily through DataFrames, NumPy for linear algebra & vectorized operations over arrays, Matplotlib for displaying 2D plots, Seaborn for displaying correlation heatmaps, and scikit-learn for model engineering & some extra utilities.

import time
from math import sqrt

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

We set some parameters to increase the size of Matplotlib figures and to widen pandas print outputs so they can be displayed properly in the notebook.

from pylab import rcParams
rcParams['figure.figsize'] = 20, 5

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

We create a function to display and analyse the results of our regressions feature by feature (with 2D density heatmaps of the data and lines corresponding to the coefficient of each weight).

def plot_weights_data_biasless(plt, X, y, w, title, same_scale = True):
    intercept = w[0]
    w = w[1:]

    n = len(w)
    fig, axs = plt.subplots(2, n)
    fig.suptitle(title, fontsize=20)
    y_scale_line = (-1*((max(y)-min(y))/2), (max(y)-min(y))/2)
    y_scale = (min(y), max(y))

    x_minimums = X.T.min(axis = 1)
    x_maximums = X.T.max(axis = 1)
    x_size = x_maximums - x_minimums
    max_size = max(x_size)

    x = np.linspace(-100, 100, 100)
    for i in range(n):
        if same_scale:
            diff_size = max_size - x_size[i]
            x_scale = (x_minimums[i] - diff_size/2, x_maximums[i] + diff_size/2)            # We make it so all x scales are on the same scale

            if (n > 1):
                axs[0][i].set_xlim(x_scale)
                axs[0][i].set_ylim(y_scale)
            else:
                axs[0].set_xlim(x_scale)
                axs[0].set_ylim(y_scale)

            heatmap, xedges, yedges = np.histogram2d(np.concatenate((X[:, i], x_scale)),    # We add a point in the lower-left corner and one in the upper-right corner of the heatmap
                                                     np.concatenate((y, y_scale)), bins=50) # so all heatmaps will have the same edges and be displayed correctly
        else:
            heatmap, xedges, yedges = np.histogram2d(X[:, i], y, bins=50)

        extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

        if (n > 1):
            axs[0][i].imshow(heatmap.T, extent=extent, origin='lower')

            axs[1][i].set_xlim((-2, 2))
            axs[1][i].set_ylim(y_scale_line)
            axs[1][i].plot(x, w[i]*x, c="red", linewidth=2)
        else:
            axs[0].imshow(heatmap.T, extent=extent, origin='lower')

            axs[1].set_xlim((-2, 2))
            axs[1].set_ylim(y_scale_line)
            axs[1].plot(x, w[i]*x, c="red", linewidth=2)

    fig.tight_layout()

First Dataset: Analysis & Model Fitting of the Powerplant Dataset

The dataset can be found at: https://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant. It contains 4 feature variables and one output variable.

def load_data_excel(excel_path, y_label, drops=[]):
    df = pd.read_excel(excel_path, sheet_name=None)  # Excel datasets might be spread over multiple sheets. To concatenate all the sheets into one DataFrame,
    df = pd.concat(df, ignore_index=True)            # we use the "sheet_name=None" argument when reading and then pd.concat with "ignore_index=True"

    for drop in drops:                               # We add an optional "drops" argument, telling which columns to drop as soon as the data is loaded
        df = df.drop(drop, axis=1)

    y = df[y_label].to_numpy()                       # y_label tells which column corresponds to the output of our ML model, the value we will try to predict
    X = df.drop(y_label, axis=1).to_numpy()

    return X, y, df
X, y, df = load_data_excel("powerplant.xlsx", "PE")

print(df.columns)
print()
print(df.sample(10))
print()
print(df.describe())
Index(['AT', 'V', 'AP', 'RH', 'PE'], dtype='object')

          AT      V       AP     RH      PE
36110  14.84  41.48  1017.26  63.42  460.30
2224   29.64  69.51  1012.13  44.65  428.69
19473  29.77  69.98  1013.32  69.60  430.65
22812  22.95  63.94  1013.87  80.37  447.13
37436  25.68  59.44  1012.33  67.29  444.36
2351   23.43  61.47  1007.84  87.11  448.01
18592   7.61  39.37  1015.84  83.73  483.74
12341  19.69  39.72  1001.49  60.34  456.55
16187   6.59  39.37  1020.34  77.92  488.17
18744  16.54  42.86  1014.85  78.00  465.26

                 AT             V            AP            RH            PE
count  47840.000000  47840.000000  47840.000000  47840.000000  47840.000000
mean      19.651231     54.305804   1013.259078     73.308978    454.365009
std        7.452162     12.707362      5.938535     14.599658     17.066281
min        1.810000     25.360000    992.890000     25.560000    420.260000
25%       13.510000     41.740000   1009.100000     63.327500    439.750000
50%       20.345000     52.080000   1012.940000     74.975000    451.550000
75%       25.720000     66.540000   1017.260000     84.830000    468.430000
max       37.110000     81.560000   1033.300000    100.160000    495.760000

Part I: Linear Regression from Scratch

To normalize the features, we implement a function with two modes, "min-max" or "z-score", to use either of those methods for feature scaling. The function also adds a 1s column for the bias (intercept) of the linear regression.
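
Written out, the two column-wise transformations implemented below are

$$x'_{\text{min-max}} = \frac{x - \min(x)}{\max(x) - \min(x)}, \qquad x'_{\text{z-score}} = \frac{x - \mu}{\sigma},$$

where $\mu$ and $\sigma$ are the mean and standard deviation of the column.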

def normalizeFeatures(X, mode="min-max"):   # Put all columns (features) on the same scale and bring them into a comparable range (e.g. [0, 1] for min-max)
                                            # and add a 1s column for the factor associated with the bias

    X_t = X.T # Go from the individual points as lines to columns as lines (by default numpy operations are applied line by line)
              #
              # x11, x12, x13, x14    x11, x21, x31
              # x21, x22, x23, x24 -> x12, x22, x32
              # x31, x32, x33, x34    x13, x23, x33
              #                       x14, x24, x34

    # For normalization, we suppose no variable always has the same value (otherwise there would be a division by 0)
    # If a variable always has the same value, it should be removed from the dataset, as it is pointless for the linear regression


    if (mode == "min-max"): # Feature scaling using minimums and maximums
        minimums = X_t.min(axis = 1)[:, None] # Create a new axis to convert scalars to 1-element arrays
        maximums = X_t.max(axis = 1)[:, None] # Create a new axis to convert scalars to 1-element arrays
        X_t = (X_t - minimums) / (maximums - minimums)

    elif (mode == "z-score"): # Feature scaling using z_scores
        X_t = (X_t - X_t.mean(axis = 1)[:, None]) / X_t.std(axis = 1)[:, None]

    X = X_t.T
    X = np.c_[np.ones((X.shape[0], 1)), X] # Add a 1s column to the new scaled matrix for the bias for matrix multiplications

    return X

We implement two cost functions: one using the classical least-squares logic and, just for fun, one using the absolute value of the error (which also defines a convex function).
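
Written out (with $m$ the number of samples and $x_i$ the $i$-th row of $X$), the two cost functions implemented below are

$$J_{\text{squares}}(w) = \frac{1}{2m}\sum_{i=1}^{m}\left(x_i^\top w - y_i\right)^2, \qquad J_{\text{absolute}}(w) = \frac{1}{2m}\sum_{i=1}^{m}\left|x_i^\top w - y_i\right|.$$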

For the remainder of this notebook, we will use the 'at' operator (@) for matrix multiplication (https://www.python.org/dev/peps/pep-0465/)

def costFunction(w, X, y, function="squares"):
    error = y - (X @ w)                                # The cost function returns a value that is a convex function of the error (usually the sum of its squares)
    if (function == "squares"):
        return (1/(2*len(X))) * (error.T @ error)      # Here we implement the least squares cost function
    elif (function == "absolute"):
        return (1/(2*len(X))) * np.sum(np.abs(error))  # Here, just for fun, we implement an absolute value cost function (convex as well, corresponding
                                                       # to the sum of the absolute values of the error)

For the implementation of the gradient descent, we follow the general algorithm: while the maximum number of iterations has not been reached and the cost difference isn't small enough, we compute new weights, equal to the previous weights minus the learning rate times the gradient of the cost function with respect to the weights. To simplify the calculations "from scratch", we use matrix multiplications to model the component-wise subtraction of the gradient. Depending on the cost function used, we need the corresponding gradient (derivative) formula with respect to w. When the cost difference becomes negative (meaning the learning rate (α) is too large, as the function is convex), we revert to the previous weights and decrease the learning rate.
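
In matrix form, the update applied at each iteration (matching the code below) is

$$w \leftarrow w - \alpha\,\nabla J(w), \qquad \nabla J_{\text{squares}}(w) = \frac{1}{m}\,X^\top(Xw - y), \qquad \nabla J_{\text{absolute}}(w) = \frac{1}{2m}\,X^\top \operatorname{sign}(Xw - y).$$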

def gradientDescent(alpha, w, X, y, iterations, delta, print_every=500, print_first=10, function="squares"):
    previous_cost = -1
    curr_cost = -1
    i = 0
    costs = []
    decrease_factor = 0.5

    m = len(X)

    previous_weights = None

    start = time.time() * 1000
    first_print = True
    while (i < iterations and (i < 1 or previous_cost-curr_cost > delta or previous_cost-curr_cost < 0)):
        if (previous_cost-curr_cost < 0 and i > 1):
            alpha = alpha * decrease_factor
            w = previous_weights


        # Since the cost function is the sum of the costs associated with each error, the gradient of the cost function
        # can be expressed as the sum of the derivatives of those per-error costs (the derivative of a sum is the sum of the derivatives)

        # The gradient can simply be calculated as ∇J(Xw - y) = X_t * J'(Xw - y) (because of the chain rule)

        error = X @ w - y                                                 # We use the (predicted y - true y) convention instead of the usual (true y - predicted y)
                                                                          # to simplify the calculations
        if (function == "squares"):
            gradient = (1/m) * (X.T @ error)                              # The derivative of g(x) = (f(x))² is g'(x) = f'(x) * (2*f(x))
        elif (function == "absolute"):
            error = (X @ w) - y
            gradient = (1/(2*m)) * (X.T @ (error / ( np.abs(error) )))    # The derivative of g(x) = abs(f(x)) is g'(x) = f'(x) * (f(x)/abs(f(x)))

        previous_weights = w
        w = w - alpha*gradient

        previous_cost = curr_cost
        curr_cost = costFunction(w, X, y, function)

        costs.append(curr_cost)                                           # We keep costs in memory to plot them afterwards

        if ((print_every > 0 and i%print_every == 0) or (print_first > 0 and i < print_first)): # We only print the "print_first" first iterations and then every
                                                                                                # "print_every" iterations
            if (first_print):
                print("{: >10} {: >40} {: >40} {: >20}".format("curr iter", "cost difference", "curr cost", "alpha"))
                print()
                first_print = False

            print("{: >10} {: >40} {: >40} {: >20}".format(i, previous_cost-curr_cost if previous_cost != -1 else "N/A", curr_cost, alpha))

        i+=1  

    end = time.time() * 1000

    print()
    print("Weights found: ", w)

    return np.round(w, 3), end-start, costs, i, curr_cost

We implement our own functions to calculate the mean absolute error, the root mean squared error and the R squared (coefficient of determination) of our regressions.

We can reuse our cost function implementation to calculate these errors. However, for the rest of the notebook, we will use scikit-learn's metric functions, which are quicker and more precise.
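
For reference (with $e_i = y_i - x_i^\top w$ and $\bar{y}$ the mean of $y$), the three metrics are

$$\mathrm{MAE} = \frac{1}{m}\sum_{i=1}^{m}|e_i|, \qquad \mathrm{RMSE} = \sqrt{\frac{1}{m}\sum_{i=1}^{m}e_i^2}, \qquad R^2 = 1 - \frac{\sum_{i} e_i^2}{\sum_{i} (y_i - \bar{y})^2},$$

which is why the implementations below simply rescale the cost functions defined earlier.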

def meanAbsoluteError(X, w, y):     # The sum of the absolute values of the error divided by the number of individuals
    return 2*costFunction(w, X, y, function="absolute")

def rootMeanSquaredError(X, w, y):  # The sum of the squares of the error divided by the number of individuals
    return sqrt(2*costFunction(w, X, y))

def r2(X, w, y):                    # One minus the residual sum of squares divided by the total sum of squares
    error = y - (X @ w)
    rss = error.T @ error

    var = y-y.mean()
    tss = var.T @ var

    return 1 - rss/tss

For our linear regression algorithm itself, we simply generate random weights between -10 and 10 (one component per column, plus one for the bias), normalize our features and launch our gradientDescent function.

def linear_regression_from_scratch(path, y_label):
    X, y, df = load_data_excel(path, y_label)

    w = np.random.randint(-10, 10, len(X[0]) + 1)
    X_normalized = normalizeFeatures(X)

    all_results = []
    for func in ["squares", "absolute"]:
        print(func.capitalize(), "Cost Function Gradient Descent:\n")
        results = gradientDescent(20, w.copy(), X_normalized, y, 100000, 0.000001, 1000, 10, func)
        all_results.append(results)

        print()
        print("Initial weights: ", w, "\n")
        print("Least-squares cost function:")

        print("Execution time: %.2f ms" % results[1])
        print("Final w: ", results[0])
        print("Iterations: ", results[3])
        print("Score: %.2f (%s)" % (results[4], func.capitalize()))

        print()
        print("Mean Absolute Error: %.2f (from scratch), %.2f (sklearn)" % ( meanAbsoluteError(X_normalized, results[0], y),
                                                                             mean_absolute_error(y, X_normalized @ results[0]) ))
        print("Root Mean Squared Error: %.2f (from scratch), %.2f (sklearn)" % ( rootMeanSquaredError(X_normalized, results[0], y),
                                                                                 mean_squared_error(y, X_normalized @ results[0], squared=False) ))
        print("R2 Score: %.2f (from scratch), %.2f (sklearn)" % ( r2(X_normalized, results[0], y),
                                                                  r2_score(y, X_normalized @ results[0]) ))

        print()

    fig, ((ax1, ax2)) = plt.subplots(1, 2)
    ax1.plot(np.linspace(0, all_results[0][3], all_results[0][3]), all_results[0][2])
    ax1.set_title("Least-squares cost function")
    ax2.plot(np.linspace(0, all_results[1][3], all_results[1][3]), all_results[1][2])
    ax2.set_title("Absolute value cost function")
    fig.tight_layout(pad=3.0)

    fig.suptitle("Evolution of the cost function with increasing iterations of the gradient descent", fontsize=20, y=1.08)

    plot_weights_data_biasless(plt, X, y, all_results[0][0], "Square Cost Function - w: " + str(all_results[0][0]))
    plot_weights_data_biasless(plt, X, y, all_results[1][0], "Absolute Value Cost Function - w: " + str(all_results[1][0]))

    return X, w, y, df

PE is the column containing our output variable. We launch the notebook in the same folder as the "powerplant.xlsx" file, which contains our dataset.

X, w, y, df = linear_regression_from_scratch("powerplant.xlsx", "PE")
Squares Cost Function Gradient Descent:

 curr iter                          cost difference                                curr cost                alpha

         0                                      N/A                       195415802.08396953                   20
         1                      -360631639165.71313                        360827054967.7971                   20
         2                       274769978290.70715                        86057076677.08997                 10.0
         3                       -37811873884371.17                       37897930961048.266                 10.0
         4                        29304899224758.07                        8593031736290.197                  5.0
         5                       -849444418868419.2                        858037450604709.5                  5.0
         6                        684313333963590.0                       173724116641119.44                  2.5
         7                      -3338431968640673.0                       3512156085281792.5                  2.5
         8                       2981245317567429.0                        530910767714363.4                 1.25
         9                      -1091582693326152.9                       1622493461040516.2                 1.25
      1000                     0.005701991068027823                        11.61951136847797                0.625
      2000                   5.7175138627130195e-05                        10.39609414663941                0.625

Weights found:  [502.53335951 -69.57700606 -13.27503187   2.61258824 -11.69118116]

Initial weights:  [-5 -1 -7  6 -3]

Least-squares cost function:
Execution time: 1137.20 ms
Final w:  [502.533 -69.577 -13.275   2.613 -11.691]
Iterations:  2881
Score: 10.38 (Squares)

Mean Absolute Error: 3.63 (from scratch), 3.63 (sklearn)
Root Mean Squared Error: 4.56 (from scratch), 4.56 (sklearn)
R2 Score: 0.93 (from scratch), 0.93 (sklearn)

Absolute Cost Function Gradient Descent:

 curr iter                          cost difference                                curr cost                alpha

         0                                      N/A                       220.26338062056482                   20
         1                       10.922426431309276                       209.34095418925554                   20
         2                       10.922426431309276                       198.41852775794626                   20
         3                       10.922426431309248                       187.49610132663702                   20
         4                       10.922426431309276                       176.57367489532774                   20
         5                        10.92242643130922                       165.65124846401852                   20
         6                       10.922426431309276                       154.72882203270925                   20
         7                       10.922426431309276                       143.80639560139997                   20
         8                       10.922426431309304                       132.88396917009067                   20
         9                       10.922426431309233                       121.96154273878143                   20
      1000                     0.001406858427301927                       2.1112984576017193                 10.0
      2000                   1.0585840893417853e-05                       1.8108684947986275                  5.0

Weights found:  [503.2389214  -70.67589557 -13.68880356   1.94425688 -11.28738914]

Initial weights:  [-5 -1 -7  6 -3]

Least-squares cost function:
Execution time: 1535.59 ms
Final w:  [503.239 -70.676 -13.689   1.944 -11.287]
Iterations:  2798
Score: 1.81 (Absolute)

Mean Absolute Error: 3.61 (from scratch), 3.61 (sklearn)
Root Mean Squared Error: 4.57 (from scratch), 4.57 (sklearn)
R2 Score: 0.93 (from scratch), 0.93 (sklearn)

[figure: evolution of the cost function with increasing iterations of the gradient descent]

[figure: square cost function weights vs. data, feature by feature]

[figure: absolute value cost function weights vs. data, feature by feature]

Part II: Linear Regression using Scikit Learn

To use scikit-learn's linear regression model, we simply scale our features using the fit_transform method of a MinMaxScaler instance (from the sklearn.preprocessing module) and create an instance of the LinearRegression class from the linear_model module.

Since the dataset contains a number of samples orders of magnitude greater than the number of columns, it is unnecessary to train and test the model on different subsets of the dataset.
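
As a quick sanity check of that claim, we could still hold out a test set and compare scores. A minimal sketch (the check_generalization helper below is only illustrative, not part of the original notebook):

def check_generalization(path="powerplant.xlsx", y_label="PE"):
    X, y, _ = load_data_excel(path, y_label)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)  # Fit the scaler on the training set only
    X_test = scaler.transform(X_test)        # and reuse it as-is on the test set

    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)

    # With ~48k samples for only 4 features, train and test R2 should be nearly identical
    print("Train R2: %.3f" % model.score(X_train, y_train))
    print("Test R2:  %.3f" % model.score(X_test, y_test))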

To retrieve the bias and the weights, we get the intercept_ and coef_ attributes from our LinearRegression instance. We create a function to display the statistics of our linear regression (errors & weights).

We obtain the same result with scikit-learn as with our from-scratch linear regression implementation; with an R squared of 0.93, we see that the model predicts the output very accurately.

def linear_regression_stats(w, X, y, y_pred, columns, title="Linear Regression", X_display=False):
    print("Weights:", len(w), "components")
    print()

    dict_w = dict(zip(["Bias"] + list(columns), w))
    print("Coefficients:")

    for k, v in dict_w.items():
        print(f'{k:<20} {v}')

    print()
    print("Mean absolute error: %.2f"
          % mean_absolute_error(y, y_pred))

    print("Root mean squared error: %.2f"
          % mean_squared_error(y, y_pred, squared=False))

    print("Coefficient of determination: %.2f"
          % r2_score(y, y_pred))

    if(X_display is not False):
        plot_weights_data_biasless(plt, X_display, y, w, title)


    return
def linear_regression_sklearn(path, y_label):
    X, y, df = load_data_excel(path, y_label)

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)

    w = [model.intercept_] + list(model.coef_)       # Keep the bias + coefficients so the function returns the fitted weights
    linear_regression_stats(w, X_normalized, y,
                            model.predict(X_normalized), df.drop(y_label, axis=1).columns, "Scikit Learn Linear Regression", X)

    return X, w, y, df
X, w, y, df = linear_regression_sklearn("powerplant.xlsx", "PE")
Weights: 5 components

Coefficients:
Bias                 502.69952629563875
AT                   -69.80621266422945
V                    -13.146102949136466
AP                   2.508771758184392
RH                   -11.790836077564483

Mean absolute error: 3.63
Root mean squared error: 4.56
Coefficient of determination: 0.93

[figure: Scikit Learn Linear Regression weights vs. data]

Part III: Linear Regression using the Normal Equation (from scratch)

In this part, we code all the basic linear algebra functions needed to compute the normal equation from scratch.

h = lambda X, w: [sum(x_j*w_j for x_j, w_j in zip(row, w)) for row in X]  # Hypothesis function: dot product of each row of X with the weight vector

def transposeMatrix(m):
    return list(map(list,zip(*m)))

def createOnesVector(n):
    vector = []
    for i in range(n):
        vector.append(1)

    return vector

def normalizeFeaturesMatrix(X):
    X_t = transposeMatrix(X)
    for column in X_t:
        maximum = max(column)
        minimum = min(column)

        for i in range(len(column)):
            column[i] = (column[i] - minimum)/(maximum-minimum)

    first_column = createOnesVector(len(X))
    X_t.insert(0, first_column)

    return transposeMatrix(X_t)

def productMatrix(X, Y):
    #print("(", len(X), ",", len(X[0]), "), (", len(Y), ",", len(Y[0]), ")")
    result = []
    for i in range(len(Y)):
       if isinstance(Y[i], (int, float)):
           Y[i] = [Y[i]]

    for i in range(len(X)):
       if isinstance(X[i], (int, float)):
           X[i] = [X[i]]

    for i in range(len(X)):
        result.append([])
        for j in range(len(Y[0])):
            result[i].append(0)

    # iterate through rows of X    

    for i in range(len(X)):
       # iterate through columns of Y
       for j in range(len(Y[0])):
           # iterate through rows of Y
           for k in range(len(Y)):
               result[i][j] += X[i][k] * Y[k][j]

    return result    

def getMatrixMinor(m,i,j):
    return [row[:j] + row[j+1:] for row in (m[:i]+m[i+1:])]

def getMatrixDeterminant(m):
    #base case for 2x2 matrix
    if len(m) == 2:
        return m[0][0]*m[1][1]-m[0][1]*m[1][0]

    determinant = 0
    for c in range(len(m)):
        determinant += ((-1)**c)*m[0][c]*getMatrixDeterminant(getMatrixMinor(m,0,c))
    return determinant

def getMatrixInverse(m):
    determinant = getMatrixDeterminant(m)
    #special case for 2x2 matrix:
    if len(m) == 2:
        return [[m[1][1]/determinant, -1*m[0][1]/determinant],
                [-1*m[1][0]/determinant, m[0][0]/determinant]]

    #find matrix of cofactors
    cofactors = []
    for r in range(len(m)):
        cofactorRow = []
        for c in range(len(m)):
            minor = getMatrixMinor(m,r,c)
            cofactorRow.append(((-1)**(r+c)) * getMatrixDeterminant(minor))
        cofactors.append(cofactorRow)
    cofactors = transposeMatrix(cofactors)
    for r in range(len(cofactors)):
        for c in range(len(cofactors)):
            cofactors[r][c] = cofactors[r][c]/determinant
    return cofactors

We apply the normal equation formula $w = (X^T X)^{-1}\,(X^T y)$ and then display the obtained weights.
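
The formula comes from setting the gradient of the (convex) least-squares cost to zero, so the critical point is the global minimum:

$$\nabla J(w) = \frac{1}{m}\,X^\top(Xw - y) = 0 \;\Longrightarrow\; X^\top X\, w = X^\top y \;\Longrightarrow\; w = (X^\top X)^{-1} X^\top y.$$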

def linear_regression_normal_equation(path, y_label):

    X, y, df = load_data_excel(path, y_label)

    X_normalized = normalizeFeaturesMatrix(X)
    X_t = transposeMatrix(X_normalized)

    w = productMatrix (
            getMatrixInverse (
                productMatrix(X_t, X_normalized)
            ),
                productMatrix(X_t, list(y))
        )

    linear_regression_stats(w, X_normalized, y, productMatrix(X_normalized, w),
                            df.drop(y_label, axis=1).columns, "Linear Regression Using the Normal Equation", X)

    return X, w, y, df

The weights are almost the same as before, and just like with scikit-learn, we obtain an R squared of 0.93, meaning our previous implementations were correct (for a small number of features, the normal equation computes the exact optimal weights at the global minimum of the cost function).

X, w, y, df = linear_regression_normal_equation("powerplant.xlsx", "PE")
Weights: 5 components

Coefficients:
Bias                 [502.69952629770523]
AT                   [-69.80621266353046]
V                    [-13.1461029532893]
AP                   [2.508771759332376]
RH                   [-11.790836079060682]

Mean absolute error: 3.63
Root mean squared error: 4.56
Coefficient of determination: 0.93

[figure: Linear Regression Using the Normal Equation weights vs. data]

Second Dataset: Analysis & Model Fitting of the Bike Sharing Dataset

The original dataset can be found at: https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset. The modified dataset can be found in the same project folder as the notebook. It has 12 feature columns and one output column ("demand", corresponding to "cnt" in the original dataset).

We hardcode our dataset loading with a custom function that replaces the dayOfWeek column (a string column) with an integer column, corresponding to the index of the day in a week.

def load_bike_data():
    df = pd.read_csv("Bike Rental UCI dataset.csv")

    days_dict = {"Mon": 0, "Tue": 1, "Wed": 2, "Thr": 3, "Fri": 4, "Sat": 5, "Sun": 6}
    df['dayOfWeek'] = df['dayOfWeek'].map(days_dict)

    y = df['demand'].to_numpy()
    X = df.drop('demand', axis=1).to_numpy()

    return X, y, df
X, y, df = load_bike_data()

print(df.columns)
print()
print(df.sample(10))

unique = []
unique_counts = []
for column in df.columns:
    unique_column = df[column].unique()
    string = " - ".join([str(i) for i in unique_column])
    if len(string) > 40:
        string = string[:36] + " ..."
    unique.append(string)
    unique_counts.append(len(unique_column))

df_unique = pd.DataFrame(zip(df.columns, unique_counts, [str(dtype) for dtype in df.dtypes], unique),
                         columns = ["Column names", "# Unique", "Data Type", "Unique values"])

print()
print(df_unique)
Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed', 'dayOfWeek', 'days', 'demand'], dtype='object')

       season  yr  mnth  hr  holiday  workingday  weathersit  temp   hum  windspeed  dayOfWeek  days  demand
4151        3   0     6  21        0           0           2  0.70  0.54     0.1045          6   172     183
4895        3   0     7  21        0           1           1  0.74  0.48     0.1045          2   203     245
7848        4   0    11  16        0           1           1  0.56  0.60     0.1940          0   327     264
9870        1   1     2   7        0           1           1  0.22  0.64     0.0000          1   411     279
2240        2   0     4   5        0           1           2  0.32  0.87     0.1940          4    93       9
10611       2   1     3   6        0           1           2  0.50  0.88     0.0000          4   442     116
17052       4   1    12   7        0           1           1  0.36  0.93     0.1343          1   710     355
5359        3   0     8   5        0           1           1  0.62  0.73     0.2836          1   223      30
15674       4   1    10   7        0           0           1  0.40  0.82     0.0000          5   653      65
5592        3   0     8  22        0           1           1  0.62  0.83     0.0000          3   233     147

   Column names  # Unique Data Type                             Unique values
0        season         4     int64                             1 - 2 - 3 - 4
1            yr         2     int64                                     0 - 1
2          mnth        12     int64  1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -  ...
3            hr        24     int64  0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 -  ...
4       holiday         2     int64                                     0 - 1
5    workingday         2     int64                                     0 - 1
6    weathersit         4     int64                             1 - 2 - 3 - 4
7          temp        50   float64  0.24 - 0.22 - 0.2 - 0.32 - 0.38 - 0. ...
8           hum        89   float64  0.81 - 0.8 - 0.75 - 0.86 - 0.76 - 0. ...
9     windspeed        30   float64  0.0 - 0.0896 - 0.2537 - 0.2836 - 0.2 ...
10    dayOfWeek         7     int64                 5 - 6 - 0 - 1 - 2 - 3 - 4
11         days       725     int64  0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 -  ...
12       demand       869     int64  16 - 40 - 32 - 13 - 1 - 2 - 3 - 8 -  ...

We display the intercorrelations of our columns to get some insights about each feature. As we can see, the hour of the day (hr), the temperature (temp), the year (yr) and workingday (indicating whether the day is a working day) have the highest correlations with our output column (demand).

Most other columns have a high intercorrelation with those variables, meaning they do not carry much information beyond them (i.e. a PCA would probably yield components very similar to those 4 variables without losing much information).
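
One way to check that intuition would be to look at how much variance a few principal components retain; a minimal sketch (the pca_check helper is only illustrative, assuming scikit-learn's PCA on the min-max scaled features):

from sklearn.decomposition import PCA

def pca_check(n_components=4):
    X, y, df = load_bike_data()
    X_scaled = MinMaxScaler().fit_transform(X)

    pca = PCA(n_components=n_components)
    pca.fit(X_scaled)

    # If a handful of components already retain most of the variance, the remaining columns are largely redundant
    print("Explained variance ratios:", np.round(pca.explained_variance_ratio_, 3))
    print("Total explained variance: %.2f" % pca.explained_variance_ratio_.sum())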

def heatmap_correlations():
    X, y, df = load_bike_data()
    corr = df.corr()
    sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True), square=True)  # Use the built-in bool (np.bool is removed in recent NumPy)
heatmap_correlations()

[figure: correlation heatmap of the bike sharing dataset columns]

Part I: Raw Linear Regression

We reuse the same functions/logic as before to implement the linear regressions for the bike sharing dataset.

def linear_regression_from_scratch_bike():

    X, y, df = load_bike_data()

    X_t = X.T

    w = np.random.randint(-10, 10, 13)
    X_normalized = normalizeFeatures(X)

    all_results = []
    for func in ["squares", "absolute"]:
        print(func.capitalize(), "Cost Function Gradient Descent:\n")
        results = gradientDescent(20, w.copy(), X_normalized, y, 100000, 0.000001, 5000, 10, func)
        all_results.append(results)

        print()
        print("Initial weights: ", w, "\n")
        print(func.capitalize(), "cost function:")

        print("Execution time: %.2f ms" % results[1])
        print("Final w: ", results[0])
        print("Iterations: ", results[3])
        print("Score: %.2f (%s)" % (results[4], func.capitalize()))

        print()
        print("Mean Absolute Error: %.2f (from scratch), %.2f (sklearn)" % ( meanAbsoluteError(X_normalized, results[0], y),
                                                                             mean_absolute_error(y, X_normalized @ results[0]) ))
        print("Root Mean Squared Error: %.2f (from scratch), %.2f (sklearn)" % ( rootMeanSquaredError(X_normalized, results[0], y),
                                                                                 mean_squared_error(y, X_normalized @ results[0], squared=False) ))
        print("R2 Score: %.2f (from scratch), %.2f (sklearn)" % ( r2(X_normalized, results[0], y),
                                                                  r2_score(y, X_normalized @ results[0]) ))

        print()

    fig, ((ax1, ax2)) = plt.subplots(1, 2)
    ax1.plot(np.linspace(0, all_results[0][3], all_results[0][3]), all_results[0][2])
    ax1.set_title("Least-squares cost function")
    ax2.plot(np.linspace(0, all_results[1][3], all_results[1][3]), all_results[1][2])
    ax2.set_title("Absolute value cost function")
    fig.tight_layout(pad=3.0)

    fig.suptitle("Evolution of the cost function with increasing iterations of the gradient descent", fontsize=20, y=1.08)

    return X, w, y, df
X, w, y, df = linear_regression_from_scratch_bike()
Squares Cost Function Gradient Descent:

 curr iter                          cost difference                                curr cost                alpha

         0                                      N/A                        116316723.4560647                   20
         1                       -652644168094.1022                        652760484817.5582                   20
         2                        493897747385.7277                        158862737431.8305                 10.0
         3                       -216847129729158.9                       217005992466590.75                 10.0
         4                       165650513642864.72                        51355478823726.03                  5.0
         5                  -1.6550308173240332e+16                   1.6601663652064058e+16                  5.0
         6                   1.2900086891849036e+16                       3701576760215023.0                  2.5
         7                  -2.6309898336357366e+17                   2.6680056012378867e+17                  2.5
         8                   2.1488794059869235e+17                    5.191261952509631e+16                 1.25
         9                   -6.761338970105887e+17                     7.28046516535685e+17                 1.25
      5000                    5.233006777416449e-05                       10077.580842341316               0.3125
     10000                   3.8329055314534344e-05                       10077.356015030264               0.3125
     15000                   2.8074040528736077e-05                         10077.1913407047               0.3125
     20000                    2.056277844530996e-05                        10077.07072531605               0.3125
     25000                   1.5061168596730568e-05                       10076.982380809832               0.3125
     30000                   1.1031521353288554e-05                       10076.917673048523               0.3125
     35000                    8.080014595179819e-06                       10076.870277974933               0.3125
     40000                     5.91819298279006e-06                         10076.8355635457               0.3125
     45000                    4.334768163971603e-06                       10076.810137029699               0.3125
     50000                   3.1749932531965896e-06                       10076.791513427637               0.3125
     55000                   2.3255179257830605e-06                        10076.77787260662               0.3125
     60000                    1.703321686363779e-06                       10076.767881413867               0.3125
     65000                   1.2475957191782072e-06                        10076.76056338422               0.3125

Weights found:  [  12.51829782   60.1667      122.54359936   38.08755315  177.15381016
  -24.37336462    4.51867719  -11.43225612  277.20917373 -196.86191416
   25.31465639    1.02458144  -82.59786157]

Initial weights:  [-4  2 -4  9  1 -4  1 -7 -8 -9 -4  6  3]

Squares cost function:
Execution time: 11037.94 ms
Final w:  [  12.518   60.167  122.544   38.088  177.154  -24.373    4.519  -11.432
  277.209 -196.862   25.315    1.025  -82.598]
Iterations:  68554
Score: 10076.76 (Squares)

Mean Absolute Error: 106.10 (from scratch), 106.10 (sklearn)
Root Mean Squared Error: 141.96 (from scratch), 141.96 (sklearn)
R2 Score: 0.39 (from scratch), 0.39 (sklearn)

Absolute Cost Function Gradient Descent:

 curr iter                          cost difference                                curr cost                alpha

         0                                      N/A                        83.01344989279826                   20
         1                        6.203046536422306                        76.81040335637596                   20
         2                        3.983805058385883                        72.82659829799007                   20
         3                        2.811305923512748                        70.01529237447733                   20
         4                       1.9462613790737606                        68.06903099540357                   20
         5                        1.340218542419592                        66.72881245298397                   20
         6                       0.9397803263304496                        65.78903212665352                   20
         7                       0.6634974083865899                        65.12553471826693                   20
         8                        0.479096686351312                        64.64643803191562                   20
         9                        0.361984622193134                        64.28445340972249                   20

Weights found:  [ 1.34204500e+01  3.74663675e+01  4.31477070e+01 -4.95932395e+00
  1.77097514e+02 -4.75378330e+00  3.97030899e-03  8.37545313e+00
  2.28844726e+02 -1.80766517e+02  1.79834489e+00  1.77187410e+01
  1.54569231e+01]

Initial weights:  [-4  2 -4  9  1 -4  1 -7 -8 -9 -4  6  3]

Absolute cost function:
Execution time: 836.27 ms
Final w:  [ 1.34200e+01  3.74660e+01  4.31480e+01 -4.95900e+00  1.77098e+02
 -4.75400e+00  4.00000e-03  8.37500e+00  2.28845e+02 -1.80767e+02
  1.79800e+00  1.77190e+01  1.54570e+01]
Iterations:  3175
Score: 50.58 (Absolute)

Mean Absolute Error: 101.15 (from scratch), 101.15 (sklearn)
Root Mean Squared Error: 147.68 (from scratch), 147.68 (sklearn)
R2 Score: 0.34 (from scratch), 0.34 (sklearn)

[figure: evolution of the cost function with increasing iterations of the gradient descent (bike sharing dataset)]

def linear_regression_scikit_bike():
    X, y, df = load_bike_data()

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)


    linear_regression_stats([model.intercept_] + list(model.coef_), X_normalized, y,
                            model.predict(X_normalized), df.drop("demand", axis=1).columns,
                            "Scikit Learn Linear Regression (Bike Sharing Dataset)")

    return X, w, y, df
X, w, y, df = linear_regression_scikit_bike()
Weights: 13 components

Coefficients:
Bias                 12.70222800138481
season               60.136022510799975
yr                   130.00364157932958
mnth                 44.9430599627203
hr                   177.1774698565802
holiday              -24.386075025499807
workingday           4.517243675373237
weathersit           -11.450442809762603
temp                 277.2098848446928
hum                  -196.8266250443891
windspeed            25.33532207207788
dayOfWeek            1.0187825756877227
days                 -97.41800717998706

Mean absolute error: 106.09
Root mean squared error: 141.96
Coefficient of determination: 0.39
def linear_regression_normal_equation_bike():
    X, y, df = load_bike_data()

    X_normalized = normalizeFeatures(X)
    X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, random_state = 1, test_size=0.2)

    start = time.time() * 1000

    w = np.linalg.inv( X_train.T @ X_train ) @ ( X_train.T @ y_train )
    print(len(w))

    end = time.time() * 1000


    linear_regression_stats(w, X_normalized, y_test, X_test @ w,
                            df.drop("demand", axis=1).columns,
                            "Linear Regression Using the Normal Equation (Bike Sharing Dataset)")

    return X, w, y, df
X, w, y, df = linear_regression_normal_equation_bike()
13
Weights: 13 components

Coefficients:
Bias                 15.079934916228012
season               59.339308082236016
yr                   138.8077656327514
mnth                 55.21086379577173
hr                   175.21107201612472
holiday              -27.156028085155583
workingday           5.675966661051689
weathersit           -12.155998101331193
temp                 276.5409816263464
hum                  -198.53619282883324
windspeed            20.51289595746175
dayOfWeek            1.864165235446272
days                 -116.36340328166261

Mean absolute error: 106.45
Root mean squared error: 141.72
Coefficient of determination: 0.40

Part II: Data Exploration

The normal equation is slightly more effective than the gradient-descent linear regression, but overall, the R squared coefficient found (0.4) is very mediocre. This is likely because our hr, yr, weathersit and season columns are treated as integer columns (with values in ascending order), even though there is no meaningful order relationship between the different states of those variables. If we knew there was a predefined linear relationship between the hour of the day and the demand, we could reorder the values so that each ascending value corresponds to a time of day with linearly more demand; in practice, we will need to re-engineer those variables in another way. We can visualize the difference between the predicted values of our linear model and the actual values by aggregating over our different "cyclical"/binary columns.

def data_exploration_bike():
    X, y, df = load_bike_data()

    w = np.linalg.inv( X.T @ X ) @ ( X.T @ y )
    y_pred = X @ w
    df['demand_pred'] = y_pred

    print('Coefficient of determination: %.2f'
      % r2_score(y, y_pred))

    print(len(df))
    print("Ten random samples from the dataset:\n")
    print(df.sample(10).head(10))


    cyclical_columns = ['season', 'weathersit', 'hr', 'dayOfWeek', 'workingday', 'holiday', 'yr']

    for feature in cyclical_columns:
        df_agg = df.groupby(feature).agg(["mean", "median", "var"])

        print("\n")
        print(df_agg[["demand", "demand_pred"]].head(10))

        bar_width = 0.3
        fig, axs = plt.subplots(1, len(df_agg["demand"].columns))

        for i, column in enumerate(df_agg["demand"].columns):

            axs[i].bar(df_agg.index - bar_width/2, df_agg["demand"][column], bar_width, label='y true')
            axs[i].bar(df_agg.index + bar_width/2, df_agg["demand_pred"][column], bar_width, label='y predicted')   
            axs[i].legend()
            axs[i].set_title(column, fontsize=20)

            axs[i].set_xticks(df_agg.index)

        fig.suptitle(feature + ": comparison between truth and prediction", fontsize=20, y=1.08)
data_exploration_bike()
Coefficient of determination: 0.39
17379
Ten random samples from the dataset:

       season  yr  mnth  hr  holiday  workingday  weathersit  temp   hum  windspeed  dayOfWeek  days  demand  demand_pred
5940        3   0     9   1        0           0           2  0.62  0.94     0.0000          5   247      83    42.268440
13530       3   1     7  23        0           0           1  0.68  0.79     0.1642          6   563     123   344.168922
10596       2   1     3  15        0           1           1  0.62  0.69     0.1940          3   441     304   270.436504
7899        4   0    11  19        0           1           2  0.32  0.57     0.2836          2   329     279   198.405554
12720       2   1     6   5        0           1           2  0.60  0.78     0.1940          1   530      37   167.817654
14341       3   1     8  18        0           0           2  0.66  0.69     0.4627          5   597     383   324.397225
6394        4   0     9   0        0           1           1  0.60  0.94     0.1940          3   266      40    58.495348
7996        4   0    12  20        0           0           1  0.36  0.76     0.1343          6   333     124   177.624117
2678        2   0     4  12        0           1           1  0.70  0.61     0.4179          1   111     186   212.237721
10301       1   1     3   7        0           0           1  0.22  0.44     0.2537          5   429      40   127.887607


            demand                      demand_pred                          
              mean median           var        mean      median           var
season                                                                       
1       111.114569   76.0  14214.364612  115.323382  113.846535   9681.434205
2       208.344069  165.0  35480.421290  193.216813  191.006952  11559.868944
3       236.016237  199.0  39089.888724  256.884292  256.776445  10180.924440
4       198.868856  155.5  33477.278708  189.066755  192.242005   9110.694564


                demand                      demand_pred                          
                  mean median           var        mean      median           var
weathersit                                                                       
1           204.869272    159  35905.616277  207.843434  209.899623  12795.372589
2           175.165493    133  27367.610731  162.502963  161.305560  10449.299623
3           111.579281     63  17897.368005  130.773596  127.849197  10234.572738
4            74.333333     36   6072.333333   26.230515    8.534965   5804.181395


        demand                      demand_pred                         
          mean median           var        mean      median          var
hr                                                                      
0    53.898072   40.0   1789.959251   79.768812   85.645790  6001.016718
1    33.375691   20.0   1124.846213   82.330754   86.826823  5836.261474
2    22.869930   11.0    706.424235   86.600493   90.410902  5629.431896
3    11.727403    6.0    175.276159   91.796335   95.027733  5295.320865
4     6.352941    6.0     17.171231   95.971317   98.929092  5150.567641
5    19.889819   19.0    174.260189   98.941965  102.702225  5278.485291
6    76.044138   76.0   3034.285342  104.598016  107.906228  5393.408450
7   212.064649  208.0  26063.498570  116.632991  119.821555  5866.947018
8   359.011004  385.0  55313.999879  134.790530  136.095544  6612.168659
9   219.309491  216.0   8780.337968  157.124994  158.203789  7065.198644


               demand                      demand_pred                          
                 mean median           var        mean      median           var
dayOfWeek                                                                       
0          183.744655    139  32225.336306  185.872767  185.649440  12337.375341
1          191.238891    147  35276.532630  193.463066  191.586918  12160.702330
2          191.130505    143  36440.696381  190.153178  186.061785  13269.690070
3          196.436665    154  35348.483335  197.400976  195.325764  12521.061083
4          196.135907    165  30302.765110  191.835377  189.328695  13197.053107
5          190.209793    129  32335.437053  184.625748  182.947089  13147.820909
6          177.468825    116  28280.378676  184.473269  182.196525  11919.571627


                demand                     demand_pred                          
                  mean median          var        mean      median           var
workingday                                                                      
0           181.405332    119  29878.44714  182.039717  180.097217  12510.035810
1           193.207754    151  34264.77789  193.207754  191.649945  12702.653684


            demand                      demand_pred                          
              mean median           var        mean      median           var
holiday                                                                      
0        190.42858    144  33117.242662  190.635819  188.930652  12666.995484
1        156.87000     97  24572.906914  156.870000  153.336933  11612.077618


        demand                      demand_pred                          
          mean median           var        mean      median           var
yr                                                                       
0   143.794448    109  17901.865772  144.199074  141.809477  10569.671592
1   234.666361    191  43643.781264  234.666361  231.637790  10674.391873

[figures: truth vs. prediction comparison bar charts (mean, median, variance) for season, weathersit, hr, dayOfWeek, workingday, holiday and yr]

As we can see, the variance of each variable is very badly estimated by our model, and the mean demand depending on the hour of the day (hr) is especially badly predicted; ensuring that our model can accurately predict the demand depending on the hour is critical to obtaining a good R squared coefficient, as the hr column is highly correlated with the demand. To find out how important the hr column is for predicting the demand, we will try different models.

Part III: Model Testing

First, we implement a model with only the temp variable (the non-categorical column most correlated with demand).

def model_temp_only():
    X, y, df = load_bike_data()

    df = df[["temp", "demand"]]

    y = df["demand"].to_numpy()
    X = df.drop("demand", axis=1).to_numpy()

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)

    linear_regression_stats([model.intercept_] + list(model.coef_), X_normalized, y,
                            model.predict(X_normalized), df.drop("demand", axis=1).columns,
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - temp only)")

    return X, w, y, df
X, w, y, df = model_temp_only()
Weights: 2 components

Coefficients:
Bias                 7.590302332541086
temp                 373.6690238139917

Mean absolute error: 125.52
Root mean squared error: 165.86
Coefficient of determination: 0.16

Temperature alone already describes the demand with an R squared of 0.16. Now, let us see how relevant the hour is on its own by one-hot encoding the variable in the dataset. This will create 24 variables, each corresponding to one hour of the day; since hour is the variable most correlated with our output variable, this should be effective.

def model_hour_only():
    X, y, df = load_bike_data()

    hour = pd.get_dummies(df.hr, prefix="hr")
    df = pd.concat([df["demand"], hour], axis=1)

    y = df["demand"].to_numpy()
    X = df.drop("demand", axis=1).to_numpy()

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)

    linear_regression_stats([model.intercept_] + list(model.coef_), X_normalized, y,
                            model.predict(X_normalized), df.drop("demand", axis=1).columns,
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - hour only)")

    return X, w, y, df
X, w, y, df = model_hour_only()
Weights: 25 components

Coefficients:
Bias                 374371697728217.06
hr_0                 -374371697728163.7
hr_1                 -374371697728183.25
hr_2                 -374371697728196.94
hr_3                 -374371697728204.44
hr_4                 -374371697728210.6
hr_5                 -374371697728195.8
hr_6                 -374371697728140.94
hr_7                 -374371697728005.2
hr_8                 -374371697727857.2
hr_9                 -374371697727997.1
hr_10                -374371697728043.6
hr_11                -374371697728008.0
hr_12                -374371697727964.0
hr_13                -374371697727963.0
hr_14                -374371697727976.2
hr_15                -374371697727965.4
hr_16                -374371697727905.25
hr_17                -374371697727756.44
hr_18                -374371697727791.8
hr_19                -374371697727905.2
hr_20                -374371697727991.44
hr_21                -374371697728044.8
hr_22                -374371697728086.06
hr_23                -374371697728129.4

Mean absolute error: 88.08
Root mean squared error: 128.07
Coefficient of determination: 0.50

The hr variable has an R squared score of 0.5 on its own! This means it carries most of the information in our dataset, even more than its correlation alone would suggest. Next, we will test a model with all variables one-hot encoded (except days, which would clearly overfit our data: there would be one dummy variable for every day, leaving only about 24 samples per variable, which is far too few).

Also, as we can see, the weights take strangely large values. This is because, with one-hot encoding, exactly one of the dummy variables is "on" at any given time, so the bias can take any value: the weights on the dummy variables can always correct it back to the "real" bias value. With multiple categorical variables one-hot encoded, the same happens again; since one dummy of each encoded variable is always 1, the weights of one group can drift from their original values as long as they are compensated by another group. Thus, from now on, the coefficients of the model will not be directly interpretable.
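
One common way around this collinearity (not used in the rest of this notebook, so the results below keep all dummy columns) is to drop one level per categorical variable with the drop_first argument of pd.get_dummies, letting the dropped level be absorbed into the bias. A minimal sketch on the hour-only model:

def model_hour_only_drop_first():
    X, y, df = load_bike_data()

    hour = pd.get_dummies(df.hr, prefix="hr", drop_first=True)  # hr_0 is dropped and absorbed into the bias
    df = pd.concat([df["demand"], hour], axis=1)

    y = df["demand"].to_numpy()
    X = df.drop("demand", axis=1).to_numpy()

    model = linear_model.LinearRegression()
    model.fit(X, y)

    # Without the redundant column, the bias and coefficients stay at interpretable magnitudes
    print("Bias:", model.intercept_)
    print("Coefficients:", np.round(model.coef_, 2))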

def model_all_variables():
    X, y, df = load_bike_data()

    season = pd.get_dummies(df.season, prefix="season")
    month = pd.get_dummies(df.mnth, prefix="mnth")
    year = pd.get_dummies(df.yr, prefix="yr")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    dayOfWeek = pd.get_dummies(df.dayOfWeek, prefix="dayOfWeek")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, month, year, workingday,
                    weather, dayOfWeek, holiday, hour], axis=1)

    y = df["demand"].to_numpy()
    X = df.drop("demand", axis=1).to_numpy()

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)

    linear_regression_stats([model.intercept_] + list(model.coef_), X_normalized, y,
                            model.predict(X_normalized), df.drop("demand", axis=1).columns,
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - all variables)")

    return X, w, y, df
X, w, y, df = model_all_variables()
Weights: 61 components

Coefficients:
Bias                 -3287449034800647.5
temp                 229.47279613872115
hum                  -82.84376702764325
windspeed            -30.467239847406844
season_1             93204320900616.83
season_2             93204320900656.0
season_3             93204320900648.75
season_4             93204320900684.06
mnth_1               107485739536674.39
mnth_2               107485739536678.69
mnth_3               107485739536689.56
mnth_4               107485739536680.4
mnth_5               107485739536692.92
mnth_6               107485739536677.94
mnth_7               107485739536658.33
mnth_8               107485739536678.67
mnth_9               107485739536704.48
mnth_10              107485739536690.25
mnth_11              107485739536665.47
mnth_12              107485739536668.36
yr_0                 1577210655215961.0
yr_1                 1577210655216045.0
workingday_0         1531539644338810.8
workingday_1         -17779893992024.58
weather_state_1      22423555517236.812
weather_state_2      22423555517226.04
weather_state_3      22423555517171.22
weather_state_4      22423555517172.402
dayOfWeek_0          783327032305122.9
dayOfWeek_1          783327032305123.9
dayOfWeek_2          783327032305127.2
dayOfWeek_3          783327032305125.9
dayOfWeek_4          783327032305130.5
dayOfWeek_5          -765992506025705.1
dayOfWeek_6          -765992506025721.8
holiday_0            832832862605260.6
holiday_1            -716486675725601.2
hr_0                 -111255237288266.38
hr_1                 -111255237288282.67
hr_2                 -111255237288292.25
hr_3                 -111255237288303.0
hr_4                 -111255237288305.0
hr_5                 -111255237288288.69
hr_6                 -111255237288230.12
hr_7                 -111255237288094.53
hr_8                 -111255237287954.88
hr_9                 -111255237288102.25
hr_10                -111255237288157.12
hr_11                -111255237288130.94
hr_12                -111255237288093.22
hr_13                -111255237288097.62
hr_14                -111255237288113.88
hr_15                -111255237288104.5
hr_16                -111255237288042.69
hr_17                -111255237287888.69
hr_18                -111255237287920.23
hr_19                -111255237288028.42
hr_20                -111255237288108.75
hr_21                -111255237288157.62
hr_22                -111255237288194.47
hr_23                -111255237288233.38

Mean absolute error: 75.11
Root mean squared error: 101.65
Coefficient of determination: 0.69

With all the variables in our dataset, we obtain an R2 coefficient of 0.69, which is mediocre but seems to be about the most that can be attained with a linear model. However, the weight vector now has 61 components (39 more than with the hour alone), for only 0.19 gained on the R2. We will try to trim the model down to get the best trade-off between R2 and number of variables.
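
(As an aside, the enormous, mutually offsetting coefficients printed above come from the dummy-variable trap: with an intercept plus every level of every one-hot encoded variable, the columns of the design matrix are collinear, so the individual dummy weights are not identifiable, which is why the fitted weights are huge but largely cancel each other out. Below is a minimal, hypothetical sketch of how this could be avoided by dropping one level per categorical; it reuses the load_bike_data helper defined earlier and is not part of the original analysis.)

def model_all_variables_drop_first():
    # Hypothetical variant: drop the first level of every categorical so the
    # dummies are no longer collinear with the intercept.
    X, y, df = load_bike_data()

    categoricals = ["season", "mnth", "yr", "workingday", "dayOfWeek", "holiday", "weathersit", "hr"]
    df = pd.get_dummies(df[["temp", "hum", "windspeed", "demand"] + categoricals],
                        columns=categoricals, drop_first=True)

    y = df["demand"].to_numpy()
    X_normalized = MinMaxScaler().fit_transform(df.drop("demand", axis=1).to_numpy())

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)
    return [model.intercept_] + list(model.coef_)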

To choose which variables to remove, we can reason about the data: workingday carries most of the relevant information of dayOfWeek (what differentiates weekdays from the weekend is whether they are working days, and consumer habits are unlikely to change drastically between weekdays), so we can drop dayOfWeek, and also holiday (there are very few non-working days that are not holidays). Likewise, most of the information in mnth is already carried by season (itself largely reflected in weather and temperature, which are the variables most correlated with it on the correlation heatmap, logically because what mostly changes between months is the climate) and by yr (which captures long-term changes in consumer behaviour), so we will drop the mnth column.

def model_all_variables_trimmed():
    X, y, df = load_bike_data()

    print(df.columns)
    season = pd.get_dummies(df.season, prefix="season")
    year = pd.get_dummies(df.yr, prefix="yr")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, year, workingday,
                    weather, hour], axis=1)

    y = df["demand"].to_numpy()
    X = df.drop("demand", axis=1).to_numpy()

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)

    w = [model.intercept_] + list(model.coef_)
    linear_regression_stats(w, X_normalized, y,
                            model.predict(X_normalized), df.drop("demand", axis=1).columns,
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - trimmed model)")

    return X, w, y, df
X, w, y, df = model_all_variables_trimmed()
Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed', 'dayOfWeek', 'days', 'demand'], dtype='object')
Weights: 40 components

Coefficients:
Bias                 31410054584152.043
temp                 239.13103129069313
hum                  -70.11770358086514
windspeed            -28.73115471762346
season_1             -54953805315583.93
season_2             -54953805315540.43
season_3             -54953805315555.62
season_4             -54953805315517.93
yr_0                 26837883695852.93
yr_1                 26837883695938.3
workingday_0         1152526478866.2336
workingday_1         1152526478873.348
weather_state_1      -4313108840646.7466
weather_state_2      -4313108840657.415
weather_state_3      -4313108840712.563
weather_state_4      -4313108840714.447
hr_0                 -133550602716.63249
hr_1                 -133550602734.13013
hr_2                 -133550602743.27393
hr_3                 -133550602753.81503
hr_4                 -133550602757.17523
hr_5                 -133550602740.45392
hr_6                 -133550602681.51718
hr_7                 -133550602546.36156
hr_8                 -133550602405.72298
hr_9                 -133550602553.23534
hr_10                -133550602607.84065
hr_11                -133550602582.18098
hr_12                -133550602542.70438
hr_13                -133550602547.62059
hr_14                -133550602563.44862
hr_15                -133550602554.05779
hr_16                -133550602492.03513
hr_17                -133550602338.4451
hr_18                -133550602370.33269
hr_19                -133550602479.08385
hr_20                -133550602558.72362
hr_21                -133550602608.30258
hr_22                -133550602645.45706
hr_23                -133550602684.42998

Mean absolute error: 75.61
Root mean squared error: 102.51
Coefficient of determination: 0.68

Despite removing 21 components from the weight vector, we have lost almost no information: the R2 coefficient has only dropped by 0.01. We can trim the weight vector further. We will remove the humidity (hum) and wind speed (windspeed) variables, as they are barely correlated with demand, and the weathersit column, as temperature and season already describe most of the weather changes.
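
The weak correlation of humidity and wind speed with the demand can be double-checked directly on the raw data; a quick sketch, assuming the frame returned by the load_bike_data helper defined earlier:

_, _, df_check = load_bike_data()
# Pearson correlation of the weather-related variables with the demand
print(df_check[["temp", "hum", "windspeed", "demand"]].corr()["demand"])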

def model_all_variables_most_trimmed():
    X, y, df = load_bike_data()

    print(df.columns)
    season = pd.get_dummies(df.season, prefix="season")
    year = pd.get_dummies(df.yr, prefix="yr")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "demand"]], season, year, workingday, hour], axis=1)

    y = df["demand"].to_numpy()
    X = df.drop("demand", axis=1).to_numpy()

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)

    w = [model.intercept_] + list(model.coef_)
    linear_regression_stats(w, X_normalized, y,
                            model.predict(X_normalized), df.drop("demand", axis=1).columns,
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - most trimmed model)")

    return X, w, y, df
X, w, y, df = model_all_variables_most_trimmed()
Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed', 'dayOfWeek', 'days', 'demand'], dtype='object')
Weights: 34 components

Coefficients:
Bias                 1108457999753337.8
temp                 253.38856765867126
season_1             -905497848461046.9
season_2             -905497848461009.9
season_3             -905497848461024.1
season_4             -905497848460986.8
yr_0                 -186147775733819.84
yr_1                 -186147775733731.4
workingday_0         -63785294351338.875
workingday_1         -63785294351334.02
hr_0                 46972918792728.74
hr_1                 46972918792709.484
hr_2                 46972918792700.08
hr_3                 46972918792688.24
hr_4                 46972918792684.21
hr_5                 46972918792701.87
hr_6                 46972918792759.45
hr_7                 46972918792894.46
hr_8                 46972918793037.2
hr_9                 46972918792891.93
hr_10                46972918792839.81
hr_11                46972918792868.6
hr_12                46972918792909.39
hr_13                46972918792905.984
hr_14                46972918792890.87
hr_15                46972918792900.09
hr_16                46972918792961.78
hr_17                46972918793113.805
hr_18                46972918793081.04
hr_19                46972918792971.56
hr_20                46972918792890.195
hr_21                46972918792839.76
hr_22                46972918792801.65
hr_23                46972918792760.59

Mean absolute error: 77.38
Root mean squared error: 105.57
Coefficient of determination: 0.66

Six more components removed, and only 0.02 lost on the R2 score; every remaining variable is now essential to the model. We will create one last model without the yr column, because in the logic of our prediction this column overfits the dataset: as we saw in the data exploration, the difference in demand between the two years is large, and it makes no sense to assume a linear relationship between the year and the demand. Since the year is too specific to this dataset, we remove it to avoid overfitting on future predictions.

def model_final():
    X, y, df = load_bike_data()

    print(df.columns)
    season = pd.get_dummies(df.season, prefix="season")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "demand"]], season, workingday, hour], axis=1)

    y = df["demand"].to_numpy()
    X = df.drop("demand", axis=1).to_numpy()

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)

    w = [model.intercept_] + list(model.coef_)
    linear_regression_stats(w, X_normalized, y,
                            model.predict(X_normalized), df.drop("demand", axis=1).columns,
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - final model)")

    return X, w, y, df
X, w, y, df = model_final()
Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed', 'dayOfWeek', 'days', 'demand'], dtype='object')
Weights: 32 components

Coefficients:
Bias                 695432919332540.4
temp                 284.0390424705157
season_1             12670818751740.244
season_2             12670818751768.783
season_3             12670818751749.348
season_4             12670818751794.34
workingday_0         -454697299235976.8
workingday_1         -454697299235972.2
hr_0                 -253406438848405.47
hr_1                 -253406438848423.97
hr_2                 -253406438848433.1
hr_3                 -253406438848444.16
hr_4                 -253406438848447.88
hr_5                 -253406438848430.97
hr_6                 -253406438848373.38
hr_7                 -253406438848238.9
hr_8                 -253406438848096.7
hr_9                 -253406438848242.84
hr_10                -253406438848295.3
hr_11                -253406438848267.28
hr_12                -253406438848227.06
hr_13                -253406438848231.03
hr_14                -253406438848246.84
hr_15                -253406438848237.34
hr_16                -253406438848175.53
hr_17                -253406438848023.5
hr_18                -253406438848055.22
hr_19                -253406438848164.38
hr_20                -253406438848245.5
hr_21                -253406438848295.22
hr_22                -253406438848333.0
hr_23                -253406438848373.62

Mean absolute error: 82.43
Root mean squared error: 114.41
Coefficient of determination: 0.60

Our final model has a coefficient of determination of 0.60, which is mediocre and cannot be improved much with a plain linear regression. However, we can use a trick to increase the precision of the linear model by adding a slope variable (the change in demand over the last two hours) and a "memory" of the most recent demand. Hypothetically, if the bike-sharing system constantly knows the current demand, such predictions of the demand for the next hour could be made in real time, so this model would make sense in some situations.

def model_absolute():
    X, y, df = load_bike_data()

    def coef_dir(df):
        coef_hour=[]

        for i in range (0, np.size(df.to_numpy(),0)):
            if(i>=2):
                coef_hour.append((df['demand'].iloc[i-2]-df['demand'].iloc[i-1])/((i-2)-(i-1)))

        df = df.loc[2:, :]
        df.insert(1, 'coef_hour', coef_hour)

        return df

    df = coef_dir(df)
    previous_hour = df['demand'].shift(periods=-1)
    df['previous_hour'] = previous_hour
    df = df[:-1]

    season = pd.get_dummies(df.season, prefix="season")
    month = pd.get_dummies(df.mnth, prefix="mnth")
    year = pd.get_dummies(df.yr, prefix="yr")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    dayOfWeek = pd.get_dummies(df.dayOfWeek, prefix="dayOfWeek")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "coef_hour", "previous_hour", "demand"]], season, month, year, workingday,
                    weather, dayOfWeek, holiday, hour], axis=1)


    X = df.drop(columns='demand', axis=1)

    y = df.demand
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size=0.2)

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_normalized, y)

    w = [model.intercept_] + list(model.coef_)
    linear_regression_stats(w, X_normalized, y,
                            model.predict(X_normalized), df.drop("demand", axis=1).columns,
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - all variables + slope & memory)")

    return X, w, y, df
X, w, y, df = model_absolute()
Weights: 63 components

Coefficients:
Bias                 1253686445631265.2
temp                 84.62225251644675
hum                  -41.97547493316782
windspeed            -11.268458268016047
coef_hour            394.3025525017671
previous_hour        640.3950937247846
season_1             79732366875754.95
season_2             79732366875768.27
season_3             79732366875765.58
season_4             79732366875778.56
mnth_1               -245848678579735.16
mnth_2               -245848678579734.03
mnth_3               -245848678579730.56
mnth_4               -245848678579734.22
mnth_5               -245848678579729.0
mnth_6               -245848678579735.34
mnth_7               -245848678579741.84
mnth_8               -245848678579734.16
mnth_9               -245848678579724.84
mnth_10              -245848678579730.4
mnth_11              -245848678579738.25
mnth_12              -245848678579736.3
yr_0                 -625341793884164.1
yr_1                 -625341793884135.1
workingday_0         -347572509896171.2
workingday_1         -230581482095599.3
weather_state_1      -405502991477929.6
weather_state_2      -405502991477928.94
weather_state_3      -405502991477940.94
weather_state_4      -405502991477941.44
dayOfWeek_0          59888183383197.71
dayOfWeek_1          59888183383197.4
dayOfWeek_2          59888183383198.65
dayOfWeek_3          59888183383198.09
dayOfWeek_4          59888183383198.69
dayOfWeek_5          176879211183772.25
dayOfWeek_6          176879211183768.8
holiday_0            185148611151709.94
holiday_1            302139638952273.3
hr_0                 -71180661004683.75
hr_1                 -71180661004699.47
hr_2                 -71180661004707.39
hr_3                 -71180661004718.14
hr_4                 -71180661004731.81
hr_5                 -71180661004756.12
hr_6                 -71180661004795.53
hr_7                 -71180661004772.97
hr_8                 -71180661004566.53
hr_9                 -71180661004683.94
hr_10                -71180661004647.5
hr_11                -71180661004682.75
hr_12                -71180661004670.38
hr_13                -71180661004668.16
hr_14                -71180661004671.94
hr_15                -71180661004697.47
hr_16                -71180661004741.94
hr_17                -71180661004585.66
hr_18                -71180661004579.31
hr_19                -71180661004564.11
hr_20                -71180661004583.12
hr_21                -71180661004618.6
hr_22                -71180661004641.0
hr_23                -71180661004665.97

Mean absolute error: 43.93
Root mean squared error: 61.68
Coefficient of determination: 0.88

Using the slope & memory trick, we bump our R2 score to 0.88, which is decent given that we cannot go beyond 0.69 with a standard linear model.

Part IV: Beyond Linear Regression

Now we will experiment with other models: polynomial regression, gradient boosting regressors (GBRs) and random forests. We will one-hot encode our cyclical variables and ignore the days variable to avoid overfitting.

def model_polynomial():
    X, y, df = load_bike_data()

    season = pd.get_dummies(df.season, prefix="season")
    month = pd.get_dummies(df.mnth, prefix="mnth")
    year = pd.get_dummies(df.yr, prefix="yr")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    dayOfWeek = pd.get_dummies(df.dayOfWeek, prefix="dayOfWeek")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, month, year, workingday,
                    weather, dayOfWeek, holiday, hour], axis=1)

    X = df.drop(columns='demand', axis=1).to_numpy()
    y = df.demand

    poly = PolynomialFeatures(2)
    X = poly.fit_transform(X)


    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 3, test_size=0.2)

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)

    # Display only the first 50 weights
    w = [model.intercept_] + list(model.coef_)
    linear_regression_stats(w, X_test, y_test,
                            model.predict(X_test), range(50),
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - polynomial regression)")

    return X, w, y, df
X, w, y, df = model_polynomial()
Weights: 1892 components

Coefficients:
Bias                 -565856671135023.1
0                    453706089558032.0
1                    398319507516582.9
2                    -181372203020412.66
3                    258589399986588.56
4                    -11037823243591.514
5                    -3608501313896.939
6                    244243244573112.25
7                    397848525626195.94
8                    -75402447800346.53
9                    21291592296307.242
10                   -51923517237322.164
11                   43199302640342.4
12                   37203092342100.016
13                   210847182873706.75
14                   63141891809257.39
15                   -19951480979277.707
16                   -169325747509854.94
17                   -85105297555445.75
18                   -89713627792761.56
19                   105740157073682.77
20                   -116490748901414.95
21                   -8689424596687.358
22                   -146280247331512.6
23                   -154876916400955.66
24                   -122875910883698.86
25                   -148974263940034.22
26                   -194034388227727.94
27                   115293342889159.28
28                   217958024739677.1
29                   -27912827138339.688
30                   181876650426477.44
31                   73538116042012.69
32                   -26780085388992.645
33                   312278052859912.56
34                   -67439632001791.13
35                   242578555521488.6
36                   87468711828704.5
37                   -120667505151661.78
38                   137334285349085.4
39                   193224123988364.22
40                   -386623728423737.2
41                   -170845739711609.16
42                   -44082160040824.9
43                   101355460818762.31
44                   -119112499595242.2
45                   -64452405347130.25
46                   62281811034299.016
47                   52402963991323.3
48                   -74300992134451.52
49                   38700306147340.95

Mean absolute error: 35.27
Root mean squared error: 51.33
Coefficient of determination: 0.92
def model_polynomial_bis():
    X, y, df = load_bike_data()

    season = pd.get_dummies(df.season, prefix="season")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, workingday,
                    weather, holiday, hour], axis=1)

    X = df.drop(columns='demand', axis=1).to_numpy()
    y = df.demand

    poly = PolynomialFeatures(2)
    X = poly.fit_transform(X)


    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 3, test_size=0.2)

    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X)

    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)

    # Display only the first 50 weights
    w = [model.intercept_] + list(model.coef_)
    linear_regression_stats(w, X_test, y_test,
                            model.predict(X_test), range(50),
                            "Scikit Learn Linear Regression (Bike Sharing Dataset - polynomial regression)")

    return X, w, y, df
X, w, y, df = model_polynomial_bis()
Weights: 821 components

Coefficients:
Bias                 250285363322603.88
0                    1757172700.1106026
1                    -161228095002373.56
2                    194363367985306.28
3                    101372801435767.44
4                    126894193843493.92
5                    -21309853473872.03
6                    299351064352364.06
7                    46182781640188.984
8                    -157135021165049.16
9                    -614589045179921.2
10                   26835713048362.71
11                   239268979099876.78
12                   -158108856339248.38
13                   -92272212934519.02
14                   -128942535670618.25
15                   -83678270714767.89
16                   -164704576193659.06
17                   -12201363423435.576
18                   31322434964663.465
19                   -18963989827837.28
20                   29417745491618.613
21                   -21941076719116.76
22                   -177985608949145.0
23                   63197415498299.09
24                   -32344816768069.125
25                   -48179469908633.33
26                   47320451119285.31
27                   -160425580769663.34
28                   68684057040437.21
29                   98530858516551.67
30                   185221940890451.5
31                   39632648051029.59
32                   77600682636556.45
33                   -11395029689059.496
34                   104264917175226.53
35                   33401833248016.867
36                   -36041385891181.92
37                   98998865741151.12
38                   -33423943004507.688
39                   -102636164458945.28
40                   -671.5625
41                   151.45703125
42                   -5.1796875
43                   -87843068859807.45
44                   -87843068859666.12
45                   -87843068859836.75
46                   -87843068859697.3
47                   -82854201264859.03
48                   -82854201264922.6
49                   172934285718905.5

Mean absolute error: 54.69
Root mean squared error: 76.94
Coefficient of determination: 0.82
def GBR_scikit_bike():
    X, y, df = load_bike_data()

    season = pd.get_dummies(df.season, prefix="season")
    month = pd.get_dummies(df.mnth, prefix="mnth")
    year = pd.get_dummies(df.yr, prefix="yr")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    dayOfWeek = pd.get_dummies(df.dayOfWeek, prefix="dayOfWeek")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, month, year, workingday,
                    weather, dayOfWeek, holiday, hour], axis=1)

    X = df.drop(columns='demand', axis=1).to_numpy()
    y = df.demand

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size=0.2)

    regr = GradientBoostingRegressor()
    regr.fit(X_train,y_train)
    y_pred = regr.predict(X_test)

    print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))

    print(len(X))
    return X, regr, y, df
X, regr, y, df = GBR_scikit_bike()
Coefficient of determination: 0.79
17379
def GBR_scikit_bike_bis():
    X, y, df = load_bike_data()

    season = pd.get_dummies(df.season, prefix="season")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, workingday,
                    weather, holiday, hour], axis=1)

    X = df.drop(columns='demand', axis=1).to_numpy()
    y = df.demand

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size=0.2)

    regr = GradientBoostingRegressor()
    regr.fit(X_train,y_train)
    y_pred = regr.predict(X_test)

    print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))

    print(len(X))
    return X, regr, y, df
X, regr, y, df = GBR_scikit_bike_bis()
Coefficient of determination: 0.73
17379
def random_forest_scikit_bike():
    X, y, df = load_bike_data()

    season = pd.get_dummies(df.season, prefix="season")
    month = pd.get_dummies(df.mnth, prefix="mnth")
    year = pd.get_dummies(df.yr, prefix="yr")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    dayOfWeek = pd.get_dummies(df.dayOfWeek, prefix="dayOfWeek")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, month, year, workingday,
                    weather, dayOfWeek, holiday, hour], axis=1)

    X = df.drop(columns='demand', axis=1).to_numpy()
    y = df.demand

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size=0.2)

    regr = RandomForestRegressor()
    regr.fit(X_train,y_train)
    y_pred = regr.predict(X_test)

    print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))

    print(len(X))
    return X, regr, y, df
X, regr, y, df = random_forest_scikit_bike()
Coefficient of determination: 0.92
17379
def random_forest_scikit_bike_bis():
    X, y, df = load_bike_data()

    season = pd.get_dummies(df.season, prefix="season")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, workingday,
                    weather, holiday, hour], axis=1)

    X = df.drop(columns='demand', axis=1).to_numpy()
    y = df.demand

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size=0.2)

    regr = RandomForestRegressor()
    regr.fit(X_train,y_train)
    y_pred = regr.predict(X_test)

    print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))

    print(len(X))
    return X, regr, y, df
X, regr, y, df = random_forest_scikit_bike_bis()
Coefficient of determination: 0.82
17379

As we can see, the other models are much more effective at predicting the data than our linear regressions. However, they may be effective only because of the sheer number of variables they can exploit; in particular, the polynomial and random forest regressions seem to depend on time-specific events of our dataset (yearly, monthly and weekly variations) that cannot be generalised to real-world data without more knowledge of the relationship between long time intervals and demand. Even so, with an R2 score of 0.82 on the reduced feature set, the models remain quite robust despite all the removed information.
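
One way to probe how much these scores depend on the random train/test split (and on time-specific events) would be to evaluate the same trimmed random forest on a chronological split, training on the first part of the period and testing on the last. This was not done above; the sketch below is only an illustration reusing the feature set of random_forest_scikit_bike_bis:

def random_forest_scikit_bike_time_split():
    X, y, df = load_bike_data()

    season = pd.get_dummies(df.season, prefix="season")
    workingday = pd.get_dummies(df.workingday, prefix="workingday")
    holiday = pd.get_dummies(df.holiday, prefix="holiday")
    weather = pd.get_dummies(df.weathersit, prefix="weather_state")
    hour = pd.get_dummies(df.hr, prefix="hr")

    df = pd.concat([df[["temp", "hum", "windspeed", "demand"]], season, workingday,
                    weather, holiday, hour], axis=1)

    X = df.drop(columns='demand', axis=1).to_numpy()
    y = df.demand.to_numpy()

    # Chronological split: train on the first 80% of the records and test on the
    # last 20%, so no information from the test period leaks into training.
    split = int(len(X) * 0.8)
    regr = RandomForestRegressor()
    regr.fit(X[:split], y[:split])

    print('Coefficient of determination: %.2f' % r2_score(y[split:], regr.predict(X[split:])))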
