%%HTML
<link rel="stylesheet" type="text/css" href="https://raw.githubusercontent.com/malkaguillot/Foundations-in-Data-Science-and-Machine-Learning/refs/heads/main/docs/utils/custom.css">
<link rel="stylesheet" type="text/css" href="../utils/custom.css">
References¶
- Békés & Kézdi (2022), Chapters 13 & 14
- Introduction to Statistical Learning, Chapters 1, 2 & 5.1
- Kleinberg, Ludwig, Mullainathan, and Obermeyer (2015), "Prediction Policy Problems", American Economic Review, 105 (5), pp. 491-495.
- Mullainathan and Spiess (2017), "Machine Learning: An Applied Econometric Approach", Journal of Economic Perspectives, 31 (2), pp. 87-106.
What is ML?¶
A concise definition by Athey (2018):
Specifically, there are three broad classifications of ML problems:
- supervised learning.
- unsupervised learning.
- reinforcement learning.
Most of the recent hype relates to supervised learning and, in particular, deep learning.
ML Landscape¶

An Aside: ML and Artificial Intelligence (AI)¶

Unsupervised Learning¶
In unsupervised learning, the goal is to divide high-dimensional data into clusters that are similar in their set of features $(X)$.
Examples of algorithms:
- principal component analysis (PCA)
- $k$-means clustering
- Latent Dirichlet Allocation (LDA)
Applications:
- image recognition
- cluster analysis
- topic modelling
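To make this concrete, here is a minimal $k$-means sketch on synthetic data with scikit-learn (the data and parameter choices are made up for illustration and are not part of the Boston notebook below):
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# synthetic, unlabelled data: 300 points around 3 centers
X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=0)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_demo)
print(kmeans.cluster_centers_)   # estimated cluster centers
print(kmeans.labels_[:10])       # cluster assignments of the first 10 points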
Reinforcement Learning (RL)¶
A definition by Sutton and Barto (2018):
Prominent examples:
- Games (e.g., Chess, AlphaGo).
- Autonomous cars.
Supervised Learning¶
Consider the following data generating process (DGP):
$$Y=f(\boldsymbol{X})+\epsilon$$ where $Y$ is the outcome variable, $\boldsymbol{X}$ is a $1\times p$ vector of "features", and $\epsilon$ is the irreducible error.
- Training set ("in-sample"): $\{(x_i,y_i)\}_{i=1}^{n}$
- Test set ("out-of-sample"): $\{(x_i,y_i)\}_{i=n+1}^{m}$

Typical assumptions: (1) independent observations; (2) stable DGP across training and test sets.
The Goal of Supervised Learning¶
Use a labelled training set ($X$ and $Y$ are known) to construct $\hat{f}(X)$ such that it generalizes to an unseen test set (where only $X$ is known).
If $Y$ is continuous: regression problem,
- we are interested in estimating the conditional mean function $E[Y|X]$.
- Example: Predicting house prices based on their features (e.g., size, location, number of bedrooms).
If $Y$ is categorical: classification problem,
- we are interested in estimating the conditional probability function $P(Y|X)$.
- Example: Predicting whether an email is spam or not based on its content.
Supervised Learning Algorithms¶
ML comes with a rich set of parametric and non-parametric prediction algorithms (approximate year of discovery in parenthesis):
- Linear and logistic regression (1805, 1958).
- Decision and regression trees (1984).
- K-Nearest neighbors (1967).
- Support vector machines (1990s).
- Neural networks (1940s, 1970s, 1980s, 1990s).
- Ensemble methods: bagging (1996), boosting (1990), random forests (2001).
- etc.
Supervised ML Workflow¶
Step 1: Define the Prediction Task
- Pre-process the features
- Split the observations into training and test sets
Step 2: Explore the Data
Step 3: Choose the Model and the Cross-Validation Strategy
Step 4: Hyperparameter Tuning with Cross-Validation
- Assess the validity of the model
- Select the best parameters
Step 5: Train the Final Model with the Best Hyperparameters on the Training Set
Step 6: Evaluate the Model on the Holdout Test Set
- Train the best model on the training set and evaluate it on the test set
Step 7: (Deploy the Model)
Step 1: Define the Prediction Task¶
Boston housing data¶
We will use the Boston housing data: housing data for 506 census tracts of Boston from the 1970 census (Harrison and Rubinfeld, 1978).
The sample is available from sklearn and contains:
- 13 attributes of housing markets around Boston, for example:
  - medv (target): median value of owner-occupied homes in USD 1000's
  - lstat (predictor): percentage of lower status of the population
  - chas (predictor): Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- 506 entries: aggregated data for homes from various suburbs in Boston, Massachusetts.
Objective: predict the median house value medv using the given features.
Set up and load data¶
import numpy as np
import pandas as pd
# import patsy
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("../../data/BostonHousing.csv")
The data contain the following variables:
Variable | Description |
---|---|
CRIM | per capita crime rate by town |
ZN | proportion of residential land zoned for lots over 25,000 sq.ft. |
INDUS | proportion of non-retail business acres per town |
CHAS | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) |
NOX | nitric oxides concentration (parts per 10 million) |
RM | average number of rooms per dwelling |
AGE | proportion of owner-occupied units built prior to 1940 |
DIS | weighted distances to five Boston employment centres |
RAD | index of accessibility to radial highways |
TAX | full-value property-tax rate per $10,000 |
PTRATIO | pupil-teacher ratio by town |
B | 1000(Bk - 0.63)^2 where Bk is the proportion of black people by town |
LSTAT | % lower status of the population |
df.describe()
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | b | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 |
mean | 3.613524 | 11.363636 | 11.136779 | 0.069170 | 0.554695 | 6.284634 | 68.574901 | 3.795043 | 9.549407 | 408.237154 | 18.455534 | 356.674032 | 12.653063 | 22.532806 |
std | 8.601545 | 23.322453 | 6.860353 | 0.253994 | 0.115878 | 0.702617 | 28.148861 | 2.105710 | 8.707259 | 168.537116 | 2.164946 | 91.294864 | 7.141062 | 9.197104 |
min | 0.006320 | 0.000000 | 0.460000 | 0.000000 | 0.385000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 12.600000 | 0.320000 | 1.730000 | 5.000000 |
25% | 0.082045 | 0.000000 | 5.190000 | 0.000000 | 0.449000 | 5.885500 | 45.025000 | 2.100175 | 4.000000 | 279.000000 | 17.400000 | 375.377500 | 6.950000 | 17.025000 |
50% | 0.256510 | 0.000000 | 9.690000 | 0.000000 | 0.538000 | 6.208500 | 77.500000 | 3.207450 | 5.000000 | 330.000000 | 19.050000 | 391.440000 | 11.360000 | 21.200000 |
75% | 3.677083 | 12.500000 | 18.100000 | 0.000000 | 0.624000 | 6.623500 | 94.075000 | 5.188425 | 24.000000 | 666.000000 | 20.200000 | 396.225000 | 16.955000 | 25.000000 |
max | 88.976200 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 396.900000 | 37.970000 | 50.000000 |
Create $X$ and $y$¶
X_full=df.drop('medv', axis=1)
y_full= df['medv']
n_samples = X_full.shape[0]
n_features = X_full.shape[1]
print(n_samples, n_features)
506 13
X_full.head()
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | b | lstat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 |
Look for NA values in the dataset¶
X_full.isna().sum()
crim 0 zn 0 indus 0 chas 0 nox 0 rm 0 age 0 dis 0 rad 0 tax 0 ptratio 0 b 0 lstat 0 dtype: int64
There are none.
Step 2: Exploratory Data Analysis¶
Quantity to predict = price (target or $y$)¶
Before the regression, let us inspect the features and their distributions.
y_full.shape
(506,)
sns.set(rc={'figure.figsize':(6,4)})
plt.hist(y_full, bins=30)
plt.xlabel("House prices in $1000", size=15)
plt.ylabel('count', size=15)
plt.title('Distribution of median price in each neighborhood', size=20)
plt.show()
Features ($X$) used for prediction¶
X_full.shape
(506, 13)
Distributions¶
Histogram plots to look at the distribution
X_full.hist(bins=50, figsize=(15,10))
plt.show()
Correlations¶
Boston Correlation Heatmap Example with Seaborn
The seaborn package offers a heatmap that gives a two-dimensional graphical representation of the Boston data: the individual values contained in a matrix are represented as colors.
import pandas as pd
import matplotlib.pyplot as plt
correlation_matrix = X_full.corr().round(2)
sns.heatmap(correlation_matrix) #annot=True
plt.show()
Check for multicollinearity¶
An important point when selecting features for a linear regression model is to check for multicollinearity.
The features RAD and TAX have a correlation of 0.91; such strongly correlated feature pairs can affect the model.
The same goes for the features DIS and AGE, which have a correlation of -0.75.
Correlation plots¶
from pandas.plotting import scatter_matrix
scatter_matrix(X_full, figsize=(12, 8))
plt.show()
Scatter plot relative to the target (price)¶
(X_full.columns)
Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat'], dtype='object')
fig, axs = plt.subplots(3, 5, figsize=(20, 12))
i = 0
for ax in axs.flat:
    if i < len(X_full.columns):
        feature_name = X_full.columns[i]
        ax.scatter(X_full[feature_name], y_full, alpha=0.1)
        ax.set(xlabel=feature_name, ylabel='Price')
    i = i + 1
What can we say?¶
- Prices increase roughly linearly with RM; there are a few outliers, and the data appear to be capped at 50.
- Prices tend to decrease as LSTAT increases, although the relationship does not look exactly linear.
Linear Regression as a predictive model¶
Model building¶
Deciding which predictors to include in the model and in which functional form.
We have powerful computers, the cloud, etc. -- so why not try out all possible models and pick the best one?
- Too many possible models!
Two methods to build models:
- By hand -- mix domain knowledge and statistics
- By smart algorithms $=$ machine learning
Linear Regression¶
$$Y=\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon $$
$=$ one of the simplest algorithms for doing supervised learning
A good starting point before studying more complex learning methods
Interpretation of $\beta_j$ = the average effect on $Y$ of a unit increase in $X_j$ holding all other predictors fixed
Linear estimate¶

Linear Models: pros and cons¶
Core ML concepts¶
Overfitting: Low Bias, High Variance¶
The model: high-degree polynomial
$$Y_i = \beta_0+\sum_{j=1}^{\lambda}\beta_jX_i^{j}+\varepsilon_i$$

"Justfitting": Bias and Variance are Just Right¶
The model: linear regression
$$Y_i = \beta_0+\beta_1 X_i + \varepsilon_i$$

The Typical Bias-Variance Trade-off in ML¶
Typically, ML models strive to find levels of bias and variance that are "just right":

When is the Bias-Variance Trade-off Important?¶
In low-dimensional settings ( $n\gg p$ )
- overfitting is highly unlikely
- training MSE closely approximates test MSE
- conventional tools (e.g., OLS) will perform well on a test set
INTUITION: As $n\rightarrow\infty$, coefficient estimates on irrelevant terms converge to their true value (zero).
In high-dimensional settings ( $n\ll p$ )
- overfitting is highly likely
- training MSE poorly approximates test MSE
- conventional tools tend to overfit
Settings with $n\ll p$ are prevalent in big-data applications.
Measuring the Performance of a Model using Mean Squared Error (MSE)¶
$$MSE= \frac{1}{n} \sum_{i=1}^n (y_i - \hat f(x_i))^2$$
- Regression setting: the mean squared error is a metric of how well a model fits the data.
- MSE will be small if the predicted responses are very close to the true responses, and large if, for some observations, the predicted and true responses differ substantially.
- It is an aggregate measure of the errors.
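A minimal numerical sketch of this formula, with made-up numbers:
import numpy as np
from sklearn.metrics import mean_squared_error
y_true = np.array([3.0, 5.0, 7.0])
y_pred = np.array([2.5, 5.0, 8.0])
# MSE = mean of the squared prediction errors
print(np.mean((y_true - y_pred) ** 2))      # 0.4166...
print(mean_squared_error(y_true, y_pred))   # same value via sklearn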
Avoiding Overfitting: train/test split¶
Training MSE, test MSE and model flexibility¶

3 simulated models and DGP in black | Flexibility: train set MSE and test set MSE |
Increasing model flexibility tends to decrease the training MSE, while the test MSE first decreases and then increases again (a U-shape).
Overfitting¶
As model flexibility increases, training MSE will decrease, but the test MSE may not.
When a given method yields a small training MSE but a large test MSE, we are said to be overfitting the data.
(We almost always expect the training MSE to be smaller than the test MSE)
Estimating the test MSE is important, but it requires data that were not used for training...
Regularization methods¶
Context: Generalization of the Linear Models¶
- Regularized fitting: ridge regression and lasso
- Non-linearity: nearest neighbor methods
- Interactions: tree-based methods, random forests and boosting
Why Regularization?¶
Key question: which features to enter into model, how to select?
- There is room for an automatic selection process.
- A solution against overfitting.
- Allows high-dimensional predictors:
  - $p \gg n$: OLS no longer has a unique solution
  - $x_i$ "high-dimensional", i.e., very many regressors (e.g., the pixels of a picture)
Notes: A corollary of regularization is improved prediction accuracy, especially when $p > n$, because the penalty controls the variance.
Adding a Regularization Term to the Loss Function $L(.)$¶
$$ \hat{\beta} = \arg\min_\beta \frac{1}{n} \sum_{i=1}^n L(h(x_i, \beta), y_i) + \lambda R(\beta) $$
- $R(\beta)$ = regularization function
  - $R(\beta)=\sum_{j=1}^{p} p(\beta_j)$ for $p(\cdot)$ the penalty function
  - measures the expressiveness of the model: as the number of features grows, linear models become more expressive
- $\lambda$ is a hyperparameter; higher values increase regularization
Notation: Norms¶
Suppose $\boldsymbol{\beta}$ is a $p\times 1$ vector with typical element $\beta_j$.
The $\ell_0$-norm is defined as $\lVert\boldsymbol{\beta}\rVert_0 = \sum_{j=1}^p \boldsymbol{1}\{\beta_j \neq 0\}$,
- i.e., the number of non-zero elements in $\boldsymbol{\beta}$.
The $\ell_1$-norm is defined as $\lVert\boldsymbol{\beta}\rVert_1=\sum_{j=1}^{p}|\beta_j|$.
The $\ell_2$-norm is defined as $\lVert\boldsymbol{\beta}\rVert_2=\left(\sum_{j=1}^{p}|\beta_j|^2\right)^{\frac{1}{2}}$
- i.e., Euclidean norm.
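A quick numerical check of these norms on a made-up vector:
import numpy as np
beta = np.array([0.0, -2.0, 3.0])
print(np.count_nonzero(beta))       # l0 "norm": number of non-zero elements -> 2
print(np.sum(np.abs(beta)))         # l1 norm -> 5.0
print(np.sqrt(np.sum(beta ** 2)))   # l2 (Euclidean) norm -> about 3.61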
Commonly used penalty functions¶
Regularization problem:
$$\underset{\beta_{0}, \beta}{\operatorname{min}} \sum_{i=1}^{N}\left(y_{i}-\beta_{0}-\sum_{j=1}^{p} x_{i j} \beta_{j}\right)^{2}+\lambda R(\beta)$$
Method | $R(\boldsymbol{\beta})$ |
---|---|
OLS | 0 |
Subset selection | $\lVert\boldsymbol{\beta}\rVert_0$ |
Lasso | $\lVert\boldsymbol{\beta}\rVert_1$ |
Ridge | $\lVert\boldsymbol{\beta}\rVert_2^2$ |
Elastic net | $\alpha\lVert\boldsymbol{\beta}\rVert_1 + (1-\alpha)\lVert\boldsymbol{\beta}\rVert_2^2$ |
Prerequisite: centering and scaling¶
In what follows, we assume that each feature is centered and scaled to have mean zero and unit variance, as follows:
$$\frac{x_{ij} - \widehat\mu_j}{\widehat\sigma_{j}}, \qquad\text{for }j=1,2, \ldots, p$$
where $\widehat\mu_j$ and $\widehat\sigma_j$ are the estimated mean and standard deviation of feature $j$, computed on the training set.
Remark: This is not important when using OLS (why?)
Ridge Regression¶
$$\min_{\beta} \sum_{i=1}^n \left(y_i - \beta_{0}-\sum_{j=1}^{p} x_{i j} \beta_{j} \right)^2 + \lambda \sum_{j=1}^p \beta_j^2 $$
Where
- $\lambda > 0$ = penalty parameter
- covariates can be high-dimensional ($p \gg n$)
Trade-off, from the minimization of the sum of
- the RSS
- the shrinkage penalty, which is small when the $\beta_j$ are close to zero
$\rightarrow$ their relative importance is governed by $\lambda$
Ridge Regression: shrinkage to $0$¶

Ridge vs. Linear Models¶
- when the outcome and the predictors are close to having a linear relationship, OLS will have low bias but potentially high variance
- a small change in the training data $\rightarrow$ a large change in the estimates
- worse when $p$ is close to $n$
- if $p>n$, OLS does not have a unique solution
$\rightarrow$ ridge regression works best in situations where the least squares estimates have high variance
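A minimal ridge sketch on standardized synthetic data (the data and the penalty value are made up for illustration; the Boston application follows later in the notebook). In scikit-learn's Ridge, the argument alpha plays the role of $\lambda$:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 20))                       # n=100 observations, p=20 features
y_demo = X_demo[:, 0] - 2 * X_demo[:, 1] + rng.normal(size=100)
X_std = StandardScaler().fit_transform(X_demo)            # center and scale first
ridge = Ridge(alpha=10.0).fit(X_std, y_demo)
print(ridge.coef_)                                        # all coefficients shrunk toward 0, none exactly 0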
LASSO (Least Absolute Shrinkage and Selection Operator)¶
LASSO modifies the way regression coefficients are estimated by adding a penalty term that penalizes models with too many (non-zero) coefficients.
Notes: Ridge: the penalty shrinks coefficients toward $0$ but not exactly to $0$.
This may not be a problem for prediction accuracy, but it can create a challenge for model interpretation when $p$ is quite large.
Lasso¶
$$ \hat{\beta} = \arg\min_\beta \frac{1}{n} \sum_{i=1}^n \left( y_i -\Big(\beta_0 + \sum_{j=1}^p \beta_j x_{ij}\Big)\right)^2 +\lambda \sum_{j=1}^p |\beta_j| $$
- $R(\beta)= \sum_{j=1}^p |\beta_j|$ = regularization function with an $\ell_1$-norm penalty function
- $\lambda$ is a hyperparameter (or tuning parameter); higher values increase regularization
Notes:
- Lasso automatically performs feature selection and outputs a sparse model.
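A corresponding lasso sketch on the same kind of synthetic data (again with an arbitrary penalty), illustrating that some coefficients are set exactly to zero:
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 20))
y_demo = X_demo[:, 0] - 2 * X_demo[:, 1] + rng.normal(size=100)
X_std = StandardScaler().fit_transform(X_demo)
lasso = Lasso(alpha=0.1).fit(X_std, y_demo)               # alpha plays the role of lambda
print(lasso.coef_)                                        # many coefficients are exactly 0 -> sparse model
print(np.sum(lasso.coef_ != 0), "non-zero coefficients")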
Lasso Coefficients¶

Constrained Regression¶
The minimization problem can be written as follows:
$$ \min_\beta \sum_{i=1}^n(y_i-x_i'\beta)^2 \quad \textrm{ s.t. } \quad \sum_{j=1}^p p(\beta_j) \leq s,$$
where
- Ridge: $\sum_{j=1}^p \beta_j^2 \leq s$ $\rightarrow$ the constraint region is a circle (disk)
- Lasso: $\sum_{j=1}^p |\beta_j| \leq s$ $\rightarrow$ the constraint region is a diamond
Constraint Regions: Lasso vs. ridge¶
Contours of the error and constraint functions for lasso (left) and ridge (right).
- The solid blue areas are the constraint regions, $|\beta_{1}|+|\beta_{2}| \leq s$ (lasso) and $\beta_{1}^{2}+\beta_{2}^{2} \leq s$ (ridge),
- the red ellipses are the contours of the RSS and $\widehat\beta$ is the OLS estimator.
Lasso | Ridge |
---|---|
Elastic Net = Lasso + Ridge¶
$$ MSE(\beta)+\lambda_1 \sum_{j=1}^p |\beta_j| +\lambda_2 \sum_{j=1}^p \beta_j^2$$
$\lambda_1$, $\lambda_2$ $=$ strength of $\ell_1$-norm (Lasso) penalty and $\ell_2^2$-norm (Ridge) penalty
Selecting Elastic Net Hyperparameters¶
Elastic net hyperparameters should be selected to optimize out-of-sample fit (measured by mean squared error, MSE).
"Grid search":
- scans over the hyperparameter space ($\lambda_1 \geq 0, \lambda_2\geq 0$),
- computes the out-of-sample MSE for all pairs $(\lambda_1, \lambda_2)$,
- selects the MSE-minimizing model.
Feature Engineering¶
2 extensions of the Linear Model¶
Going beyond two of the linear model's assumptions:
- additivity: the effect of changes in a predictor $X_j$ on the response $Y$ is independent of the values of the other predictors
- linearity: the change in the response $Y$ due to a one-unit change in $X_j$ is constant
Non Linearity¶
- Include transformed versions of the predictors in the model
$\Rightarrow$ Including polynomials in $X$ may provide a better fit
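As a sketch, polynomial terms can be added with PolynomialFeatures; here the lstat feature loaded earlier is expanded to lstat and lstat squared (the degree is chosen arbitrarily for illustration):
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
lstat_poly = poly.fit_transform(X_full[['lstat']])   # columns: lstat, lstat^2
print(lstat_poly[:3])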
Scikit-Learn Design Overview¶
Transformer (preprocessor): An object that transforms a data set.¶
- e.g. preprocessing.StandardScaler
- The transformation is performed by the transform() method.
Estimator: an object that can estimate parameters¶
- e.g. linear_model.LinearRegression
- Estimation is performed by the fit() method.
- Exogenous parameters (provided by the researcher) are called hyperparameters.
- The convenience method fit_transform() both fits an estimator and returns the transformed input data set.
Predictor: An object that forms a prediction from an input data set.¶
- e.g. LinearRegression, after training.
- The predict() method forms the predictions.
- It also has a score() method that measures the quality of the predictions given a test set.
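These objects compose naturally. A minimal sketch chaining a transformer and a predictor with Pipeline (imported above but otherwise unused in this notebook; the step names are arbitrary):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# fit() runs the scaler, then the regression, in order
pipe = Pipeline([("scaler", StandardScaler()), ("ols", LinearRegression())])
pipe.fit(X_full, y_full)                 # Boston features and target loaded earlier
print(pipe.predict(X_full[:5]))          # predictions for the first 5 rows
print(pipe.score(X_full, y_full))        # in-sample R^2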
Step 3: Set, Train and Evaluate the Model¶
Data cleaning¶
Missing feature values should be either:
- dropped, or
- imputed to some value (zero, the mean, the median...).
Drop the observations with price >= 50 (because of right censoring):
mask=y_full<50
y_full=y_full[mask==True]
X_full=X_full[mask==True]
Prepare Training and Test Sets using train_test_split¶
The sampling method is purely random.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full,test_size=0.2, random_state=1)
print("train data", X_train.shape, y_train.shape)
print("test data", X_test.shape, y_test.shape)
train data (392, 13) (392,) test data (98, 13) (98,)
Feature Scaling¶
Most common scaling methods:
- standardization = normalization by subtracting the mean and dividing by the standard deviation (values are not bounded):
$x_j^{scaled} = \frac{x_j - \bar{x}_j}{sd(x_j)}$
- min-max scaling = normalization by subtracting the minimum and dividing by the range, max minus min (values between 0 and 1)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
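MinMaxScaler (imported above) follows the same transformer API; a sketch, in case min-max scaling were preferred (it is not used in the rest of this notebook):
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler().fit(X_train)          # learn min and max on the training set only
X_train_minmax = minmax.transform(X_train)
X_test_minmax = minmax.transform(X_test)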
Select and Train a Model¶
Regression algorithm (we consider first LinearRegression; more algorithms will be discussed later):
# our first machine learning model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg
LinearRegression()
lin_reg.fit(X_train_scaled, y_train)
LinearRegression()
print("R-squared for training dataset:{}".
format(np.round(lin_reg.score(X_train_scaled, y_train), 2)))
R-squared for training dataset:0.79
print("R-squared for test dataset:{}".
format(np.round(lin_reg.score(X_test_scaled, y_test), 2)))
R-squared for test dataset:0.73
Note: $R^2 =$ the proportion of variance (of $y$) that has been explained by the independent variables in the model.
Coefficients of the linear regression¶
lin_reg.coef_
array([-0.82250959, 0.96267645, -0.54388693, 0.187684 , -1.50325117, 2.15830284, -0.54795352, -2.78522878, 2.16616351, -2.21708013, -1.8582422 , 0.71805978, -2.70105292])
print('The coefficients of the features from the linear model:')
print(dict(zip(features, [round(x, 2) for x in lin_reg.coef_])))
The coefficients of the features from the linear model: {'crim': -0.82, 'zn': 0.96, 'indus': -0.54, 'chas': 0.19, 'nox': -1.5, 'rm': 2.16, 'age': -0.55, 'dis': -2.79, 'rad': 2.17, 'tax': -2.22, 'ptratio': -1.86, 'b': 0.72, 'lstat': -2.7}
Choose the best model with cross-validation¶
k-fold Cross-validation¶
Split the training set into $k$ roughly equal-sized parts ( $k=5$ in this example):

Approximate the test-MSE using the mean of $k$ split-MSEs
$$\text{CV-MSE} = \frac{1}{k}\sum_{j=1}^{k}\text{MSE}_j$$
Step 4: Cross-Validation¶
K-Fold CV¶
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold
# Perform 5-fold cross validation
scores = cross_val_score(lin_reg, X_train_scaled, y_train,
scoring="neg_mean_squared_error", # evaluation metrics
cv=5)
scores
array([ -7.74021081, -15.46731828, -14.72344763, -17.14433289, -18.96092531])
# the other way of doing the same thing (more explicit)
# create a KFold object with 5 splits
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
scores = cross_val_score(lin_reg, X_train_scaled, y_train, scoring='neg_mean_squared_error', cv=folds)
scores
array([-18.32023985, -11.84760498, -17.04867667, -16.44238641, -10.9874587 ])
Hyperparameter Tuning Using Grid Search Cross-Validation¶
A common use of cross-validation is for tuning hyperparameters of a model. The most common technique is what is called grid search cross-validation.
# 1. create a cross-validation scheme
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10,random_state=2, shuffle=True)
kfold
KFold(n_splits=10, random_state=2, shuffle=True)
# 2. specify range of hyperparameters to tune
param_grid = [
{'alpha': [0.0001, 0.001, 0.01, 0.1 ,1, 10],
'l1_ratio':[.1,.5,.9,1]}
]
from sklearn.model_selection import GridSearchCV
# 3. perform grid search
# 3.1 specify model
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet()
# 3.2 call GridSearchCV()
# train across 10 folds: 6 values of alpha x 4 values of l1_ratio = 24 combinations, i.e. 24*10 = 240 rounds of training
model_cv = GridSearchCV(elastic_net, param_grid, cv=kfold,
scoring='neg_mean_squared_error',
return_train_score=True)
model_cv
GridSearchCV(cv=KFold(n_splits=10, random_state=2, shuffle=True), estimator=ElasticNet(), param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9, 1]}], return_train_score=True, scoring='neg_mean_squared_error')
# fit the model
model_cv.fit(X_train_scaled, y_train)
GridSearchCV(cv=KFold(n_splits=10, random_state=2, shuffle=True), estimator=ElasticNet(), param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9, 1]}], return_train_score=True, scoring='neg_mean_squared_error')
# cv results
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_alpha | param_l1_ratio | params | split0_test_score | split1_test_score | split2_test_score | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.004507 | 0.005245 | 0.001400 | 0.002163 | 0.0001 | 0.1 | {'alpha': 0.0001, 'l1_ratio': 0.1} | -13.406530 | -14.328648 | -12.011451 | ... | -13.755289 | -13.819750 | -13.797527 | -13.206681 | -13.885470 | -11.411172 | -13.730520 | -13.507182 | -13.428291 | 0.698492 |
1 | 0.000864 | 0.000134 | 0.000324 | 0.000040 | 0.0001 | 0.5 | {'alpha': 0.0001, 'l1_ratio': 0.5} | -13.408003 | -14.328863 | -12.011969 | ... | -13.755289 | -13.819749 | -13.797526 | -13.206680 | -13.885470 | -11.411172 | -13.730519 | -13.507181 | -13.428290 | 0.698492 |
2 | 0.001147 | 0.000544 | 0.000569 | 0.000521 | 0.0001 | 0.9 | {'alpha': 0.0001, 'l1_ratio': 0.9} | -13.409477 | -14.329080 | -12.012521 | ... | -13.755288 | -13.819749 | -13.797526 | -13.206680 | -13.885469 | -11.411171 | -13.730518 | -13.507181 | -13.428289 | 0.698492 |
3 | 0.000787 | 0.000105 | 0.000324 | 0.000050 | 0.0001 | 1.0 | {'alpha': 0.0001, 'l1_ratio': 1} | -13.409844 | -14.329135 | -12.012652 | ... | -13.755288 | -13.819749 | -13.797525 | -13.206680 | -13.885469 | -11.411171 | -13.730518 | -13.507181 | -13.428289 | 0.698492 |
4 | 0.000756 | 0.000107 | 0.000287 | 0.000008 | 0.0010 | 0.1 | {'alpha': 0.001, 'l1_ratio': 0.1} | -13.354994 | -14.323175 | -11.994855 | ... | -13.755485 | -13.819927 | -13.797724 | -13.206868 | -13.885637 | -11.411323 | -13.730734 | -13.507338 | -13.428476 | 0.698504 |
5 rows × 32 columns
# plotting cv results
plt.figure(figsize=(16,6))
a= 0.01
plt.plot(cv_results.loc[cv_results['param_alpha']==a,"param_l1_ratio"], cv_results[cv_results['param_alpha']==a]["mean_test_score"])
plt.plot(cv_results.loc[cv_results['param_alpha']==a,"param_l1_ratio"], cv_results[cv_results['param_alpha']==a]["mean_train_score"])
plt.xlabel('L1 ratio')
plt.ylabel('Negative MSE')
plt.title("Optimal L1 ratio parameter for alpha={}".format(a) )
plt.legend(['test score', 'train score'], loc='upper right')
plt.show()
Step 5: Best Model Implementation and Evaluation¶
Now we can build and evaluate a final model using the optimal hyperparameters.
best_alpha = model_cv.best_params_['alpha']
best_l1_ratio = model_cv.best_params_['l1_ratio']
print("Best alpha: ", best_alpha, "Best l1_ratio: ", best_l1_ratio)
Best alpha: 0.01 Best l1_ratio: 0.1
# Fit model (to the training set) with optimal alpha and l1_ratio
elastic_net = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio)
elastic_net.fit(X_train_scaled, y_train)
ElasticNet(alpha=0.01, l1_ratio=0.1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ElasticNet(alpha=0.01, l1_ratio=0.1)
# predict prices of X_test
y_test_pred = elastic_net.predict(X_test_scaled)
# Evaluate the model
from sklearn.metrics import mean_squared_error
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
print("RMSE on test data: ", test_rmse)
RMSE on test data: 3.8754693909725972
NOTE: the test-set MSE estimates the expected squared prediction error on unseen data for the chosen model; the RMSE is its square root, expressed in the units of the outcome.
MSE vs. $R^2$¶
MSE:
- good for comparing regression models,
- but its units depend on the outcome variable
- $\Rightarrow$ it is not directly interpretable.
Better to also use $R^2$ on the test set: it gives the same ranking as MSE but is more interpretable.
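As a sketch, the test-set $R^2$ of the tuned elastic net fitted above can be computed as follows:
from sklearn.metrics import r2_score
# R^2 on the holdout test set for the tuned elastic net
print(r2_score(y_test, y_test_pred))
# equivalently, via the predictor's score() method
print(elastic_net.score(X_test_scaled, y_test))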