Introduction: Decision Trees and Ensemble Learning¶
Decision Trees (DTs)¶
ML algorithms that progressively divide data sets into smaller data groups based on a descriptive feature, until they reach sets that are small enough to be described by some label.
One of the most powerful supervised machine learning methods
Non-parametric
For classification and regression.
Overview of this family of methods¶

Decision Trees¶

Decision Trees¶
A graphical representation of a segmentation of the predictor space into a number of simple regions
Decision trees learn a series of binary splits in the data based on hard thresholds.
- if yes, go right; if no, go left.
- Additional splits as you move through the tree.
What is a tree?¶
- Root node
- Splitting
- Branch
- Decision node (internal node)
- Leaf node (terminal node)
- Sub-tree
- Depth (level)
- Pruning

Decision Trees¶
- Classification trees:
- when predicting a discrete variable
- Example: predicting death after contracting Covid-19
- Regression trees:
- when predicting a continuous variable
- Example: predicting housing prices
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

hitters = pd.read_csv('../../data/Hitters.csv').dropna()
hitters.head()
| | Unnamed: 0 | AtBat | Hits | HmRun | Runs | RBI | Walks | Years | CAtBat | CHits | ... | CRuns | CRBI | CWalks | League | Division | PutOuts | Assists | Errors | Salary | NewLeague |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | -Alan Ashby | 315 | 81 | 7 | 24 | 38 | 39 | 14 | 3449 | 835 | ... | 321 | 414 | 375 | N | W | 632 | 43 | 10 | 475.0 | N |
| 2 | -Alvin Davis | 479 | 130 | 18 | 66 | 72 | 76 | 3 | 1624 | 457 | ... | 224 | 266 | 263 | A | W | 880 | 82 | 14 | 480.0 | A |
| 3 | -Andre Dawson | 496 | 141 | 20 | 65 | 78 | 37 | 11 | 5628 | 1575 | ... | 828 | 838 | 354 | N | E | 200 | 11 | 3 | 500.0 | N |
| 4 | -Andres Galarraga | 321 | 87 | 10 | 39 | 42 | 30 | 2 | 396 | 101 | ... | 48 | 46 | 33 | N | E | 805 | 40 | 4 | 91.5 | N |
| 5 | -Alfredo Griffin | 594 | 169 | 4 | 74 | 51 | 35 | 11 | 4408 | 1133 | ... | 501 | 336 | 194 | A | W | 282 | 421 | 25 | 750.0 | A |

5 rows × 21 columns
Using 2 explanatory variables: Years & Hits¶
In particular, we are interested in how the number of Hits and the Years of experience predict the Salary.
# Get Features
features = ['Years', 'Hits']
X = hitters[features].values
y = np.log(hitters.Salary.values)
Salary vs. Log(Salary)¶
We are actually going to use log(Salary), since its distribution is closer to Gaussian.
make_figure_hist()
make_figure_hits_year()
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
Regression tree with 3 leaves, i.e. 2 internal (split) nodes
# Fit regression tree
tree = DecisionTreeRegressor(max_leaf_nodes=3)
tree.fit(X, y)
DecisionTreeRegressor(max_leaf_nodes=3)
Visualize the tree¶
fig, ax = plt.subplots(1,1)
ax.set_title('Figure Decision tree');
# Plot tree
plot_tree(tree, filled=True, feature_names=features, fontsize=12, ax=ax);
A Visual Representation of the Data¶
make_figure_hits_year2()
Challenges with Splits¶
Using splits in general introduces three key questions:
- Where should the split be?
- How many splits are optimal?
- How to make predictions within each node?
These questions are exactly what the decision tree framework addresses.
Building a Tree¶
Classification and regression trees (CART)¶
There are two main steps in the construction of a tree:
- We divide the predictor space, that is, the set of possible values for $X_1, X_2, \cdots, X_p$, into $J$ distinct and non-overlapping regions $R_1, R_2, \cdots, R_J$.
- For every observation that falls into the region $R_j$, we make the same prediction, which is simply the mean of the response values for the training observations in $R_j$.
- For example, for a continuous $y$, $$\hat{y}_{j}=\frac{1}{N_{j}} \sum_{x_{i} \in R_{j}} y_{i}$$
The Splitting Process: How does it work?¶
Going over every possible partition of the feature space is infeasible.
Instead, the CART algorithm follows a greedy approach.
Starting with all of the data, consider a splitting variable $j$ and split point $s$, and define the pair of half-planes $$R_{1}(j, s)=\left\{x | x_{j} \leq s\right\}, \qquad R_{2}(j, s)=\left\{x | x_{j}>s\right\}$$
Find the predictor $j^*$ and split $s^*$ that partitions the data into 2 regions $R_{1}(j^*,s^*)$ and $R_{2}(j^*,s^*)$ such that the overall sums of squares error are minimized: $$\mathrm{RSS}=\sum_{i \in R_{1}(j^*,s^*)}\left(y_{i}-\bar{y}_{1}\right)^{2}+\sum_{i \in R_{2}(j^*,s^*)}\left(y_{i}-\bar{y}_{2}\right)^{2}$$ where $\bar{y}_{1}$ and $\bar{y}_{2}$ are the averages of the training set outcomes within each group.
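To make the greedy search concrete, here is a minimal sketch (not from the original notebook) of an exhaustive scan over features $j$ and candidate thresholds $s$ that returns the RSS-minimizing split; the function name find_best_split and its interface are illustrative only.

import numpy as np

def find_best_split(X, y):
    # X: (n, p) array of features, y: (n,) array of outcomes
    n, p = X.shape
    best_j, best_s, best_rss = None, None, np.inf
    for j in range(p):
        # Candidate thresholds: midpoints between consecutive unique values of feature j
        values = np.unique(X[:, j])
        for s in (values[:-1] + values[1:]) / 2:
            left, right = y[X[:, j] <= s], y[X[:, j] > s]
            # RSS when each half-plane is predicted by its own mean
            rss = ((left - left.mean()) ** 2).sum() + ((right - right.mean()) ** 2).sum()
            if rss < best_rss:
                best_j, best_s, best_rss = j, s, rss
    return best_j, best_s, best_rss

# Example: the first split CART would make on the (Years, Hits) data above
# j_star, s_star, rss_star = find_best_split(X, y)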
The CART Algorithm¶
Begin with the root node, containing the complete sample. Then:
1. Identify the single RSS-minimizing split for this node.
2. Split this parent node into a left and a right child node.
3. Apply steps 1 and 2 to each child node.
4. Repeat until each leaf node reaches a pre-defined minimum size (for instance, stop splitting when fewer than 10 observations remain in a leaf).
How large should we grow the tree?¶
A large tree risks overfitting (high variance); a small tree risks underfitting (high bias).
The tree's level of expressiveness is captured by its size (the number of terminal nodes).
Common practice: Build a large tree and prune the tree backwards using cost-complexity pruning.
Cost-complexity pruning¶
The cost complexity criterion associated with a tree $T$ is given by
$$\mathrm{RSS}_{cp}(T)=\mathrm{RSS}(T)+cp|T|$$ where
- $\mathrm{RSS}$ is the sum of squared error for tree $T$.
- $|T|$ is the number of terminal nodes in tree $T$.
- $cp$ is the complexity parameter
- The complexity parameter is unit free and ranges from 0 to 1:
- When $cp = 0$, we have a fully saturated tree.
- When $cp = 1$, there are no splits, i.e., we predict the unconditional mean.
Hence, for CART, the penalty is a function of the number of terminal nodes.
NOTE: $cp$ and $|T|$ are analogous to $\lambda$ and $\lVert\boldsymbol{\beta}\rVert_1$ in the lasso.
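In scikit-learn, the same idea is exposed through the ccp_alpha parameter of the tree estimators (note that, unlike the normalized $cp$ above, ccp_alpha is not rescaled to $[0,1]$). A minimal sketch, assuming the X and y arrays defined above:

from sklearn.tree import DecisionTreeRegressor

# Grow a large tree, then recover the sequence of effective penalties for pruning
big_tree = DecisionTreeRegressor(random_state=0)
path = big_tree.cost_complexity_pruning_path(X, y)

# Refit one pruned tree per candidate alpha; a larger alpha gives fewer terminal nodes
pruned_trees = [DecisionTreeRegressor(random_state=0, ccp_alpha=a).fit(X, y) for a in path.ccp_alphas]
print([t.get_n_leaves() for t in pruned_trees])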
Regularization Hyperparameters¶
- Decision trees are nonparametric: left unconstrained, they keep splitting until they fit the training data almost perfectly.
- To avoid overfitting, constrain the tree with hyperparameters such as:
  - max_depth: the maximum depth of the tree
  - min_samples_split: the min number of samples a node must have before it can be split
  - min_samples_leaf: the min number of samples a leaf node must have
# Compute tree
overfit_tree = DecisionTreeRegressor(max_leaf_nodes=5).fit(X, y)
# Plot tree
fig, ax = plt.subplots(1,1)
plot_tree(overfit_tree, filled=True, feature_names=features, fontsize=12, ax=ax);
Avoid overfitting using a minimum number of observations per leaf¶
# Compute tree
no_overfit_tree = DecisionTreeRegressor(max_leaf_nodes=5, min_samples_leaf=10).fit(X, y)
# Plot tree
fig, ax = plt.subplots(1,1)
plot_tree(no_overfit_tree, filled=True, feature_names=features, fontsize=14, ax=ax);
Choosing optimal tree length using cross-validation¶
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_val_score
features = ['Years', 'Hits', 'RBI', 'PutOuts', 'Walks', 'Runs', 'AtBat', 'HmRun']
hitters=pd.read_csv('../../data/Hitters.csv').dropna()
X = hitters[features]
y =hitters['Salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Init
params = range(2, 11)
reg_scores = np.zeros((len(params), 3))
best_score = 10**6

# Loop over all parameters
for i, k in enumerate(params):
    # Model
    tree = DecisionTreeRegressor(max_leaf_nodes=k)

    # Fit on the training set and record train/test errors
    tree.fit(X_train, y_train)
    reg_scores[i, 0] = mean_squared_error(tree.predict(X_train), y_train)
    reg_scores[i, 1] = mean_squared_error(tree.predict(X_test), y_test)

    # Get CV score
    kf6 = KFold(n_splits=6)
    reg_scores[i, 2] = -cross_val_score(tree, X_train, y_train, cv=kf6, scoring='neg_mean_squared_error').mean()

    # Save best model
    if reg_scores[i, 2] < best_score:
        best_model = tree
        best_score = reg_scores[i, 2]
make_figure_optimal_depth()
The optimal tree size is 4 leaf nodes.
Classification¶
A classification tree is very similar to a regression tree, except that it is used to predict a qualitative response rather than a quantitative one.
For a classification tree, we predict that each observation belongs to the most commonly occurring class of training observations in the region to which it belongs.
Building a Classification Tree¶
- Similar to the task of growing a regression tree.
- However, RSS cannot be used as a criterion for making the binary splits.
- We define $\hat p_{mk}$ as the proportion of training observations in the $m^{th}$ region that are from the $k^{th}$ class.
Possible loss functions to decide the splits are:¶
Classification error rate $$ E = 1 - \max _{k}\left(\hat{p}_{m k}\right) $$
Gini index $$ G=\sum_{k=1}^{K} \hat{p}_{m k}\left(1-\hat{p}_{m k}\right) $$
Entropy $$ D=-\sum_{k=1}^{K} \hat{p}_{m k} \log \hat{p}_{m k} $$
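As a quick numerical illustration (not part of the original notebook), the three measures can be computed directly from the class proportions $\hat{p}_{mk}$ of a node; a fairly pure 80/20 node scores much lower than a maximally impure 50/50 node.

import numpy as np

def class_error(p):
    return 1 - np.max(p)

def gini(p):
    p = np.asarray(p, dtype=float)
    return np.sum(p * (1 - p))

def entropy(p):
    p = np.asarray(p, dtype=float)
    p = p[p > 0]  # convention: 0 * log(0) = 0
    return -np.sum(p * np.log(p))

for p in ([0.8, 0.2], [0.5, 0.5]):
    print(p, round(class_error(p), 3), round(gini(p), 3), round(entropy(p), 3))
# [0.8, 0.2] 0.2 0.32 0.5
# [0.5, 0.5] 0.5 0.5 0.693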
Cons¶
- Poor predictive performance (typically)
- High risk of overfitting
- Many branches lower the bias but increase the variance
- Trees can be very non-robust.
- In other words, a small change in the data can cause a large change in the final estimated tree.
Extension: bagging, random forests, and boosting.¶
These methods grow multiple trees which are then combined to yield a single consensus prediction.
Combining a large number of trees $\rightarrow$
- dramatic improvements in prediction accuracy,
- at the expense of some loss of interpretability.
XGBoost Ingredient 2 -- Bagging (Bootstrapping)¶

Bagging¶
The main problem of decision trees is that they suffer from high variance.
Bootstrap aggregation, or bagging, is a general-purpose procedure for reducing the variance of a statistical learning method.
- Averaging reduces the variance.
Bagging combines the predictions from multiple decision trees through a majority-voting mechanism (classification) or by averaging (regression), as sketched below.
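A minimal sketch of the idea (illustrative only; assumes NumPy arrays and binary 0/1 labels): draw B bootstrap samples, fit one tree per sample, and aggregate the predictions by majority vote.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bagged_predict(X_train, y_train, X_test, B=100, seed=0):
    # Fit B trees on bootstrap samples and return majority-vote predictions
    rng = np.random.default_rng(seed)
    n = len(y_train)
    votes = np.zeros((B, len(X_test)), dtype=int)
    for b in range(B):
        idx = rng.integers(0, n, size=n)  # bootstrap sample: n draws with replacement
        tree = DecisionTreeClassifier().fit(X_train[idx], y_train[idx])
        votes[b] = tree.predict(X_test)
    # Majority vote across the B trees
    return (votes.mean(axis=0) > 0.5).astype(int)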
Out-of-Bag Error Estimation¶
A very straightforward way to estimate the test error of a bagged model, without the need for cross-validation or a validation set.
- Bagging: trees are repeatedly fit to bootstrapped subsets of the observations.
- On average, each bagged tree makes use of around two-thirds of the observations (see the short simulation after this list).
- The remaining one-third of the observations not used to fit a given bagged tree are referred to as the out-of-bag (OOB) observations.
- We can predict the response for the $i$th observation using each of the trees in which that observation was OOB.
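The "two-thirds" figure comes from the bootstrap itself: each observation has probability $1-(1-1/n)^n \approx 1-1/e \approx 0.632$ of appearing in a given bootstrap sample. A quick simulation (illustrative, not from the original notebook) confirms this:

import numpy as np

rng = np.random.default_rng(0)
n, B = 300, 2000  # sample size and number of bootstrap draws (arbitrary choices)
in_bag_share = np.zeros(B)
for b in range(B):
    idx = rng.integers(0, n, size=n)  # one bootstrap sample
    in_bag_share[b] = len(np.unique(idx)) / n
print(in_bag_share.mean())  # ≈ 0.632, i.e. roughly two-thirds of observations are in-bag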
Example in the case of classification¶
We are now going to compute test and out-of-bag errors for bagged classification trees on the Heart dataset.
We use the Heart dataset on individual patients. We will try to use individual characteristics in order to predict heart disease (AHD). The outcome variable is binary: Yes, No.
# Load heart dataset
heart=pd.read_csv('../../data/Heart.csv').drop('Unnamed: 0', axis=1).dropna()
heart.head()
| | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No |
| 1 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes |
| 2 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes |
| 3 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No |
| 4 | 41 | 0 | nontypical | 130 | 204 | 0 | 2 | 172 | 0 | 1.4 | 1 | 0.0 | normal | No |
Pre-processing step¶
# Factorize (encode) categorical variables
heart.ChestPain = pd.factorize(heart.ChestPain)[0]
heart.Thal = pd.factorize(heart.Thal)[0]
# Set features
features = [col for col in heart.columns if col!='AHD']
X2 = heart[features]
y2 = pd.factorize(heart.AHD)[0]
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Init (takes a lot of time with J=30)
params = range(2, 50)
bagging_scores = np.zeros((len(params), 2))
J = 30

# Loop over parameters
for i, k in enumerate(params):
    print("Computing k=%1.0f" % k, end="")

    # Repeat J times
    temp_scores = np.zeros((J, 2))
    for j in range(J):
        X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.5, random_state=j)
        bagging = BaggingClassifier(DecisionTreeClassifier(), max_samples=k, oob_score=True)
        bagging.fit(X2_train, y2_train)
        temp_scores[j, 0] = bagging.score(X2_test, y2_test)
        temp_scores[j, 1] = bagging.oob_score_

    # Average
    bagging_scores[i, :] = np.mean(temp_scores, axis=0)
    print("", end="\r")
Computing k=49
Plot of the Out-of-Bag error computed while generating the bagged estimator.¶
make_new_figure()
Variable Importance Measures¶
Great property of random forests
The importance of a feature is measured by how much the tree nodes that use that feature reduce impurity, on average across all trees in the forest.
The measure relies on
- regression: the total amount of the RSS that is decreased due to splits over a given predictor, averaged over all trees.
- classification: we can add up the total amount that the Gini index is decreased by splits over a given predictor, averaged over all trees.
A large value indicates an important predictor.
scikit-learn computes this score automatically for each feature after training and stores it in the feature_importances_ attribute; the results are scaled to sum to 1.
# Compute feature importance
feature_importances = np.mean([tree.feature_importances_ for tree in bagging.estimators_], axis=0)
make_figure_feature_importance()
XGBoost Ingredient 3 -- Random Forest¶

Random Forest¶
= an ensemble of Decision Trees
Each tree gets its own sample of data.
At each tree split, a random sample of features is drawn; only those features are considered for splitting.
For each tree, error rate is computed using data outside its bootstrap sample.
Random Forest classifier¶
Let's split the data in two and compute the test accuracy and its out-of-bag estimate, for both bagging and the random forest.
from sklearn.ensemble import RandomForestClassifier
# Init (takes a lot of time with J=30)
params = range(2, 50)
forest_scores = np.zeros((len(params), 2))
J = 30

# Loop over parameters
for i, k in enumerate(params):
    print("Computing k=%1.0f" % k, end="")

    # Repeat J times
    temp_scores = np.zeros((J, 2))
    for j in range(J):
        X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.5, random_state=j)
        forest = RandomForestClassifier(n_estimators=k, oob_score=True, max_features="sqrt")
        forest.fit(X2_train, y2_train)
        temp_scores[j, 0] = forest.score(X2_test, y2_test)
        temp_scores[j, 1] = forest.oob_score_

    # Average
    forest_scores[i, :] = np.mean(temp_scores, axis=0)
    print("", end="\r")
Computing k=49
make_figure_bagging_vs_rf()
As for bagging, we can plot feature importance.
make_figure_feature_importance2()
XGBoost Ingredient 4 -- Boosting¶

Boosting¶
Any ensemble method that combines several weak learners into a strong learner.
Trees are grown sequentially: each tree is grown using information from previously grown trees.
- Boosting does not involve bootstrap sampling; instead each tree is fit on a modified version of the original data set.
The most used boosting methods:
- Adaptive Boosting: AdaBoost
- Gradient Boosting
XGBoost Ingredient 5 -- Gradient Boosting¶

Gradient boosting = an additive ensemble of trees¶
Each new tree is fit to the residuals of the trees already in the ensemble, and its predictions are added to theirs (see the sketch below).
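A minimal sketch of this additive, residual-fitting logic with squared-error loss (illustrative only; the hyperparameter names n_trees, learning_rate, and max_depth are assumptions, and X_train, y_train, X_test are the Hitters arrays defined above):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def gradient_boost(X_train, y_train, X_test, n_trees=100, learning_rate=0.1, max_depth=2):
    # Sequentially fit small trees to the residuals and add up their (shrunken) predictions
    prediction_train = np.zeros(len(y_train))
    prediction_test = np.zeros(len(X_test))
    for _ in range(n_trees):
        residuals = y_train - prediction_train
        tree = DecisionTreeRegressor(max_depth=max_depth).fit(X_train, residuals)
        prediction_train += learning_rate * tree.predict(X_train)
        prediction_test += learning_rate * tree.predict(X_test)
    return prediction_test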
XGBoost Ingredient 6 -- XGBoost¶

XGBoost = Extreme Gradient Boosting¶
$\rightarrow$ optimized implementation of Gradient Boosting
- easy to use
- actively developed
- efficient / parallelizable
- provides model explanations
- takes sparse matrices as input
import xgboost

# Fit a gradient-boosted regression ensemble with default hyperparameters on the Hitters training data
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)

# Predict salaries for the held-out test set
y_pred = xgb_reg.predict(X_test)
Feature Importance¶
from xgboost import plot_importance
plot_importance(xgb_reg)
<Axes: title={'center': 'Feature importance'}, xlabel='Importance score', ylabel='Features'>
Random forests and boosted trees provide a metric of feature importance that summarizes how well each feature contributes to predictive accuracy.