Foundations in Data Science and Machine Learning¶

Module 5b: Machine Learning - Classifications¶

Malka Guillot¶


Classification Framework¶

  • Response/target variable $y$ is qualitative (or categorical):

    • 2 categories $\rightarrow$ binary classification

    • More than 2 categories $\rightarrow$ multi-class classification

  • Features $X$:

    • can be high-dimensional
  • Instead of predicting a quantitative response, we want to assign a class to each observation

    $\rightarrow$ via the probability of belonging to the class

  • Classifier: An algorithm that maps the input data to a specific category.

  • Performance measures specific to classification

Classification process¶

  1. Model the probability

    • Probability of belonging to a category: $$P(y=1 \mid X)$$
  2. Predict the class

    • Rely on this probability to assign a class to the observation.
    • For example, we can assign class $1$ to all observations where $P(y = 1 \mid x) > 0.5$.
    • But we can also select a different threshold (see the sketch after this list).
  3. We can make errors

    • False negative
    • False positive
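As a minimal sketch of these three steps (the probabilities below are made up for illustration, not taken from a fitted model):

import numpy as np

# hypothetical predicted probabilities P(y=1 | X) for five observations
p_hat = np.array([0.10, 0.45, 0.55, 0.80, 0.95])

# step 2: assign class 1 whenever the probability exceeds the 0.5 threshold
y_hat = (p_hat > 0.5).astype(int)          # -> array([0, 0, 1, 1, 1])

# a different threshold changes the predictions, and hence which errors we make
y_hat_strict = (p_hat > 0.9).astype(int)   # -> array([0, 0, 0, 0, 1])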

Confusion Matrix¶

  • For comparing the predictions of the fitted model to the actual classes.

  • After applying a classifier to a data set with known labels (positive and negative):

                                  Truth
                            Negative               Positive
    Prediction  Negative    True negative (TN)     False negative (FN)
                Positive    False positive (FP)    True positive (TP)

Note that $TP+TN+FP+FN=N$, where $N$ is the number of observations.

The confusion matrix shows the number of observations by their predicted class and actual classes.

  • Correctly classified: TP & TN
  • Incorrectly classified: FP & FN
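A minimal sketch with scikit-learn, using toy labels (purely illustrative); note that confusion_matrix puts the actual classes in rows and the predictions in columns, i.e., transposed relative to the table above:

from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 1, 0, 1, 0]    # actual classes
y_pred = [0, 1, 1, 1, 0, 0, 1, 0]    # predicted classes

cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()          # unpacking order for the binary case
print(cm)                            # [[3 1]
                                     #  [1 3]]
print(tn + fp + fn + tp)             # = N = 8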

Precision and Recall¶

  • Accuracy: proportion of correctly classified observations

    • $ \frac{ \color{green}{\text{True Positives}} + \color{#90EE90}{\text{True Negatives}}} {N}$
  • Precision: accuracy of positive predictions

    • $ \frac{ \color{green}{\text{True Positives}}} {\color{green}{\text{True Positives}} + \color{orange}{\text{False Positives}}}$

    • decreases with false positives.

  • Recall: proportion of true positives among all actual positives

    • $ \frac{\color{green}{\text{True Positives}}}{\color{green}{\text{True Positives}} + \color{blue}{\text{False Negatives}}}$

    • decreases with false negatives.

F1 Score¶

  • The $F_{1}$ score provides a single combined metric: it is the harmonic mean of precision and recall

    $$\begin{aligned} F_{1} &= \frac{2}{\frac{1}{\text{precision}}+\frac{1}{\text{recall}}} = 2\times\frac{\text{precision}\times\text{recall}}{\text{precision}+\text{recall}} \\\\ &= \frac{\text{True Positives}}{\text{True Positives}+\frac{1}{2}(\text{False Negatives}+\text{False Positives})} \end{aligned}$$

  • The harmonic mean gives more weight to low values.

  • The F1 score values precision and recall symmetrically.
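A small sketch computing these metrics with scikit-learn on toy labels (the values are illustrative only):

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_true = [0, 0, 0, 1, 1, 1, 1, 1]    # actual classes (TP=3, FP=1, FN=2, TN=2)
y_pred = [0, 0, 1, 1, 1, 0, 0, 1]    # predicted classes

print("accuracy :", accuracy_score(y_true, y_pred))    # (TP + TN) / N  = 5/8 = 0.625
print("precision:", precision_score(y_true, y_pred))   # TP / (TP + FP) = 3/4 = 0.75
print("recall   :", recall_score(y_true, y_pred))      # TP / (TP + FN) = 3/5 = 0.60
print("F1       :", f1_score(y_true, y_pred))          # harmonic mean        ≈ 0.67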

Logistic Regression¶

  • Prediction:

$$\hat{y} = \begin{cases} 0 & \textrm{ if } \hat{p}<.5 \\\\ 1 & \textrm{ if } \hat{p}\geq.5 \end{cases} $$
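For completeness (the slide's figure is not reproduced here), recall the standard logistic regression model: the probability is obtained by applying the sigmoid function to the linear score,

$$\hat{p} = \sigma(\mathbf{x}^{\top}\theta) = \frac{1}{1+e^{-\mathbf{x}^{\top}\theta}}$$

so that the prediction rule above simply thresholds $\hat{p}$ at $0.5$.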

Logistic Regression Cost Function¶

  • The cost function to minimize is the log loss: $$J(\theta)=-\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\left(\hat{p}^{(i)}\right)+\left(1-y^{(i)}\right)\log\left(1-\hat{p}^{(i)}\right)\right]$$
  • this does not have a closed-form solution

  • but it is convex, so gradient descent will find the global minimum.

  • Just like linear models, logistic regression can be regularized with L1 or L2 penalties, e.g.: $$J_{2}(\theta)=J(\theta)+\alpha_{2}\frac{1}{2}\sum_{i=1}^{n}\theta_{i}^{2}$$
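A minimal numpy sketch of this cost function and its L2-penalized variant (the function names and toy data are illustrative; $J(\theta)$ is taken to be the average log loss above):

import numpy as np

def log_loss_cost(theta, X, y):
    """Average log loss J(theta) for logistic regression."""
    p_hat = 1.0 / (1.0 + np.exp(-X @ theta))   # sigmoid of the linear score
    return -np.mean(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat))

def l2_penalized_cost(theta, X, y, alpha2=1.0):
    """J_2(theta) = J(theta) + alpha_2 * 1/2 * sum(theta_i^2)."""
    return log_loss_cost(theta, X, y) + alpha2 * 0.5 * np.sum(theta ** 2)

# tiny illustrative example (first column of X acts as the intercept)
X_toy = np.array([[1.0, 0.2], [1.0, 1.5], [1.0, -0.7]])
y_toy = np.array([0, 1, 0])
print(log_loss_cost(np.zeros(2), X_toy, y_toy))        # log(2) ≈ 0.693 at theta = 0
print(l2_penalized_cost(np.zeros(2), X_toy, y_toy))    # same here: the penalty is 0 at theta = 0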

Implementation of the logistic regression¶

In [2]:
import numpy as np
import pandas as pd
# import patsy

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 

import seaborn as sns
import matplotlib.pyplot as plt

Load & visualise data¶

In [3]:
df=pd.read_csv("../../data/beers.csv")
df.shape
Out[3]:
(225, 5)
In [4]:
df.head()
Out[4]:
alcohol_content bitterness darkness fruitiness is_yummy
0 3.739295 0.422503 0.989463 0.215791 0
1 4.207849 0.841668 0.928626 0.380420 0
2 4.709494 0.322037 5.374682 0.145231 1
3 4.684743 0.434315 4.072805 0.191321 1
4 4.148710 0.570586 1.461568 0.260218 0
In [5]:
f, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='is_yummy', data=df)
_ = plt.title('# Yummy vs not yummy')
_ = plt.xlabel('Class (1==Yummy)')
[Output: count plot titled '# Yummy vs not yummy' by class]

Prepare data: split features and labels¶

In [6]:
# all columns up to the last one:
X = df.iloc[:, :-1]
# only the last column:
y = df.iloc[:, -1]

Splitting into Training and Test set¶

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

Model Building and Training¶

Creating the pipeline¶

Before we build the model,

  1. we use StandardScaler to scale the feature values to a common range;
  2. next, we create an instance of LogisticRegression() for the logistic regression.

We are not passing any parameters to LogisticRegression(), so it will use its defaults. Some of the important parameters you should know are:

  • penalty: It specifies the norm for the penalty

    • Default = L2
  • C: It is the inverse of regularization strength

    • Default = 1.0
  • solver: It denotes the optimizer algorithm

    • Default = ‘lbfgs’

We use a Pipeline to create the model, streamlining the standard scaler and model-building steps.

In [8]:
scaler = StandardScaler()

lr = LogisticRegression(max_iter=10000, solver='lbfgs') # syntax if you want to add hyperparameters

model1 = Pipeline([('standardize', scaler),
                    ('log_reg', lr)
                  ])
model1
Out[8]:
Pipeline(steps=[('standardize', StandardScaler()),
                ('log_reg', LogisticRegression(max_iter=10000))])
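For reference, here is a small sketch (not an original cell) of the same pipeline with the defaults listed above written out explicitly; penalty='l2', C=1.0 and solver='lbfgs' are scikit-learn's defaults, and the names lr_explicit / model_explicit are illustrative:

# equivalent to model1, but with the default hyperparameters spelled out
lr_explicit = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=10000)

model_explicit = Pipeline([('standardize', StandardScaler()),
                           ('log_reg', lr_explicit)])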

Fit our model to the training data¶

In [9]:
model1.fit(X_train, y_train)
Out[9]:
Pipeline(steps=[('standardize', StandardScaler()),
                ('log_reg', LogisticRegression(max_iter=10000))])
In [10]:
model1.get_params
Out[10]:
<bound method Pipeline.get_params of Pipeline(steps=[('standardize', StandardScaler()),
                ('log_reg', LogisticRegression(max_iter=10000))])>

Predictions for the class and for the probabilities¶

In [11]:
y_train_hat = model1.predict(X_train) # predicting on the training set
y_train_hat[:10]
Out[11]:
array([0, 0, 0, 1, 1, 0, 1, 0, 0, 0])
In [12]:
y_train_hat_probs = model1.predict_proba(X_train)[:,1] # probabilities of being in class 1
y_train_hat_probs[:10]
Out[12]:
array([6.36362252e-02, 4.14138359e-01, 3.13906994e-05, 6.43635817e-01,
       9.55425992e-01, 4.74967980e-05, 9.62592994e-01, 7.70180013e-02,
       3.88115657e-03, 1.48066398e-03])

We can see that the model predicts $y_i=1$ when $p_i>0.5$:

In [13]:
temp = pd.DataFrame({'y_train_hat': y_train_hat, 'y_train_hat_probs': y_train_hat_probs})
temp.head(10)
Out[13]:
y_train_hat y_train_hat_probs
0 0 0.063636
1 0 0.414138
2 0 0.000031
3 1 0.643636
4 1 0.955426
5 0 0.000047
6 1 0.962593
7 0 0.077018
8 0 0.003881
9 0 0.001481
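As a quick sanity check of that statement (a small sketch, not an original cell), thresholding the probabilities at 0.5 reproduces the predicted classes:

# thresholding the probabilities reproduces model1.predict on the training set
manual_class = (y_train_hat_probs > 0.5).astype(int)
print((manual_class == y_train_hat).all())   # expected: True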
In [14]:
temp['y_train_hat_probs'].hist()
Out[14]:
<Axes: >
[Output: histogram of the predicted probabilities]

Sensitivity/Specificity Trade-off¶

The Precision/Recall Trade-off¶

  • $F_1$ favors classifiers with similar precision and recall,
  • but sometimes you want asymmetry:
  1. low recall + high precision is better

    • e.g. deciding “guilty” in court, you might prefer a model that
    • lets many actual-guilty go free (high false negatives $\leftrightarrow$ low recall)...
    • ... but has very few actual-innocent put in jail (low false positives $\leftrightarrow$ high precision).

  2. high recall + low precision is better

    • e.g. a classifier to detect bombs during flight screening, you might prefer a model that
    • has many false alarms (low precision)...
    • ... to minimize the number of misses (high recall).

Notes: there is a trade-off between making false positive errors ($\rightarrow$ precision) and false negative errors ($\rightarrow$ recall).

Classification rule¶

To classify individuals as positive/negative we first need to set a classification rule (cut-off), i.e., a probability $p^*$ above which we classify an individual as positive.

Usually, we use $p^*=0.5$.

This means that whenever $\hat{p}_i > 0.5$, we would classify individual $i$ as positive.

QUESTION: Is this rule overly aggressive or passive?
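One way to explore this is to recompute the confusion matrix for a few cut-offs (a sketch reusing the fitted model1 and the test split from above; the cut-off values are arbitrary):

# predicted probabilities on the test set, then several candidate cut-offs p*
p_test = model1.predict_proba(X_test)[:, 1]

for p_star in (0.3, 0.5, 0.7):
    y_pred_star = (p_test >= p_star).astype(int)
    print("p* =", p_star)
    print(confusion_matrix(y_test, y_pred_star))   # a lower p* yields more positives: more FP, fewer FN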

Visualisation of the trade-off using the ROC Curve¶

i.e., the receiver operating characteristic (ROC) curve

  • A popular graphic for simultaneously displaying the types of errors for classification problems at various threshold values for $p^*$

  • For each threshold $p^*$, we can compute the confusion matrix $\rightarrow$ calculate sensitivity and specificity

  • The ROC curve of

    • a completely random probability prediction is the 45 degree line
    • a perfect probability prediction would jump from zero to one and stay at one (both reference cases are checked numerically in the sketch below)
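A quick numerical check of these two reference cases with simulated labels (a sketch; the sample size and random seed are arbitrary):

import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
y_sim = rng.integers(0, 2, size=1000)        # simulated binary labels

p_random = rng.random(1000)                  # random scores: ROC hugs the 45-degree line
p_perfect = y_sim.astype(float)              # perfect scores: ROC jumps straight to one

print("AUC, random scores :", roc_auc_score(y_sim, p_random))    # close to 0.5
print("AUC, perfect scores:", roc_auc_score(y_sim, p_perfect))   # exactly 1.0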

Visualisation of the trade-off using the ROC Curve¶

The ROC curve plots sensitivity against 1 - specificity: it highlights the trade-off between the true-positive and false-positive rates as the classifier threshold is varied.

[Figure: ROC curves illustrating the sensitivity/specificity trade-off]

Source: "Machine Learning with R: Expert techniques for predictive modeling"

ROC Curve¶

  • Plots the true positive rate (recall) against the false positive rate ($\frac{FP}{FP + TN}$)
  • for the classifications obtained from a probabilistic prediction as the threshold is moved from 0 to 1
  • x-axis: false positive rate = the proportion of FP among actual negatives
  • y-axis: true positive rate = the proportion of TP among actual positives (RECALL)

ROC Curve and AUC¶

  • The Area Under (the ROC) Curve (AUC) is a popular metric ranging between:

    • 0.5

      • random classification
      • ROC curve $=$ first diagonal
    • and 1

      • perfect classification
      • $=$ area of the square
    • better classifier $\rightarrow$ ROC curve toward the top-left corner

  • Good measure for model comparison

Performance on the training set¶

In [15]:
train_accuracy = accuracy_score(y_train, y_train_hat)*100
train_auc_roc = roc_auc_score(y_train, y_train_hat_probs)*100

print('Confusion matrix:\n', confusion_matrix(y_train, y_train_hat))
print('Training AUC: %.4f %%' % train_auc_roc)
print('Training accuracy: %.4f %%' % train_accuracy)
Confusion matrix:
 [[78  8]
 [ 5 89]]
Training AUC: 98.1321 %
Training accuracy: 92.7778 %

Test set¶

In [16]:
# Predictions on the test set
y_test_hat = model1.predict(X_test)
y_test_hat_probs = model1.predict_proba(X_test)[:,1] # Probabilities of being in class 1

# Metrics
test_accuracy = accuracy_score(y_test, y_test_hat)*100
test_auc_roc = roc_auc_score(y_test, y_test_hat_probs)*100

print("Accuracy in test data {} vs. in train data {}".format(test_accuracy, train_accuracy) )
print("AUC in test data {} vs. in train data {}".format(test_auc_roc, train_auc_roc) )
Accuracy in test data 91.11111111111111 vs. in train data 92.77777777777779
AUC in test data 96.6 vs. in train data 98.13211281543789
In [17]:
print(classification_report(y_test, y_test_hat, digits=2))
              precision    recall  f1-score   support

           0       0.83      1.00      0.91        20
           1       1.00      0.84      0.91        25

    accuracy                           0.91        45
   macro avg       0.92      0.92      0.91        45
weighted avg       0.93      0.91      0.91        45

Plot the ROC curve¶

In [18]:
from sklearn import metrics

fpr, tpr, threshold = metrics.roc_curve(y_test, y_test_hat_probs)
print("tresholds:",  len(threshold))
roc_auc = metrics.auc(fpr, tpr)
roc_auc
thresholds: 8
Out[18]:
0.966
In [ ]:
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.legend(loc='lower right')   # display the AUC label defined above
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
[Output: ROC curve plot]

We can try to further improve the model's performance through hyperparameter tuning, e.g., by changing the value of C or choosing another solver available in LogisticRegression().
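For instance, here is a sketch of such a tuning step with GridSearchCV (already imported above); the grid values and the scoring choice are illustrative, not recommendations:

# search over the pipeline's logistic-regression step (named 'log_reg' above)
param_grid = {'log_reg__C': [0.01, 0.1, 1, 10, 100],        # inverse regularization strength
              'log_reg__solver': ['lbfgs', 'liblinear']}    # two of the available solvers

grid = GridSearchCV(model1, param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)    # cross-validated AUC of the best combination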