Model Evaluation Overview
Model evaluation is a critical step in the machine learning workflow that helps us:
- Understand how well the model performs on unseen data
- Compare performance between different models
- Select the best model and hyperparameters
- Identify overfitting or underfitting issues
- Ensure the model can generalize to new data
At its core, model evaluation uses appropriate metrics to measure the model's predictive capacity, together with techniques such as data splitting and cross-validation that make the evaluation results reliable.
Classification Model Evaluation Metrics
1. Confusion Matrix
The confusion matrix is a fundamental tool for evaluating classification model performance. It shows the relationship between model predictions and actual labels.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Load dataset
iris = load_iris()
X = iris.data
y = (iris.target == 0).astype(int)  # Binary classification: is the sample Iris setosa?
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Visualize confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
# Calculate confusion matrix elements
tn, fp, fn, tp = cm.ravel()
print(f"True Negative: {tn}")
print(f"False Positive: {fp}")
print(f"False Negative: {fn}")
print(f"True Positive: {tp}")
2. Accuracy, Precision, Recall, and F1 Score
Based on the confusion matrix, we can calculate the following commonly used classification evaluation metrics:
- Accuracy: The proportion of correctly predicted samples out of all samples
- Precision: The proportion of positive predictions that are actually positive
- Recall: The proportion of actual positives that are correctly predicted
- F1 Score: The harmonic mean of precision and recall
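The sklearn helpers below compute these directly, but the arithmetic is worth seeing once. This short sketch reuses the tn, fp, fn, tp counts from the confusion matrix above:
# Accuracy  = (TP + TN) / total
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
# F1        = 2 * P * R / (P + R)
acc = (tp + tn) / (tp + tn + fp + fn)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
print(f"Manual accuracy: {acc:.4f}, precision: {prec:.4f}, recall: {rec:.4f}, "
      f"F1: {2 * prec * rec / (prec + rec):.4f}")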
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
# For multi-class classification, we need to specify an averaging method
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
# Load dataset
digits = load_digits()
X = digits.data
y = digits.target
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Calculate multi-class evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
f1_macro = f1_score(y_test, y_pred, average='macro')
precision_weighted = precision_score(y_test, y_pred, average='weighted')
recall_weighted = recall_score(y_test, y_pred, average='weighted')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
print("\nMulticlass Classification Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Precision (Weighted): {precision_weighted:.4f}")
print(f"Recall (Weighted): {recall_weighted:.4f}")
print(f"F1 Score (Weighted): {f1_weighted:.4f}")
3. ROC Curve and AUC
The ROC (Receiver Operating Characteristic) curve and AUC (Area Under the Curve) are important tools for evaluating binary classifiers because they measure performance across all classification thresholds rather than at a single one.
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
# Calculate ROC curve and AUC for binary classification.
# `model` currently refers to the multi-class digits classifier, so re-fit the
# binary Iris model from Section 1 first
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, (iris.target == 0).astype(int), test_size=0.2, random_state=42)
model = LogisticRegression().fit(X_train, y_train)
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Visualize ROC curve
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
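The roc_auc_score function imported above computes the AUC directly from labels and scores, without constructing the curve. Moving the decision threshold away from the default 0.5 trades recall against precision; a brief illustration (the 0.3 threshold is an arbitrary example value):
# AUC without constructing the curve explicitly
print(f"AUC via roc_auc_score: {roc_auc_score(y_test, y_score):.4f}")
# Lowering the threshold flags more samples as positive (higher recall, lower precision)
y_pred_low_threshold = (y_score >= 0.3).astype(int)
print(f"Positives at threshold 0.5: {(y_score >= 0.5).sum()}, at 0.3: {y_pred_low_threshold.sum()}")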
# For multi-class classification, we use the One-vs-Rest (OvR) approach
digits = load_digits()
X = digits.data
y = digits.target
# Binarize labels
y_bin = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
n_classes = y_bin.shape[1]
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.2, random_state=42)
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, np.argmax(y_train, axis=1))
# Predict probabilities
y_score = model.predict_proba(X_test)
# Calculate ROC curve and AUC for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Calculate macro-average ROC curve and AUC
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Visualize ROC curve
plt.figure(figsize=(12, 10))
plt.plot(fpr["macro"], tpr["macro"], color='red', linestyle=':', linewidth=2, label=f'Macro-average ROC curve (area = {roc_auc["macro"]:.4f})')
for i in range(3):  # Only show ROC curves for the first 3 classes
    plt.plot(fpr[i], tpr[i], lw=2, label=f'Class {i} (area = {roc_auc[i]:.4f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Multiclass')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
4. Precision-Recall Curve
The precision-recall (PR) curve is another important tool for evaluating classification models, and it is especially informative on imbalanced datasets.
from sklearn.metrics import precision_recall_curve, average_precision_score
# Calculate the precision-recall curve.
# After the multi-class ROC example, y_test holds binarized digit labels and
# y_score holds per-class probabilities, so treat "digit 0 vs. rest" as the
# binary task (conveniently, it is also imbalanced: roughly 10% positives)
precision, recall, thresholds = precision_recall_curve(y_test[:, 0], y_score[:, 0])
average_precision = average_precision_score(y_test[:, 0], y_score[:, 0])
# Visualize precision-recall curve
plt.figure(figsize=(10, 8))
plt.plot(recall, precision, color='blue', lw=2, label=f'PR curve (AP = {average_precision:.4f})')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="best")
plt.grid(True)
plt.show()
Regression Model Evaluation Metrics
1. Mean Squared Error, Root Mean Squared Error, and Mean Absolute Error
Regression model evaluation metrics primarily measure the difference between predicted values and actual values:
- Mean Squared Error (MSE): The average of squared differences between predicted and actual values
- Root Mean Squared Error (RMSE): The square root of MSE, having the same unit as the target variable
- Mean Absolute Error (MAE): The average of absolute differences between predicted and actual values
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
import numpy as np
# Load dataset (load_boston was removed in scikit-learn 1.2; use California housing instead)
housing = fetch_california_housing()
X = housing.data
y = housing.target
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
2. R² Score
The R² score (coefficient of determination) measures the model's ability to explain the variance of the target variable. It ranges from (-∞, 1], where 1 indicates perfect prediction.
from sklearn.metrics import r2_score
# Calculate R² score
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")
# Calculate adjusted R² score
n = len(y_test)
p = X_test.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R² Score: {adjusted_r2:.4f}")
Model Selection Methods
1. Training Set, Validation Set, and Test Set
To properly evaluate model performance and avoid overfitting, we typically split the dataset into three parts:
- Training set: Used to train the model
- Validation set: Used to tune model hyperparameters and select models
- Test set: Used for final model performance evaluation
from sklearn.model_selection import train_test_split
# First split: separate the training set from a temporary (validation + test) pool
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
# Second split: divide the pool evenly into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")
# Use training set to train model, validation set to tune parameters, test set to evaluate
model = LinearRegression()
model.fit(X_train, y_train)
# Evaluate on validation set
val_score = model.score(X_val, y_val)
print(f"Validation R² Score: {val_score:.4f}")
# Final evaluation on test set
test_score = model.score(X_test, y_test)
print(f"Test R² Score: {test_score:.4f}")
2. Cross-Validation
Cross-validation is a more reliable model evaluation method that reduces the variance of evaluation by splitting the dataset multiple times and calculating average performance.
from sklearn.model_selection import cross_val_score, KFold
# K-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kfold, scoring='r2')
print(f"Cross-validation scores: {scores}")
print(f"Mean CV score: {scores.mean():.4f}")
print(f"CV score standard deviation: {scores.std():.4f}")
# Leave-One-Out cross-validation
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
# R² is undefined on a single test sample, so use a per-sample metric such as MAE
scores = cross_val_score(model, X[:100], y[:100], cv=loo,
                         scoring='neg_mean_absolute_error')  # first 100 samples for faster computation
print(f"\nLeave-One-Out CV scores (negative MAE): {scores[:10]}...")  # only show first 10 scores
print(f"Mean LOOCV MAE: {-scores.mean():.4f}")
print(f"LOOCV MAE standard deviation: {scores.std():.4f}")
# Stratified K-fold cross-validation (for classification problems)
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import load_iris
iris = load_iris()
X_iris = iris.data
y_iris = iris.target
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(LogisticRegression(), X_iris, y_iris, cv=stratified_kfold, scoring='accuracy')
print(f"\nStratified K-Fold CV scores: {scores}")
print(f"Mean Stratified CV score: {scores.mean():.4f}")
print(f"Stratified CV score standard deviation: {scores.std():.4f}")
3. Grid Search and Random Search
Grid search and random search are two common methods for hyperparameter tuning. They find the best combination by searching the hyperparameter space.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# Grid search
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print(f"Best parameters found by GridSearchCV: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
print(f"Test set score: {grid_search.score(X_test, y_test):.4f}")
# Random search
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [3, 5, 7, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4)
}
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
print(f"\nBest parameters found by RandomizedSearchCV: {random_search.best_params_}")
print(f"Best cross-validation score: {random_search.best_score_:.4f}")
print(f"Test set score: {random_search.score(X_test, y_test):.4f}")
Overfitting and Underfitting
1. Identifying Overfitting and Underfitting
Overfitting and underfitting are common problems in machine learning:
- Overfitting: The model performs very well on the training set but poorly on the test set, indicating the model is too complex and has learned noise in the training data
- Underfitting: The model performs poorly on both the training set and test set, indicating the model is too simple to capture patterns in the data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Generate synthetic data
np.random.seed(42)
x = np.linspace(-3, 3, 100)
y = 2 * x**3 - 3 * x**2 + np.random.normal(0, 5, size=x.shape)
x = x.reshape(-1, 1)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
# Try different polynomial degrees
degrees = [1, 2, 3, 10, 20]
train_scores = []
test_scores = []
for degree in degrees:
    # Create polynomial regression model
    pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression())
    ])
    # Train model
    pipeline.fit(X_train, y_train)
    # Evaluate model
    train_score = pipeline.score(X_train, y_train)
    test_score = pipeline.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
# Plot train/test scores against model complexity
plt.figure(figsize=(12, 8))
plt.plot(degrees, train_scores, 'o-', label='Training score')
plt.plot(degrees, test_scores, 's-', label='Test score')
plt.xlabel('Polynomial degree')
plt.ylabel('R² score')
plt.title('Model Complexity vs. Performance')
plt.legend()
plt.grid(True)
plt.show()
# Visualize models with different complexity
plt.figure(figsize=(15, 10))
for i, degree in enumerate(degrees):
    plt.subplot(2, 3, i + 1)
    # Create and train model
    pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression())
    ])
    pipeline.fit(X_train, y_train)
    # Generate predictions
    x_plot = np.linspace(-3, 3, 100).reshape(-1, 1)
    y_plot = pipeline.predict(x_plot)
    # Plot data and model
    plt.scatter(X_train, y_train, label='Training data')
    plt.scatter(X_test, y_test, label='Test data')
    plt.plot(x_plot, y_plot, 'r-', label=f'Model (degree={degree})')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title(f'Degree {degree}\nTrain R²: {pipeline.score(X_train, y_train):.3f}, Test R²: {pipeline.score(X_test, y_test):.3f}')
    plt.legend()
    plt.grid(True)
plt.tight_layout()
plt.show()
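Beyond the complexity curve above, scikit-learn's learning_curve utility shows how performance evolves with training-set size, which helps distinguish high-bias (underfitting) from high-variance (overfitting) behavior. A minimal sketch on the same synthetic data; the degree of 3 here is chosen to match the true generating function:
from sklearn.model_selection import learning_curve
# Learning curve for the degree-3 model: score vs. number of training samples
pipeline = Pipeline([('poly', PolynomialFeatures(degree=3)), ('linear', LinearRegression())])
train_sizes, train_curve, val_curve = learning_curve(
    pipeline, x, y, cv=5, train_sizes=np.linspace(0.2, 1.0, 5), scoring='r2')
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_curve.mean(axis=1), 'o-', label='Training score')
plt.plot(train_sizes, val_curve.mean(axis=1), 's-', label='Cross-validation score')
plt.xlabel('Training set size')
plt.ylabel('R² score')
plt.title('Learning Curve (degree=3)')
plt.legend()
plt.grid(True)
plt.show()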
2. Methods to Prevent Overfitting
Common methods to prevent overfitting include:
- Data augmentation: Increasing training data by adding noise, transformations, etc.
- Regularization: Adding penalty terms to the loss function, such as L1 and L2 regularization
- Dropout: Randomly deactivating neurons in neural networks
- Early stopping: Stopping training when validation performance starts to degrade
- Model ensembling: Combining predictions from multiple models
- Feature selection: Selecting the most relevant features to reduce feature dimensionality
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
# Regularization: Ridge (L2). Applying the penalty on top of degree-10
# polynomial features gives it the overfitting demonstrated above to counteract
ridge = Pipeline([('poly', PolynomialFeatures(degree=10)),
                  ('scale', StandardScaler()),
                  ('ridge', Ridge(alpha=1.0))])
ridge.fit(X_train, y_train)
print(f"Ridge regression - Train score: {ridge.score(X_train, y_train):.4f}, Test score: {ridge.score(X_test, y_test):.4f}")
# Regularization: Lasso (L1 regularization)
lasso = Pipeline([('poly', PolynomialFeatures(degree=10)),
                  ('scale', StandardScaler()),
                  ('lasso', Lasso(alpha=0.1, max_iter=10000))])
lasso.fit(X_train, y_train)
print(f"Lasso regression - Train score: {lasso.score(X_train, y_train):.4f}, Test score: {lasso.score(X_test, y_test):.4f}")
# Model ensembling: Bagging (note: `base_estimator` was renamed to `estimator` in scikit-learn 1.2)
bagging = BaggingRegressor(estimator=LinearRegression(), n_estimators=10, random_state=42)
bagging.fit(X_train, y_train)
print(f"Bagging regression - Train score: {bagging.score(X_train, y_train):.4f}, Test score: {bagging.score(X_test, y_test):.4f}")
# Model ensembling: AdaBoost
adaboost = AdaBoostRegressor(n_estimators=50, random_state=42)
adaboost.fit(X_train, y_train)
print(f"AdaBoost regression - Train score: {adaboost.score(X_train, y_train):.4f}, Test score: {adaboost.score(X_test, y_test):.4f}")
Practical Case: Model Evaluation and Selection
In this practical case, we'll run regression analysis on the California housing dataset with multiple models, and select the best model through cross-validation and grid search.
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
# Load dataset (load_boston was removed in scikit-learn 1.2; use California housing)
housing = fetch_california_housing()
# Subsample so the nested grid searches below stay reasonably fast
X, y = housing.data[:2000], housing.target[:2000]
# Define models and parameter grids
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': GridSearchCV(Ridge(), {'alpha': [0.01, 0.1, 1, 10, 100]}, cv=5),
    'Lasso': GridSearchCV(Lasso(), {'alpha': [0.01, 0.1, 1, 10, 100]}, cv=5),
    'Decision Tree': GridSearchCV(DecisionTreeRegressor(random_state=42),
                                  {'max_depth': [3, 5, 7, 10, None]}, cv=5),
    'Random Forest': GridSearchCV(RandomForestRegressor(random_state=42),
                                  {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, None]}, cv=5),
    'Gradient Boosting': GridSearchCV(GradientBoostingRegressor(random_state=42),
                                      {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5]}, cv=5),
    'SVR': GridSearchCV(make_pipeline(StandardScaler(), SVR()),
                        {'svr__C': [0.1, 1, 10, 100], 'svr__gamma': [0.001, 0.01, 0.1, 1]}, cv=5)
}
# Evaluate all models
results = {}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    # Create pipeline with standardization
    if name != 'SVR':  # SVR already includes scaling in its pipeline
        pipeline = make_pipeline(StandardScaler(), model)
    else:
        pipeline = model
    # Cross-validation
    scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
    # Train model
    pipeline.fit(X, y)
    # Store results
    results[name] = {
        'mean_score': scores.mean(),
        'std_score': scores.std(),
        'all_scores': scores
    }
    print(f"{name}: Mean R² = {scores.mean():.4f}, Std = {scores.std():.4f}")
# Find best model
best_model = max(results, key=lambda x: results[x]['mean_score'])
print(f"\nBest model: {best_model}")
print(f"Best mean score: {results[best_model]['mean_score']:.4f}")
# Visualize results
plt.figure(figsize=(12, 8))
model_names = list(results.keys())
mean_scores = [results[name]['mean_score'] for name in model_names]
std_scores = [results[name]['std_score'] for name in model_names]
plt.barh(model_names, mean_scores, xerr=std_scores, capsize=5)
plt.xlabel('Mean R² Score')
plt.ylabel('Model')
plt.title('Cross-Validation Results for Different Models')
plt.xlim(0, 1)
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()
Interactive Exercises
Exercise 1: Classification Model Evaluation
Use scikit-learn's breast_cancer dataset to train a classification model and evaluate its performance; a starter skeleton follows the checklist below.
- Load the breast_cancer dataset
- Split into training and test sets
- Train a classification model (e.g., Random Forest)
- Calculate accuracy, precision, recall, and F1 score
- Plot confusion matrix and ROC curve
- Evaluate the model using cross-validation
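A minimal starting skeleton, assuming the standard scikit-learn loaders and the metrics shown earlier in this tutorial:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# TODO: compute accuracy/precision/recall/F1 and plot the confusion matrix
# and ROC curve, following the patterns from the sections above
print(cross_val_score(clf, data.data, data.target, cv=5, scoring='accuracy').mean())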
Exercise 2: Regression Model Evaluation
Use scikit-learn's diabetes dataset to train a regression model and evaluate its performance; a starter skeleton follows the checklist below.
- Load the diabetes dataset
- Split into training and test sets
- Train a regression model (e.g., Gradient Boosting)
- Calculate MSE, RMSE, MAE, and R² score
- Use grid search to tune model hyperparameters
- Evaluate the tuned model using cross-validation
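A corresponding skeleton for the regression exercise (the parameter grid here is just a starting point, not a recommendation):
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42)
search = GridSearchCV(GradientBoostingRegressor(random_state=42),
                      {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]},
                      cv=5, scoring='r2')
search.fit(X_train, y_train)
# TODO: compute MSE, RMSE, MAE, and R² on the test set as shown earlier
print(search.best_params_, search.score(X_test, y_test))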
Exercise 3: Overfitting and Underfitting
Use a synthetic dataset to demonstrate overfitting and underfitting problems, and try to solve them.
- Generate a nonlinear synthetic dataset
- Try models with different complexity (e.g., different depth decision trees)
- Plot learning curves to identify overfitting and underfitting
- Use regularization or other methods to prevent overfitting
- Evaluate the effectiveness of different methods