"""Metrics calculation functions for the Calibration Measure package."""
__author__ = "Kwok Lung Jason Fan"
__copyright__ = "Copyright 2024"
__credits__ = ["Kwok Lung Jason Fan", "Qian Cao"]
__license__ = "Apache License 2.0"
__version__ = "0.1"
__maintainer__ = "Kwok Lung Jason Fan"
__email__ = "kwoklung.fan@fda.hhs.gov"
__status__ = "Development"
import scipy.stats as stats
import numpy as np
from .utils import *
import statsmodels.api as sm
import statsmodels.nonparametric.smoothers_lowess as lowess
import numpy.lib.recfunctions as rf
import contextlib
[docs]
def hosmer_lemeshow_test(reliability, confidence, bin_count, df=None, **kwargs):
"""
Compute the Hosmer-Lemeshow test for goodness of fit.
This test is used to assess the calibration of binary classification models with full probability outputs.
It compares observed and expected frequencies of events in groups of the data.
Args:
reliability (array-like): Observed proportion of positive samples in each bin.
confidence (array-like): Predicted probabilities for each bin.
bin_count (array-like): Number of samples in each bin.
df (int, optional): Degrees of freedom for the test. Defaults is nbins - 2.
Returns:
chi_squared (float): The chi-squared statistic of the Hosmer-Lemeshow test.
p_value (float): The p-value associated with the chi-squared statistic.
df (int): The degrees of freedom for the test.
Note:
- The Hosmer-Lemeshow test is widely used for assessing calibration in probabiliticst models.
- A small p-value (typically < 0.05) suggests that the model is a poor fit to the data.
- This test can be sensitive to the number of groups and sample size.
- It is recommended to use the Hosmer-Lemeshow test in conjunction with other metrics.
"""
# Convert inputs to numpy arrays for consistent handling
reliability = np.clip(np.array(reliability),1e-7,1-1e-7)
confidence = np.clip(np.array(confidence),1e-7,1-1e-7)
bin_count = np.array(bin_count)
# Remove bins with zero count to avoid division by zero
mask = bin_count != 0
reliability = reliability[mask]
confidence = confidence[mask]
bin_count = bin_count[mask]
# Compute observed and expected values for each bin
observed = reliability * bin_count
expected = confidence * bin_count
# Compute the Hosmer-Lemeshow statistic
chi_squared = np.sum(
(observed - expected) ** 2 / (expected * (1 - expected / bin_count))
)
# Compute degrees of freedom and p-value
num_bins = len(bin_count)
if df is None:
df = num_bins - 2 # Subtract 2 for the parameters estimated in the model
else:
df = df
p_value = 1 - stats.chi2.cdf(chi_squared, df)
return chi_squared, p_value, df
[docs]
def calculate_ece_mce(reliability, confindence, bin_counts):
"""
Calculate Expected Calibration Error (ECE) and Maximum Calibration Error (MCE).
These metrics assess the calibration of a classification model by comparing
predicted probabilities to observed frequencies.
Args:
reliability (array-like): Array of observed frequencies for each bin.
confindence (array-like): Array of predicted probabilities for each bin.
bin_counts (array-like): Array of sample counts in each bin.
Returns:
ece (float): The Expected Calibration Error.
mce (float): The Maximum Calibration Error.
Note:
- ECE is a weighted average of the absolute differences between confidence and reliability.
- MCE is the maximum absolute difference between confidence and reliability across all bins.
- Lower values of both ECE and MCE indicate better calibration.
"""
# Calculate total number of samples
total_count = np.sum(bin_counts)
# Compute absolute difference between reliability and confidence
error = np.abs(reliability - confindence)
# Compute Maximum Calibration Error
mce = np.max(error)
# Compute Expected Calibration Error
ece = np.sum(bin_counts * error) / total_count
return ece, mce
[docs]
def spiegelhalter_z_test(y_true, y_proba, class_to_calculate=1):
"""
Perform Spiegelhalter's Z-test for calibration of probabilistic predictions.
This test assesses whether predicted probabilities are well-calibrated by comparing
them to observed outcomes.
Args:
y_true (array-like): True labels of the samples.
y_proba (array-like): Predicted probabilities for each class. Shape should be (n_samples, n_classes).
class_to_calculate (int): Index of the class to calculate the test for. Default is 1.
Returns:
z_score (float): The z-score of the Spiegelhalter's Z-test.
p_value (float): The p-value associated with the z-score.
Note:
- This test is used to assess the calibration of a classification model.
- A small p-value (typically < 0.05) suggests that the model is poorly calibrated.
- The test assumes that predictions are independent and identically distributed.
"""
# Ensure inputs are numpy arrays and flatten y_true
y_true = np.array(y_true).ravel()
y_proba = np.array(y_proba)
# Convert to binary classification problem
mask = y_true == class_to_calculate
y_true[mask] = 1
y_true[np.logical_not(mask)] = 0
# Calculate components of the test statistic
numerator = np.sum(
(y_true - y_proba[:, class_to_calculate])
* (1 - 2 * y_proba[:, class_to_calculate])
)
denominator = np.sqrt(
np.sum(
((1 - 2 * y_proba[:, class_to_calculate]) ** 2)
* y_proba[:, class_to_calculate]
* (1 - y_proba[:, class_to_calculate])
)
)
# Compute z-score and p-value
z_score = numerator / denominator
p_value = 2 * (1 - stats.norm.cdf(abs(z_score))) # Two-tailed test
return z_score, p_value
[docs]
def cox_regression_analysis(
y_true,
y_proba,
epsilon=1e-7,
class_to_calculate=1,
print_results=False,
fix_intercept=False,
fix_slope=False,
**kwargs,
):
"""
Perform Cox regression analysis for classification calibration.
This function fits a logistic regression model to the logit of predicted probabilities
to assess the calibration of classification predictions.
Args:
y_true (array-like): True binary labels. If multi-class, will be converted to binary.
y_proba (array-like): Predicted probabilities. Should be of shape (n_samples, n_classes).
epsilon (float): Small value to avoid log(0) errors. Default is 1e-7.
class_to_calculate (int): The class to treat as the positive class in binary classification. Default is 1.
print_results (bool): If True, prints the summary of the logistic regression results. Default is False.
fix_intercept (bool): If True, fixes the intercept to 0. Can't be used with fix_slope. Default is False.
fix_slope (bool): If True, fixes the coefficient to 1. Can't be used with fix_intercept. Default is False.
Returns:
coef (float): The coefficient (slope) of the Cox regression.
intercept (float): The intercept of the Cox regression.
coef_ci (tuple): The confidence interval for the coefficient.
intercept_ci (tuple): The confidence interval for the intercept.
Note:
- A well-calibrated model should have a coefficient close to 1 and an intercept close to 0.
- The function clips probabilities to avoid numerical instability.
- For multi-class problems, the function converts the problem to binary classification
based on the specified class_to_calculate.
"""
# Clip probabilities to avoid numerical issues
proba = np.clip(y_proba[:, 1], epsilon, 1 - epsilon)
# Convert to binary classification problem
y_true = np.array(y_true).ravel()
mask = y_true == class_to_calculate
y_true[mask] = 1
y_true[np.logical_not(mask)] = 0
# Calculate logit of probabilities
logit = np.log(proba / (1 - proba))
# Add constant term to predictor variables
X = sm.add_constant(logit)
# Fit the logistic regression model
logit_model = sm.Logit(y_true, X)
# Filter kwargs to only include parameters accepted by fit_constrained
fit_kwargs = {k:v for k,v in kwargs.items() if k in ['method', 'maxiter', 'tol', 'full_output']}
if fix_intercept == True:
with contextlib.redirect_stdout(None):
logit_result = logit_model.fit_constrained("const=0", disp=0, **fit_kwargs)
elif fix_slope == True:
with contextlib.redirect_stdout(None):
logit_result = logit_model.fit_constrained("x1=1", disp=0, **fit_kwargs)
else:
logit_result = logit_model.fit(disp=0, **fit_kwargs)
# Print results if requested
if print_results:
print(logit_result.summary())
# Extract and return coefficients and confidence intervals
intercept = logit_result.params[0]
intercept_ci = logit_result.conf_int()[0, :]
coef = logit_result.params[1]
coef_ci = logit_result.conf_int()[1, :]
return coef, intercept, coef_ci, intercept_ci
[docs]
def cal_ICI_cox(
coef, intercept, y_proba, class_to_calculate=1, epsilon=1e-7, **kwargs
):
"""
Calculate the Integrated Calibration Index (ICI) for a given Cox regression model.
The ICI measures the average absolute difference between the predicted probabilities
and the probabilities transformed by the fitted Cox regression model.
Args:
coef (float): The coefficient (slope) from the Cox regression.
intercept (float): The intercept from the Cox regression.
y_proba (array-like): Predicted probabilities. Should be of shape (n_samples, n_classes).
class_to_calculate (int): The class to calculate the ICI for in multi-class problems. Default is 1.
epsilon (float): Small value to avoid numerical instability when clipping probabilities. Default is 1e-7.
Returns:
ICI (float): The Integrated Calibration Index.
Note:
- Lower ICI values indicate better calibration.
- The function applies the inverse logit transformation to the predicted probabilities
using the coefficients from the Cox regression.
"""
# Define the transformation function based on Cox regression results
func = lambda x: 1 / (1 + np.exp(-(coef * np.log(x / (1 - x)) + intercept)))
# Clip probabilities to avoid numerical issues
proba_clipped = np.clip(y_proba[:, class_to_calculate], epsilon, 1 - epsilon)
# Apply the transformation function
transformed_proba = func(proba_clipped)
# Calculate the ICI
ICI = np.mean(np.abs(transformed_proba - proba_clipped))
return ICI
[docs]
def lowess_regression_analysis(
y_true,
y_proba,
epsilon=1e-7,
class_to_calculate=1,
span=0.5,
delta=0.001,
it=0,
**kwargs,
):
"""
Perform Lowess regression analysis for classification calibration.
This function applies Locally Weighted Scatterplot Smoothing (LOWESS) to assess
the calibration of classification predictions.
Args:
y_true (array-like): True binary labels. If multi-class, will be converted to binary.
y_proba (array-like): Predicted probabilities. Should be of shape (n_samples, n_classes).
epsilon (float, optional): Small value to avoid numerical instability when clipping probabilities. Defaults to 1e-10.
class_to_calculate (int, optional): The class to treat as the positive class in binary classification. Defaults to 1.
span (float, optional): The fraction of the data used when estimating each y-value. Defaults to 0.5.
delta (float, optional): Distance within which to use linear-interpolation instead of weighted regression. Defaults to 0.001.
it (int, optional): The number of residual-based reweightings to perform. Defaults to 0.
Returns:
ICI (float): The Integrated Calibration Index.
sorted_proba (array-like): Sorted predicted probabilities.
smoothed_proba (array-like): Corresponding LOWESS-smoothed actual probabilities.
Note:
- The function clips probabilities to avoid numerical instability.
- For multi-class problems, the function converts the problem to binary classification
based on the specified class_to_calculate.
- The Integrated Calibration Index (ICI) provides a measure of calibration error,
with lower values indicating better calibration.
"""
# Clip probabilities to avoid numerical issues
proba = np.clip(y_proba[:, class_to_calculate], epsilon, 1 - epsilon)
# Convert to binary classification problem
y_true = np.array(y_true).ravel()
mask = y_true == class_to_calculate
y_true[mask] = 1
y_true[np.logical_not(mask)] = 0
# Perform LOWESS regression
lowess_fit = lowess.lowess(
y_true,
y_proba[:, class_to_calculate],
frac=span,
delta=delta,
it=it,
return_sorted=True,
)
# Calculate the Integrated Calibration Index (ICI)
ICI = np.abs(lowess_fit[:, 0] - lowess_fit[:, 1]).mean()
return ICI, lowess_fit[:, 0], lowess_fit[:, 1]
[docs]
def cal_ICI_func(func, y_proba, class_to_calculate=1):
"""
Calculate the Integrated Calibration Index (ICI) for a given calibration function.
Args:
func (callable): The calibration function to evaluate.
y_proba (array-like): Predicted probabilities for each class. Shape (n_samples, n_classes).
class_to_calculate (int, optional): The class index to calculate the ICI for. Defaults to 1.
Returns:
float: The Integrated Calibration Index (ICI) value.
Note:
The ICI is calculated by calculating the mean absolute difference between
predicted probabilities and the calibration function evaluated at predicted probabilities.
"""
# Apply the calibration function
y_adjust = func(y_proba[:, class_to_calculate])
# Calculate and return the ICI
return np.mean(np.abs(y_adjust - y_proba[:, class_to_calculate]))
[docs]
def cal_ICI(y_adjust, y_proba):
"""
Calculate the Integrated Calibration Index (ICI) for given adjusted probabilities.
Args:
y_adjust (array-like): Adjusted probabilities. Shape (n_samples,).
y_proba (array-like): Original predicted probabilities. Shape (n_samples,).
Returns:
float: The Integrated Calibration Index (ICI) value.
Note:
The ICI is calculated by calculating the mean absolute difference between
predicted probabilities and the adjusted probabilities.
"""
# Calculate and return the ICI
return np.mean(np.abs(y_adjust - y_proba))
[docs]
def logit_func(coef, intercept):
"""
Create a logistic function with given coefficient and intercept.
Args:
coef (float): The coefficient (slope) of the logistic function.
intercept (float): The intercept of the logistic function.
Returns:
callable: A function that takes an input x and returns the logistic function value.
Note:
The returned function applies the logistic transformation:
f(x) = 1 / (1 + exp(-(coef * log(x / (1 - x)) + intercept)))
"""
return lambda x: 1 / (1 + np.exp(-(coef * np.log(x / (1 - x)) + intercept)))
[docs]
def get_CI(result, alpha=0.05):
"""
Calculate confidence intervals for each field in the result.
Args:
result (numpy.ndarray): Structured array containing the results for which to calculate confidence intervals.
alpha (float, optional): The significance level for the confidence interval calculation. Defaults to 0.05.
Returns:
dict: A dictionary where keys are field names from the input array and values
are tuples containing the lower and upper bounds of the confidence interval.
Note:
This function calculates percentile-based confidence intervals for each field
in the input structured array. It's useful for bootstrap or Monte Carlo simulations.
"""
CI = {}
for key in result.dtype.names:
CI[key] = (
np.percentile(result[key], 100 * (alpha / 2)),
np.percentile(result[key], 100 * (1 - alpha / 2)),
)
return CI
[docs]
class CalibrationMetrics:
"""
A class for calculating calibration metrics for classification models.
"""
[docs]
def __init__(self, class_to_calculate=1, num_bins=10):
"""
Initialize the CalibrationMetrics class.
Args:
class_to_calculate (int, optional): The class index to calculate the metrics for. Defaults to 1.
num_bins (int, optional): Number of bins to use for the ECE/MCE/HL calculations. Defaults to 10.
"""
self.class_to_calculate = class_to_calculate
self.num_bins = num_bins
[docs]
def calculate_metrics(
self,
y_true,
y_proba,
metrics,
perform_pervalance_adjustment=False,
return_numpy=False,
**kwargs,
):
"""
Calculate the specified calibration metrics.
This function computes various calibration metrics for binary classification models.
It supports multiple metrics and can perform prevalence adjustment if needed.
List of available metrics:
* SpiegelhalterZ: Spiegelhalter's Z-test for calibration
* ECE-H: Expected Calibration Error with equal-space binning
* MCE-H: Maximum Calibration Error with equal-space binning
* HL-H: Hosmer-Lemeshow test with equal-space binning
* ECE-C: Expected Calibration Error with equal-count binning
* MCE-C: Maximum Calibration Error with equal-count binning
* HL-C: Hosmer-Lemeshow test with equal-count binning
* COX: Cox regression analysis
* Loess: Locally Estimated Scatterplot Smoothing regression analysis
Args:
y_true (numpy.ndarray): True labels.
y_proba (numpy.ndarray): Predicted probabilities.
metrics (list of str or 'all'): List of metric names to calculate. If 'all', calculates all available metrics.
perform_pervalance_adjustment (bool, optional): Whether to perform prevalence adjustment. Defaults to False.
return_numpy (bool, optional): Whether to return the results as a numpy array. Defaults to False.
**kwargs: Additional keyword arguments to pass to the metric calculation functions.
Returns:
dict or numpy.ndarray: A dictionary containing the calculated metrics, or a numpy array if return_numpy is True.
"""
# Initialize results dictionary and set up metrics list
if metrics == "all":
metrics = [
"SpiegelhalterZ",
"ECE-H",
"MCE-H",
"HL-H",
"ECE-C",
"MCE-C",
"HL-C",
"COX",
"Loess",
]
results = {}
precompute_H = False
precompute_C = False
# Perform prevalence adjustment if requested
if perform_pervalance_adjustment:
optimal_prevalance, y_proba = find_optimal_prevalence(
y_true=y_true,
y_proba=y_proba,
class_to_calculate=self.class_to_calculate,
)
class_to_calculate = 1
results["Optimal prevalence"] = optimal_prevalance
else:
class_to_calculate = self.class_to_calculate
# Calculate each requested metric
for metric in metrics:
# Spiegelhalter's Z-test
if metric == "SpiegelhalterZ":
score, p_value = spiegelhalter_z_test(
y_true=y_true,
y_proba=y_proba,
class_to_calculate=class_to_calculate,
)
results["SpiegelhalterZ score"] = score
results["SpiegelhalterZ p-value"] = p_value
# Metrics using equal-space binning
elif metric in ["ECE-H", "MCE-H", "HL-H"]:
if not precompute_H:
# Precompute equal-space binning results for efficiency
acc_H, confidence_H, bin_edges_H, bin_count_H = reliability_diagram(
y_true=y_true,
y_proba=y_proba,
num_bins=self.num_bins,
is_equal_freq=False,
)
(
acc_H_class,
confidence_H_class,
bin_edges_H_class,
bin_count_H_class,
) = reliability_diagram(
y_true=y_true,
y_proba=y_proba,
num_bins=self.num_bins,
is_equal_freq=False,
class_to_plot=class_to_calculate,
)
precompute_H = True
if metric == "ECE-H":
ece_h, mce_h = calculate_ece_mce(acc_H, confidence_H, bin_count_H)
ece_h_class, mce_h_class = calculate_ece_mce(
acc_H_class, confidence_H_class, bin_count_H_class
)
results["ECE-H topclass"] = ece_h
results["ECE-H"] = ece_h_class
elif metric == "MCE-H":
ece_h, mce_h = calculate_ece_mce(acc_H, confidence_H, bin_count_H)
ece_h_class, mce_h_class = calculate_ece_mce(
acc_H_class, confidence_H_class, bin_count_H_class
)
results["MCE-H topclass"] = mce_h
results["MCE-H"] = mce_h_class
elif metric == "HL-H":
hl_h_score, hl_h, _ = hosmer_lemeshow_test(
acc_H_class, confidence_H_class, bin_count_H_class, **kwargs
)
results["HL-H score"] = hl_h_score
results["HL-H p-value"] = hl_h
# Metrics using equal-count binning
elif metric in ["ECE-C", "MCE-C", "HL-C"]:
if not precompute_C:
# Precompute equal-count binning results for efficiency
acc_C, confidence_C, bin_edges_C, bin_count_C = reliability_diagram(
y_true=y_true,
y_proba=y_proba,
num_bins=self.num_bins,
is_equal_freq=True,
)
(
acc_C_class,
confidence_C_class,
bin_edges_C_class,
bin_count_C_class,
) = reliability_diagram(
y_true=y_true,
y_proba=y_proba,
num_bins=self.num_bins,
is_equal_freq=True,
class_to_plot=class_to_calculate,
)
precompute_C = True
if metric == "ECE-C":
ece_c, mce_c = calculate_ece_mce(acc_C, confidence_C, bin_count_C)
ece_c_class, mce_c_class = calculate_ece_mce(
acc_C_class, confidence_C_class, bin_count_C_class
)
results["ECE-C topclass"] = ece_c
results["ECE-C"] = ece_c_class
elif metric == "MCE-C":
ece_c, mce_c = calculate_ece_mce(acc_C, confidence_C, bin_count_C)
ece_c_class, mce_c_class = calculate_ece_mce(
acc_C_class, confidence_C_class, bin_count_C_class
)
results["MCE-C topclass"] = mce_c
results["MCE-C"] = mce_c_class
elif metric == "HL-C":
hl_c_score, hl_c, _ = hosmer_lemeshow_test(
acc_C_class, confidence_C_class, bin_count_C_class, **kwargs
)
results["HL-C score"] = hl_c_score
results["HL-C p-value"] = hl_c
# Cox regression analysis
elif metric == "COX":
coef, intercept, coef_ci, intercept_ci = cox_regression_analysis(
y_true=y_true, y_proba=y_proba, **kwargs
)
results["COX coef"] = coef
results["COX intercept"] = intercept
results["COX coef lowerci"] = coef_ci[0]
results["COX coef upperci"] = coef_ci[1]
results["COX intercept lowerci"] = intercept_ci[0]
results["COX intercept upperci"] = intercept_ci[1]
ICI = cal_ICI_cox(
coef,
intercept,
y_proba,
class_to_calculate=class_to_calculate,
**kwargs,
)
results["COX ICI"] = ICI
# Loess regression analysis
elif metric == "Loess":
loess_ICI, _, _ = lowess_regression_analysis(
y_true, y_proba, class_to_calculate=class_to_calculate, **kwargs
)
results["Loess ICI"] = loess_ICI
# Convert results to numpy array if requested
if return_numpy:
results = np.array(list(results.values()))
return results
[docs]
def bootstrap(
self,
y_true,
y_proba,
metrics,
perform_pervalance_adjustment=False,
n_samples=1000,
**kwargs,
):
"""
Run bootstrap and return a numpy structured array with correct field names.
This function performs bootstrap resampling to estimate the distribution of calibration metrics.
It generates multiple samples with replacement from the input data and calculates the specified
metrics for each sample.
Args:
y_true (array-like): True labels.
y_proba (array-like): Predicted probabilities.
metrics (list of str): List of metric names to calculate.
perform_pervalance_adjustment (bool, optional): Whether to perform prevalence adjustment for each bootstrap sample. Defaults to False.
n_samples (int, optional): Number of bootstrap samples to generate. Defaults to 1000.
**kwargs: Additional keyword arguments to pass to the metric calculation functions.
Returns:
numpy.ndarray: A structured array containing the bootstrapped metrics. Each field in the array
corresponds to a metric, and each row represents a bootstrap sample.
"""
bootstrap_results = []
n_instances = len(y_true)
# Generate bootstrap samples and calculate metrics
for i in range(n_samples):
# Sample with replacement
indices = np.random.choice(n_instances, n_instances, replace=True)
y_true_sample = y_true[indices]
y_proba_sample = y_proba[indices]
# Calculate metrics for the first sample to get the keys
if i == 0:
sample_results = self.calculate_metrics(
y_true_sample,
y_proba_sample,
metrics,
perform_pervalance_adjustment=perform_pervalance_adjustment,
return_numpy=False,
**kwargs,
)
keys = sample_results.keys()
sample_results = np.array(list(sample_results.values()))
else:
sample_results = self.calculate_metrics(
y_true_sample,
y_proba_sample,
metrics,
perform_pervalance_adjustment=perform_pervalance_adjustment,
return_numpy=True,
**kwargs,
)
bootstrap_results.append(sample_results)
# Create a structured array with correct field names
dtype = [(key, float) for key in keys]
unstructured_results = np.array(bootstrap_results)
structured_results = rf.unstructured_to_structured(
unstructured_results, dtype=dtype
)
return structured_results
[docs]
def optimal_prevalence_adjustment(self, y_true, y_proba):
"""
Perform optimal prevalence adjustment and return adjusted probabilities.
This function finds the optimal prevalence value that minimizes the difference
between the predicted and actual positive rates, and then adjusts the input
probabilities accordingly.
Args:
y_true (array-like): True labels.
y_proba (array-like): Predicted probabilities.
Returns:
optimal_prevalence (float): Optimal prevalence value.
adjusted_proba (array-like): Adjusted probabilities. First column is the adjusted probabilities for the other class,
second column is the adjusted probabilities for the class of interest.
"""
optimal_prevalence, adjusted_proba = find_optimal_prevalence(
y_true, y_proba, class_to_calculate=self.class_to_calculate
)
return optimal_prevalence, adjusted_proba