# Source code for InnerEye.ML.visualizers.reliability_curve

#  ------------------------------------------------------------------------------------------
#  Copyright (c) Microsoft Corporation. All rights reserved.
#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
#  ------------------------------------------------------------------------------------------
from typing import List, Union

import matplotlib.pyplot as plt
import numpy as np
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss


def _prepare_model_arrays(predictions: np.ndarray, labels: np.ndarray, normalise: bool):
    """
    Validates and cleans the predictions/labels of a single model before plotting.

    :param predictions: Model predictions for one model.
    :param labels: Target values for the same model; entries that are NaN are dropped
        together with the predictions at the same positions.
    :param normalise: If set to true, the remaining predictions are min-max scaled to [0, 1].
    :return: A tuple (predictions, labels) holding the cleaned arrays.
    :raises ValueError: If the two arrays differ in shape, if no label is left after removing
        NaN targets, or if normalisation is requested but the prediction range is zero or undefined.
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # The shape check has to come before masking: indexing the predictions with a mask
    # built from differently shaped labels fails with an IndexError, and after masking
    # both arrays trivially have the same shape.
    if predictions.shape != labels.shape:
        raise ValueError("Target label and predictions are not of same shape")

    # Drop the positions where the target label is NaN from both arrays.
    valid = ~np.isnan(labels)
    predictions = predictions[valid]
    labels = labels[valid]
    if labels.size == 0:
        raise ValueError("No valid (non-NaN) target labels are left to plot")

    if normalise:
        lowest = predictions.min()
        spread = predictions.max() - lowest
        # A zero (or NaN) spread would turn every prediction into NaN via 0/0.
        if not spread > 0:
            raise ValueError("Cannot normalise predictions: the prediction range is zero or undefined")
        predictions = (predictions - lowest) / spread

    return predictions, labels


def plot_reliability_curve(
        y_predict: Union[List[np.ndarray], np.ndarray],
        y_true: Union[List[np.ndarray], np.ndarray],
        num_bins: int = 15,
        normalise: bool = False) -> None:
    """
    Plots reliability curves for multiple models to observe model calibration errors.
    Inputs can be either 1-D or a list of 1-D arrays depending on the use case.
    List elements are intended to be used for different model types, e.g. y_predict: (num_samples, num_models)

    The upper axes show one calibration curve per model (fraction of positives against mean
    predicted value) next to the diagonal of a perfectly calibrated model; each legend entry
    carries the Brier score of that model. The lower axes show a histogram of the predictions
    of each model. The figure is displayed via ``plt.show()``.

    :param y_predict: Model predictions, either a 1D array (num_samples) or list of 1D arrays
        (num_samples, num_models)
    :param y_true: Target values {0, 1} either a 1D array (num_samples) or list of 1D arrays
        (num_samples, num_models). Assuming a binary classification case. Samples whose target
        is NaN are ignored.
    :param num_bins: Number of bins used for model prediction probabilities.
    :param normalise: If set to true, predictions are normalised to range [0, 1]
    :raises ValueError: If y_true and y_predict contain a different number of models, if the
        arrays of a model differ in shape, if a model has no non-NaN targets, or if normalisation
        is requested for predictions with a zero or undefined range.

    References
    [1] Predicting Good Probabilities with Supervised Learning
    <https://www.cs.cornell.edu/~alexn/papers/calibration.icml05.crc.rev3.pdf>
    """
    # A single array is treated as a one-model list; y_true is wrapped alongside it.
    if not isinstance(y_predict, list):
        y_predict = [y_predict]
        y_true = [y_true]

    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict are not of the same length")

    # Validate and clean the data of every model before touching matplotlib, so that
    # invalid input never leaves a partially drawn figure behind.
    prepared = [_prepare_model_arrays(predictions, labels, normalise)
                for predictions, labels in zip(y_predict, y_true)]

    # Generate the figure and axes
    plt.figure(0, figsize=(6, 6))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    # Iterate over all models and plot their calibration curve and prediction histogram
    for model_id, (predictions, labels) in enumerate(prepared):
        model_name = f"Model_{model_id}"
        # noinspection PyArgumentList
        clf_score = brier_score_loss(labels, predictions, pos_label=labels.max())
        frac_of_positives, mean_predicted_value = calibration_curve(labels, predictions, n_bins=num_bins)

        ax1.plot(mean_predicted_value, frac_of_positives, "s-",
                 label=f"{model_name} ({clf_score:1.3f})")
        ax2.hist(predictions, range=(0, 1), bins=num_bins, label=model_name,
                 histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title('Calibration plots (reliability curve)')
    ax1.grid()

    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="upper center", ncol=2)
    ax2.grid()

    plt.tight_layout()
    plt.show()