Source code for InnerEye.ML.visualizers.reliability_curve

#  ------------------------------------------------------------------------------------------
#  Copyright (c) Microsoft Corporation. All rights reserved.
#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
#  ------------------------------------------------------------------------------------------
from typing import List, Union

import matplotlib.pyplot as plt
import numpy as np
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss


def plot_reliability_curve(
        y_predict: Union[List[np.ndarray], np.ndarray],
        y_true: Union[List[np.ndarray], np.ndarray],
        num_bins: int = 15,
        normalise: bool = False) -> None:
    """
    Plots reliability curves for one or more models to observe model calibration errors.
    The upper panel shows the fraction of positives against the mean predicted value per bin
    (with the Brier score of each model in the legend), the lower panel shows a histogram of
    the predictions. Inputs can be either a single array or a list of arrays, where each list
    element holds the data of one model.

    All inputs are validated before the figure is created, so invalid input does not leave
    a partially drawn figure behind.

    :param y_predict: Model predictions, either a 1D array (num_samples) or a list of 1D arrays (one per model).
    :param y_true: Target values {0, 1}, either a 1D array (num_samples) or a list of 1D arrays (one per model).
                   Assuming a binary classification case. Samples whose target is NaN are dropped,
                   together with the matching predictions.
    :param num_bins: Number of bins used for model prediction probabilities.
    :param normalise: If set to true, the predictions of each model are min-max normalised to range [0, 1].
    :raises ValueError: If the number of prediction and target arrays differs, if predictions and targets
                        of a model differ in shape, if a model has no sample with a non-NaN target, or if
                        normalisation is requested for predictions that are all identical.

    References
    [1] Predicting Good Probabilities with Supervised Learning
    <https://www.cs.cornell.edu/~alexn/papers/calibration.icml05.crc.rev3.pdf>
    """

    # Wrap each argument on its own: wrapping y_true only when y_predict is not a list would
    # double-wrap a y_true that already is a list, or leave a bare y_true array unwrapped.
    if not isinstance(y_predict, list):
        y_predict = [y_predict]
    if not isinstance(y_true, list):
        y_true = [y_true]
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict are not of the same length")

    # Validate and clean the data of all models before any plotting takes place.
    cleaned_predictions: List[np.ndarray] = []
    cleaned_labels: List[np.ndarray] = []
    for model_id, (predictions, labels) in enumerate(zip(y_predict, y_true)):
        predictions = np.asarray(predictions)
        labels = np.asarray(labels)

        # The shapes must be compared before masking: the boolean mask is derived from the labels
        # and can only be applied to predictions of the same shape.
        if predictions.shape != labels.shape:
            raise ValueError("Target label and predictions are not of same shape")

        # Remove samples with a nan label from both sets
        mask = ~np.isnan(labels)
        predictions = predictions[mask]
        labels = labels[mask]
        if labels.size == 0:
            raise ValueError(f"Model_{model_id} has no samples with a valid (non-NaN) target label")

        if normalise:
            lowest = predictions.min()
            value_range = predictions.max() - lowest
            # Identical predictions would lead to a division by zero and NaN values downstream.
            if value_range == 0:
                raise ValueError(f"Predictions of Model_{model_id} are all identical and cannot be normalised")
            predictions = (predictions - lowest) / value_range

        cleaned_predictions.append(predictions)
        cleaned_labels.append(labels)

    # Generate the figure and axes: calibration curves on top (2/3 of the height), histogram below.
    plt.figure(0, figsize=(6, 6))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))
    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    # Iterate over all models and plot their calibration curve and prediction histogram
    for model_id, (predictions, labels) in enumerate(zip(cleaned_predictions, cleaned_labels)):
        model_name = f"Model_{model_id}"
        # The largest label value present is treated as the positive class for the Brier score.
        # noinspection PyArgumentList
        clf_score = brier_score_loss(labels, predictions, pos_label=labels.max())
        frac_of_positives, mean_predicted_value = calibration_curve(labels, predictions, n_bins=num_bins)
        ax1.plot(mean_predicted_value, frac_of_positives, "s-", label=f"{model_name} ({clf_score:1.3f})")
        ax2.hist(predictions, range=(0, 1), bins=num_bins, label=model_name, histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title('Calibration plots (reliability curve)')
    ax1.grid()

    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="upper center", ncol=2)
    ax2.grid()

    plt.tight_layout()
    plt.show()