# Source code for InnerEye.ML.utils.csv_util

#  ------------------------------------------------------------------------------------------
#  Copyright (c) Microsoft Corporation. All rights reserved.
#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
#  ------------------------------------------------------------------------------------------
import logging
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

from InnerEye.Common.metrics_constants import MetricsFileColumns

# Column header names for CSV files handled by this code base.
# NOTE(review): presumably these describe the dataset CSV schema -- confirm against the code that reads them.
CSV_FEATURE_HEADER: str = "feature"
CSV_DATE_HEADER: str = "acquisition_date"
CSV_SUBJECT_HEADER: str = "subject"
CSV_PATH_HEADER: str = "filePath"
CSV_CHANNEL_HEADER: str = "channel"
CSV_INSTITUTION_HEADER: str = "institutionId"
CSV_SERIES_HEADER: str = "seriesId"
CSV_TAGS_HEADER: str = "tags"

# Name of the Dice column, taken from MetricsFileColumns. Used as the default outlier column by
# extract_outliers and get_worst_performing_outliers.
COL_DICE = MetricsFileColumns.Dice.value
# Column that get_worst_performing_outliers removes from the outlier table before sorting.
COL_SPLIT = "split"
# Column written by mark_outliers: "Yes" for outlier rows, empty string for all other rows.
COL_IS_OUTLIER = "is_outlier"


class OutlierType(Enum):
    """
    Side of the accepted value range on which an outlier lies.
    """
    # Values above the accepted range.
    HIGH = "High"
    # Values below the accepted range.
    LOW = "Low"


def load_csv(csv_path: Path, expected_cols: List[str], col_type_converters: Optional[Dict[str, Any]] = None
             ) -> pd.DataFrame:
    """
    Read a CSV file into a pandas DataFrame and verify that all required columns are present.

    :param csv_path: Location of the CSV file to read.
    :param expected_cols: Names of the columns that must, at a minimum, exist in the file. Must not be empty.
    :param col_type_converters: Optional mapping of column name to converter. It is passed to `pd.read_csv` as
        the `converters` argument, so that those columns are parsed in a specific way.
    :return: The DataFrame that was read from the file.
    :raises ValueError: If `expected_cols` is empty, if the file has no data rows, or if any of the expected
        columns is missing from the file.
    :raises FileNotFoundError: If `csv_path` does not point to an existing file.
    """
    # Validate the arguments before touching the file contents.
    if not expected_cols:
        raise ValueError("You must provide a list of at least one of the expected column headings of your CSV.")
    if not csv_path.is_file():
        raise FileNotFoundError("No CSV file exists at this location: {0}".format(csv_path))

    frame = pd.read_csv(csv_path, converters=col_type_converters)
    if frame.shape[0] == 0:
        raise ValueError("Dataset at {0} contains no values".format(csv_path))

    # Every expected header must be among the headers actually found in the file.
    found_cols = list(frame.columns)
    all_present = set(expected_cols) <= set(found_cols)
    if not all_present:
        raise ValueError("CSV should at least contain the columns {0} but found {1}".format(expected_cols, found_cols))
    return frame


def drop_rows_missing_important_values(df: pd.DataFrame, important_cols: List[str]) -> pd.DataFrame:
    """
    Remove rows from the DataFrame in which the columns that have been specified by the user as "important" contain
    null values or only whitespace. The input DataFrame is not modified; a new DataFrame is returned.

    Note that empty and whitespace-only cells are converted to NaN in ALL columns of the returned DataFrame,
    not only in the important ones. Only the important columns decide whether a row is dropped.

    :param df: DataFrame
    :param important_cols: Columns which must not contain null values
    :return: df: DataFrame without the dropped rows.
    """
    # Turn empty / whitespace-only cells into NaN, so that dropna treats them as missing values.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    before_len = len(df)
    df = df.dropna(subset=important_cols)
    # Rows can only be removed by dropna, hence the count is "before minus after" (never negative).
    num_dropped = before_len - len(df)
    if num_dropped > 0:
        logging.info("Dropping {0} rows from the data set since they are missing values from one of the columns: {1}"
                     .format(num_dropped, important_cols))
    return df


def extract_outliers(df: pd.DataFrame, outlier_range: float, outlier_col: str = COL_DICE,
                     outlier_type: OutlierType = OutlierType.LOW) -> pd.DataFrame:
    """
    Select the rows of a DataFrame whose value in `outlier_col` lies outside of mean +- outlier_range * std,
    on the side chosen by `outlier_type`. Mean and standard deviation are computed over `outlier_col`.

    :param df: DataFrame from which to extract the outliers
    :param outlier_range: Number of standard deviations that a value has to be away from the mean to count as
        an outlier. Must not be negative.
    :param outlier_col: The column from which to calculate outliers, e.g. Dice
    :param outlier_type: LOW selects rows strictly below mean - outlier_range * std, HIGH selects rows strictly
        above mean + outlier_range * std.
    :return: DataFrame containing only the outlier rows
    :raises ValueError: If `outlier_range` is negative, or if `outlier_type` is neither LOW nor HIGH.
    """
    # Reject invalid arguments before reading any data from the DataFrame.
    if outlier_range < 0:
        raise ValueError("outlier_range must be non-negative. Found: {}".format(outlier_range))
    if outlier_type not in (OutlierType.LOW, OutlierType.HIGH):
        raise ValueError(f"Outlier type must be one of LOW or HIGH. Received {outlier_type}")

    values = df[outlier_col]
    centre = values.mean()
    margin = outlier_range * values.std()
    if outlier_type == OutlierType.LOW:
        selected = values < centre - margin
    else:
        selected = values > centre + margin
    return df[selected]


def mark_outliers(df: pd.DataFrame,
                  outlier_range: float,
                  outlier_col: str,
                  high_values_are_good: bool) -> pd.DataFrame:
    """
    Flag outlier rows in a DataFrame by writing a column "is_outlier": the column holds "Yes" for rows that are
    outliers, and an empty string for all other rows. The column is added to the DataFrame that is passed in
    (the argument is modified in place), and that same DataFrame is returned.
    A row is an outlier if its value in `outlier_col` lies outside of mean +- outlier_range * std, on the
    "bad" side as given by `high_values_are_good`.

    :param df: DataFrame in which to mark the outliers
    :param outlier_range: Number of standard deviations that a value has to be away from the mean to count as
        an outlier. Must not be negative.
    :param outlier_col: The column from which to calculate outliers, e.g. Dice
    :param high_values_are_good: If True, rows strictly below mean - outlier_range * std are marked as outliers.
        If False, rows strictly above mean + outlier_range * std are marked as outliers.
    :return: The input DataFrame, with the additional column `is_outlier`
    :raises ValueError: If `outlier_range` is negative.
    """
    if outlier_range < 0:
        raise ValueError("outlier_range must be non-negative. Found: {}".format(outlier_range))

    values = df[outlier_col]
    centre = values.mean()
    spread = values.std()
    # Only the "bad" tail of the distribution is flagged.
    flagged = (values < centre - outlier_range * spread
               if high_values_are_good
               else values > centre + outlier_range * spread)
    df[COL_IS_OUTLIER] = ["Yes" if hit else "" for hit in flagged]
    return df


def get_worst_performing_outliers(df: pd.DataFrame,
                                  outlier_range: float,
                                  outlier_col_name: str = COL_DICE,
                                  max_n_outliers: Optional[int] = None) -> List[Tuple[int, str, float, str]]:
    """
    Returns a sorted list (worst to best) of all the worst performing outliers in the metrics table
    according to metric provided by outlier_col_name. Outliers are the rows that extract_outliers selects with
    its default outlier type (values below the accepted range), sorted by ascending metric value.

    Apart from `outlier_col_name`, the DataFrame must have the columns `Patient`, `Structure` and `seriesId`.

    :param df: Metrics DataFrame
    :param outlier_range: The standard deviation from the mean which the points have to be below
        to be considered an outlier.
    :param outlier_col_name: The column by which to determine outliers
    :param max_n_outliers: the number of (worst performing) outlier IDs to return. If None, all outliers are
        returned.
    :return: a sorted list (worst to best) of all the worst performing outliers. Each entry is a tuple
        (patient ID, structure name, metric value, series ID).
    :raises ValueError: If `outlier_col_name` is not a column of the DataFrame.
    """
    if outlier_col_name not in df.columns:
        raise ValueError(f"Column {outlier_col_name} is not present in DataFrame columns: {df.columns.tolist()}")

    # The split column is not part of the result. errors="ignore" makes sure that a metrics table without
    # that column does not fail with a KeyError here.
    outlier_df = extract_outliers(df, outlier_range, outlier_col=outlier_col_name) \
        .drop([COL_SPLIT], axis=1, errors="ignore")
    # Ascending order puts the lowest metric values first.
    sorted_outlier_df = outlier_df.sort_values(by=outlier_col_name, ascending=True)

    ids_and_structures = list(zip(sorted_outlier_df.Patient.values.astype(int),
                                  sorted_outlier_df.Structure.values.astype(str),
                                  sorted_outlier_df[outlier_col_name].values.astype(float),
                                  sorted_outlier_df.seriesId.values.astype(str)))

    if max_n_outliers is not None:
        return ids_and_structures[:max_n_outliers]
    return ids_and_structures