# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import logging
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
from InnerEye.Common.metrics_constants import MetricsFileColumns
# Column header names, presumably for the dataset CSV files read via load_csv - TODO confirm against callers.
CSV_FEATURE_HEADER: str = "feature"
CSV_DATE_HEADER: str = "acquisition_date"
CSV_SUBJECT_HEADER: str = "subject"
CSV_PATH_HEADER: str = "filePath"
CSV_CHANNEL_HEADER: str = "channel"
CSV_INSTITUTION_HEADER: str = "institutionId"
CSV_SERIES_HEADER: str = "seriesId"
CSV_TAGS_HEADER: str = "tags"
# Name of the Dice column, taken from the shared metrics file column definitions.
COL_DICE = MetricsFileColumns.Dice.value
# NOTE(review): not used in the visible part of this file; presumably the column holding the
# dataset split of a row - confirm against callers.
COL_SPLIT = "split"
# Name of the column that mark_outliers adds to a DataFrame ("Yes" for outliers, "" otherwise).
COL_IS_OUTLIER = "is_outlier"
class OutlierType(Enum):
    """
    The two directions in which a value can be an outlier: unusually high or unusually low.
    """
    HIGH = "High"
    LOW = "Low"
def load_csv(csv_path: Path, expected_cols: List[str], col_type_converters: Optional[Dict[str, Any]] = None
             ) -> pd.DataFrame:
    """
    Load a pandas DataFrame from a CSV file and verify that it contains at least the expected columns.

    :param csv_path: Path to the CSV file.
    :param expected_cols: A list of the columns which must, as a minimum, be present. Must not be empty.
    :param col_type_converters: Dictionary of column: converter, passed to `pd.read_csv` as `converters`, which
        ensures that certain DataFrame columns are parsed with specific types.
    :return: The loaded pandas DataFrame.
    :raises ValueError: If `expected_cols` is empty, if the CSV file contains no data rows, or if any of the
        expected columns is missing from the file.
    :raises FileNotFoundError: If `csv_path` does not point to an existing file.
    """
    if not expected_cols:
        raise ValueError("You must provide a list of at least one of the expected column headings of your CSV.")
    if not csv_path.is_file():
        raise FileNotFoundError("No CSV file exists at this location: {0}".format(csv_path))
    df = pd.read_csv(csv_path, converters=col_type_converters)
    # A file that has a header line but no data rows is treated as an error.
    if len(df) == 0:
        raise ValueError("Dataset at {0} contains no values".format(csv_path))
    # Check that all of the expected column headers are present in the CSV.
    actual_cols = list(df.columns)
    if not set(expected_cols).issubset(actual_cols):
        raise ValueError("CSV should at least contain the columns {0} but found {1}".format(expected_cols, actual_cols))
    return df
def drop_rows_missing_important_values(df: pd.DataFrame, important_cols: List[str]) -> pd.DataFrame:
    """
    Remove rows from the DataFrame in which the columns that have been specified by the user as "important" contain
    null values or only whitespace. The input DataFrame is not modified. Note that in the returned DataFrame,
    empty or whitespace-only cells in any column (not only the important ones) are replaced by NaN.

    :param df: DataFrame
    :param important_cols: Columns which must not contain null values
    :return: df: DataFrame without the dropped rows.
    """
    # Turn empty / whitespace-only cells into NaN so that dropna treats them as missing.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    before_len = len(df)
    df = df.dropna(subset=important_cols)
    # Rows before minus rows after: a positive number whenever something was dropped.
    num_dropped = before_len - len(df)
    if num_dropped > 0:
        logging.info("Dropping {0} rows from the data set since they are missing values from one of the columns: {1}"
                     .format(num_dropped, important_cols))
    return df
def mark_outliers(df: pd.DataFrame,
                  outlier_range: float,
                  outlier_col: str,
                  high_values_are_good: bool) -> pd.DataFrame:
    """
    Given a DataFrame, add a column "is_outlier" that contains "Yes" for all rows that are considered outliers.
    Rows that are not considered outliers have an empty string in the new column.
    Outliers are the rows whose value in the column `outlier_col` lies on the "bad" side of the mean, more than
    outlier_range * std away from it. Only one side is ever marked, chosen by `high_values_are_good`.
    The column is added to the DataFrame that is passed in (the argument is modified in place), and that same
    DataFrame is returned.

    :param df: DataFrame in which to mark the outliers
    :param outlier_range: The number of standard deviations from the mean which the points have to be apart
        to be considered an outlier, i.e. a point is considered an outlier if its outlier_col value is below
        mean - outlier_range * std (if high_values_are_good is True) or above mean + outlier_range * std
        (if high_values_are_good is False). Must be non-negative.
    :param outlier_col: The column from which to calculate outliers, e.g. Dice
    :param high_values_are_good: If True, high values for the metric are considered good, and hence low values
        are marked as outliers. If False, low values are considered good, and high values are marked as outliers.
    :return: DataFrame with an additional column `is_outlier`
    :raises ValueError: If outlier_range is negative.
    """
    if outlier_range < 0:
        raise ValueError("outlier_range must be non-negative. Found: {}".format(outlier_range))
    values = df[outlier_col]
    mean = values.mean()
    # pandas computes the sample standard deviation by default; it is NaN for fewer than two values,
    # in which case both comparisons below are False and no row is marked.
    std = values.std()
    if high_values_are_good:
        is_outlier = values < mean - outlier_range * std
    else:
        is_outlier = values > mean + outlier_range * std
    df[COL_IS_OUTLIER] = ["Yes" if b else "" for b in is_outlier]
    return df