Source code for InnerEye.ML.normalize_and_visualize_dataset

#  ------------------------------------------------------------------------------------------
#  Copyright (c) Microsoft Corporation. All rights reserved.
#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
#  ------------------------------------------------------------------------------------------
import sys
from pathlib import Path
from typing import Dict, Optional, Tuple

import pandas as pd
import param

from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Azure.azure_runner import ParserResult, create_runner_parser, parse_args_and_add_yaml_variables
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import logging_to_stdout
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML import plotting
from InnerEye.ML.common import ARGS_TXT, DATASET_CSV_FILE_NAME
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.dataset.full_image_dataset import load_dataset_sources
from InnerEye.ML.photometric_normalization import PhotometricNormalization
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.io_util import load_images_from_dataset_source
from health_azure import DatasetConfig


[docs]class NormalizeAndVisualizeConfig(GenericConfig):
    image_channel: Optional[str] = param.String(default=None,
                                                doc="The name of the image channel that should be normalized.")
    gt_channel: Optional[str] = param.String(default=None, doc="The name of the ground truth channel that should "
                                                               "be used when visualizing slices.")
    only_first: int = param.Integer(default=0,
                                    doc="Only process the first N images of the dataset, to speed up things.")
    result_folder: str = param.String(default="NormResults",
                                      doc="The folder to use to store the resulting plots. By default, "
                                          "plots will go into the 'NormResults' subfolder inside of the dataset "
                                          "folder. If a relative path is specified here, the folder will be created as"
                                          "a subfolder of the dataset folder. An absolute path can be used too.")
    ignore_mask: bool = param.Boolean(doc="If true, the mask channel specified in the image will not be used, and all "
                                          "histograms and normalization will use all image pixels.")


[docs]def create_parser(yaml_file_path: Path) -> ParserResult:
    """
    Create a parser for all runner arguments, even though we are only using a subset of the arguments.
    This way, we can get secrets handling in a consistent way.
    In particular, this will create arguments for

    * ``--local_dataset``
    * ``--azure_dataset_id``
    """
    parser = create_runner_parser(SegmentationModelBase)
    NormalizeAndVisualizeConfig.add_args(parser)
    return parse_args_and_add_yaml_variables(parser, yaml_config_file=yaml_file_path, fail_on_unknown_args=True)


[docs]def get_configs(default_model_config: SegmentationModelBase,
                yaml_file_path: Path) -> Tuple[SegmentationModelBase, AzureConfig, Dict]:
    parser_result = create_parser(yaml_file_path)
    args = parser_result.args
    runner_config = AzureConfig(**args)
    logging_to_stdout(args["log_level"])
    config = default_model_config or ModelConfigLoader().create_model_config_from_name(runner_config.model)
    config.apply_overrides(parser_result.overrides, should_validate=False)
    return config, runner_config, args


[docs]def main(yaml_file_path: Path) -> None:
    """
    Invoke either by

    * specifying a model, ``--model Lung``
    * or specifying dataset and normalization parameters separately: ``--azure_dataset_id=foo --norm_method=None``

    In addition, the arguments ``--image_channel`` and ``--gt_channel`` must be specified.
    """
    config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
    dataset_config = DatasetConfig(name=config.azure_dataset_id,
                                   local_folder=config.local_dataset,
                                   use_mounting=True)
    local_dataset, mount_context = dataset_config.to_input_dataset_local(workspace=runner_config.get_workspace())
    dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
    normalizer_config = NormalizeAndVisualizeConfig(**args)
    actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id
    image_channel = normalizer_config.image_channel or config.image_channels[0]
    if not image_channel:
        raise ValueError("No image channel selected. Specify a model by name, or use the image_channel argument.")
    gt_channel = normalizer_config.gt_channel or config.ground_truth_ids[0]
    if not gt_channel:
        raise ValueError("No GT channel selected. Specify a model by name, or use the gt_channel argument.")

    dataset_sources = load_dataset_sources(dataframe,
                                           local_dataset_root_folder=local_dataset,
                                           image_channels=[image_channel],
                                           ground_truth_channels=[gt_channel],
                                           mask_channel=actual_mask_channel)
    result_folder = local_dataset
    if normalizer_config.result_folder is not None:
        result_folder = result_folder / normalizer_config.result_folder
    if not result_folder.is_dir():
        result_folder.mkdir()
    all_patient_ids = [*dataset_sources.keys()]
    if normalizer_config.only_first == 0:
        patient_ids_to_process = all_patient_ids
    else:
        patient_ids_to_process = all_patient_ids[:normalizer_config.only_first]
    args_file = result_folder / ARGS_TXT
    args_file.write_text(" ".join(sys.argv[1:]))
    config_file = result_folder / "config.txt"
    config_file.write_text(str(config))
    normalizer = PhotometricNormalization(config)
    for patient_id in patient_ids_to_process:
        print(f"Starting to process patient {patient_id}")
        images = load_images_from_dataset_source(dataset_sources[patient_id])
        plotting.plot_normalization_result(images, normalizer, result_folder, result_prefix=image_channel)


if __name__ == '__main__':
    main(yaml_file_path=fixed_paths.SETTINGS_YAML_FILE)