Source code for plismbench.utils.evaluate
"""Utility functions for metrics evaluation."""
import itertools
from functools import lru_cache
from pathlib import Path
import numpy as np
import pandas as pd
# Number of tiles in the full per-slide tile set; ``get_tiles_subset_idx``
# returns the complete index range when asked for exactly this many tiles.
NUM_TILES_PER_SLIDE: int = 16_278
# Expected number of slides; ``prepare_pairs_dataframe`` asserts that the slide
# features dataframe has exactly this many rows.
NUM_SLIDES: int = 91
[docs]
def get_tiles_subset_idx(n_tiles: int) -> np.ndarray:
    """Return the tile indices to keep when working with ``n_tiles`` tiles.

    When ``n_tiles`` equals ``NUM_TILES_PER_SLIDE`` the full index range
    ``0 .. NUM_TILES_PER_SLIDE - 1`` is returned. Otherwise the indices are
    loaded from ``assets/tiles_subset_{n_tiles}.npy``, where ``assets`` is
    resolved relative to ``Path(__file__).parents[2]``.

    Parameters
    ----------
    n_tiles : int
        Number of tiles requested.

    Returns
    -------
    np.ndarray
        Array of tile indices containing ``n_tiles`` distinct values.

    Raises
    ------
    AssertionError
        If the number of distinct indices differs from ``n_tiles``.
    """
    if n_tiles != NUM_TILES_PER_SLIDE:
        subset_file = (
            Path(__file__).parents[2] / "assets" / f"tiles_subset_{n_tiles}.npy"
        )
        subset_idx = np.load(subset_file)
    else:
        subset_idx = np.arange(0, NUM_TILES_PER_SLIDE)

    # Sanity check: the subset must hold exactly ``n_tiles`` distinct indices.
    n_distinct = len(set(subset_idx))
    assert n_distinct == n_tiles
    return subset_idx
[docs]
@lru_cache(maxsize=128)
def load_features(fpath: Path) -> np.ndarray:
    """Load a features array from ``fpath`` and return it as float32.

    Results are memoized per ``fpath`` (LRU cache, 128 entries), so repeated
    calls with the same path return the very same array object without
    re-reading the file. Modifying the returned array in place therefore
    affects every later caller.

    Parameters
    ----------
    fpath : Path
        Location of the ``.npy`` file to read.

    Returns
    -------
    np.ndarray
        The stored array cast to ``np.float32``.
    """
    # NOTE: per the original author, the features are converted to float16
    # later on by the consumers of this function.
    return np.load(fpath).astype(np.float32)
[docs]
def prepare_features_dataframe(features_dir: Path, extractor: str) -> pd.DataFrame:
    """Prepare unique WSI features dataframe with features paths and metadata.

    Looks for ``*/features.npy`` files under ``features_dir / extractor``,
    keeps those whose path contains ``"_to_GMH_S60.tif"`` and builds one row
    per file. The slide name is the name of the directory holding the file;
    staining and scanner are the first two ``"_"``-separated tokens of that
    name.

    Parameters
    ----------
    features_dir : Path
        Root directory containing one sub-directory per extractor.
    extractor : str
        Name of the extractor sub-directory to scan.

    Returns
    -------
    pd.DataFrame
        Dataframe with columns ``"slide"``, ``"features_path"``,
        ``"staining"`` and ``"scanner"``, one row per matching features file,
        ordered by features path. Empty (0 rows, 4 columns) when nothing
        matches.

    Raises
    ------
    ValueError
        If a slide directory name has fewer than two ``"_"``-separated tokens.
    """
    # ``Path.glob`` yields entries in a filesystem-dependent order. Sorting
    # makes the row order (and anything derived from it, such as pair
    # orientation built from this dataframe) reproducible across machines.
    features_paths = sorted(
        fp
        for fp in (features_dir / extractor).glob("*/features.npy")
        if "_to_GMH_S60.tif" in str(fp)
    )

    # One record per slide: name, features path, staining and scanner.
    slide_data = []
    for features_path in features_paths:
        slide_name = features_path.parent.name
        staining, scanner = slide_name.split("_")[:2]
        slide_data.append([slide_name, features_path, staining, scanner])

    return pd.DataFrame(
        slide_data, columns=["slide", "features_path", "staining", "scanner"]
    )
[docs]
def prepare_pairs_dataframe(features_dir: Path, extractor: str) -> pd.DataFrame:
    """Build the dataframe of all unordered slide pairs with their metadata.

    The per-slide dataframe from ``prepare_features_dataframe`` is crossed
    with itself, then reduced to one row per unordered pair of distinct
    slides. Columns are the per-slide columns suffixed with ``_a`` and ``_b``.

    Parameters
    ----------
    features_dir : Path
        Root directory containing one sub-directory per extractor.
    extractor : str
        Name of the extractor sub-directory to scan.

    Returns
    -------
    pd.DataFrame
        One row per unordered slide pair, sorted by ``features_path_a`` then
        ``features_path_b``, with a fresh integer index.

    Raises
    ------
    AssertionError
        If the per-slide dataframe is not of shape ``(NUM_SLIDES, 4)`` or the
        number of pairs is not ``NUM_SLIDES * (NUM_SLIDES - 1) / 2``.
    """
    per_slide = prepare_features_dataframe(
        features_dir=features_dir, extractor=extractor
    )
    expected_shape = (NUM_SLIDES, 4)
    assert per_slide.shape == expected_shape, (
        "Slide features dataframe should be of shape (91, 4)."
    )

    # Full cartesian product, labelled "<slide_a>---<slide_b>" for lookup.
    crossed = per_slide.merge(per_slide, how="cross", suffixes=("_a", "_b"))
    crossed = crossed.set_index(crossed["slide_a"] + "---" + crossed["slide_b"])

    # Labels of the unordered pairs: each pair of distinct slides appears once.
    pair_labels = [
        "---".join((first, second))
        for first, second in set(itertools.combinations(per_slide["slide"], 2))
    ]

    selected = crossed.loc[pair_labels]
    selected = selected.sort_values(["features_path_a", "features_path_b"])
    selected = selected.reset_index(drop=True)

    n_expected_pairs = NUM_SLIDES * (NUM_SLIDES - 1) // 2
    assert selected.shape[0] == n_expected_pairs, (
        "There should be 4,095 unique pairs of slides."
    )
    return selected