# Source code for plismbench.engine.extract.core
"""Perform features extraction from PLISM dataset."""
from __future__ import annotations
from pathlib import Path
from plismbench.engine.extract.extract_from_h5 import run_extract_h5
from plismbench.engine.extract.extract_from_png import run_extract_streaming
# [docs]
def run_extract(
    feature_extractor_name: str,
    batch_size: int,
    device: int,
    export_dir: Path,
    download_dir: Path | None = None,
    streaming: bool = False,
    overwrite: bool = False,
    workers: int = 8,
) -> None:
    """Run features extraction.

    If ``streaming=False``, data will be downloaded and stored to disk from
    https://huggingface.co/datasets/owkin/plism-dataset. This dataset
    contains 91 .h5 files each containing 16,278 images converted
    into numpy arrays. In this scenario, 300 GB of storage is necessary
    and ``download_dir`` must be provided.

    If ``streaming=True``, data will be downloaded on the fly from
    https://huggingface.co/datasets/owkin/plism-dataset-tiles but not
    stored to disk. This dataset contains 91x16278 images stored as .png
    files. Streaming is enabled using the ``datasets`` library and
    ``datasets.load_dataset(..., streaming=True)``. Note that this comes
    with the limitation of using ``IterableDataset``, meaning that no easy
    resume can be performed if the features extraction fails.

    Parameters
    ----------
    feature_extractor_name: str
        Name of the feature extractor to run.
    batch_size: int
        Batch size used during inference.
    device: int
        Device index to run extraction on.
    export_dir: Path
        Directory where extracted features are exported.
    download_dir: Path | None = None
        Directory where the .h5 dataset is downloaded. Required when
        ``streaming=False``; ignored when ``streaming=True``.
    streaming: bool = False
        Whether to stream images instead of downloading .h5 files to disk.
    overwrite: bool = False
        Whether to overwrite previously extracted features.
    workers: int = 8
        Number of workers (only used by the .h5 extraction path).

    Raises
    ------
    ValueError
        If ``streaming=False`` and ``download_dir`` is not specified.
    """
    if streaming:
        run_extract_streaming(
            feature_extractor_name=feature_extractor_name,
            batch_size=batch_size,
            device=device,
            export_dir=export_dir,
            overwrite=overwrite,
        )
    else:
        # Original code asserted ``isinstance(download_dir, str)``, which
        # always fails for a ``Path`` (the annotated type) and is stripped
        # under ``python -O``. Validate explicitly instead.
        if download_dir is None:
            raise ValueError("Download directory should be specified.")
        run_extract_h5(
            feature_extractor_name=feature_extractor_name,
            batch_size=batch_size,
            device=device,
            export_dir=export_dir,
            download_dir=download_dir,
            overwrite=overwrite,
            workers=workers,
        )