# Source code for plismbench.engine.extract.core
"""Perform features extraction from PLISM dataset."""
from __future__ import annotations
from pathlib import Path
from plismbench.engine.extract.extract_from_h5 import run_extract_h5
from plismbench.engine.extract.extract_from_png import run_extract_streaming
# [docs]
def run_extract(
    feature_extractor_name: str,
    batch_size: int,
    device: int,
    export_dir: Path,
    download_dir: Path | None = None,
    streaming: bool = False,
    overwrite: bool = False,
    workers: int = 8,
) -> None:
    """Run features extraction.

    If ``streaming=False``, data will be downloaded and stored to disk from
    https://huggingface.co/datasets/owkin/plism-dataset. This dataset
    contains 91 .h5 files each containing 16,278 images converted
    into numpy arrays. In this scenario, 300 GB of storage is necessary
    and ``download_dir`` must be provided.

    If ``streaming=True``, data will be downloaded on the fly from
    https://huggingface.co/datasets/owkin/plism-dataset-tiles but not
    stored to disk. This dataset contains 91x16278 images stored as .png
    files. Streaming is enabled using the ``datasets`` library and
    ``datasets.load_dataset(..., streaming=True)``. Note that this comes
    with the limitation of using ``IterableDataset``, meaning that no easy
    resume can be performed if the features extraction fails.

    Parameters
    ----------
    feature_extractor_name: str
        Name of the feature extractor to run.
    batch_size: int
        Batch size used during inference.
    device: int
        Device index to run extraction on.
    export_dir: Path
        Directory where extracted features are exported.
    download_dir: Path | None = None
        Directory where the .h5 dataset is downloaded. Required when
        ``streaming=False``; ignored when ``streaming=True``.
    streaming: bool = False
        Whether to stream images instead of downloading .h5 files to disk.
    overwrite: bool = False
        Whether to overwrite previously extracted features.
    workers: int = 8
        Number of workers (only used by the .h5 extraction path).

    Raises
    ------
    ValueError
        If ``streaming=False`` and ``download_dir`` is not specified.
    """
    if streaming:
        run_extract_streaming(
            feature_extractor_name=feature_extractor_name,
            batch_size=batch_size,
            device=device,
            export_dir=export_dir,
            overwrite=overwrite,
        )
    else:
        # Original code asserted ``isinstance(download_dir, str)``, which
        # always fails for a ``Path`` (the annotated type) and is stripped
        # under ``python -O``. Validate explicitly instead.
        if download_dir is None:
            raise ValueError("Download directory should be specified.")
        run_extract_h5(
            feature_extractor_name=feature_extractor_name,
            batch_size=batch_size,
            device=device,
            export_dir=export_dir,
            download_dir=download_dir,
            overwrite=overwrite,
            workers=workers,
        )