From 86d7ab46cc891531bb052d40d8cc9757d1eaf169 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Kuku=C4=8Dka?= Date: Mon, 20 Apr 2026 16:37:09 +0200 Subject: [PATCH 1/8] refactor: extract SlidesTilesLoader from MetaTiledSlides Separates metadata loading/indexing logic (no torch dependency) into SlidesTilesLoader; MetaTiledSlides delegates to it while keeping the same public interface. --- .../mlkit/data/datasets/meta_tiled_slides.py | 115 +++++++++++++----- 1 file changed, 83 insertions(+), 32 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 8437e41..7f01179 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -15,16 +15,8 @@ T = TypeVar("T", covariant=True) -class MetaTiledSlides(ConcatDataset[T], ABC): - """Abstract base class for creating concatenated datasets from slides and tiles. - - This class provides a factory method to load and concatenate datasets from different - sources: local storage, preloaded data, or artifacts stored in MLFlow. - - Attributes: - slides (HFDataset): Dataset containing slide metadata. - tiles (HFDataset): Dataset containing tile metadata. - """ +class SlidesTilesLoader: + """Loads and concatenates slides/tiles metadata.""" def __init__( self, @@ -34,7 +26,7 @@ def __init__( slides_and_tiles: tuple[HFDataset, HFDataset] | None = None, hf_kwargs: dict[str, Any] | None = None, ) -> None: - """Load slides and tiles from MLFlow artifacts. + """Load slides and tiles from local paths, MLFlow URIs, or preloaded datasets. Args: paths: List of directories to load slides and tiles from. Each @@ -66,8 +58,6 @@ def __init__( self.tiles = tiles self._slide_id_to_indices = self._build_tile_index(self.tiles) - super().__init__(self.generate_datasets()) - @staticmethod def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: """Creates a fast lookup table for slide indices. 
@@ -115,25 +105,6 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: # Map the string key to the PyArrow ListScalar return {key: values_array[i] for i, key in enumerate(keys)} - @abstractmethod - def generate_datasets(self) -> Iterable[Dataset[T]]: - """Factory method to generate datasets from slides and tiles. - - Example: - ```python - return ( - SlideTiles( - slide_path=slide["path"], - level=slide["level"], - tile_extent_x=slide["tile_extent_x"], - tile_extent_y=slide["tile_extent_y"], - tiles=self.filter_tiles_by_slide(slide.id), - ) - for slide in self.slides - ) - ``` - """ - def filter_tiles_by_slide(self, slide_id: str | bytes) -> HFDataset: """Returns a view of the dataset using a slice or indices. @@ -217,3 +188,83 @@ def resolve_search_path(partition: str) -> list[dict[str, str]]: except Exception as e: msg = "Failed to load Parquet files." raise FileNotFoundError(msg) from e + + +class MetaTiledSlides(ConcatDataset[T], ABC): + """Abstract base class for creating concatenated datasets from slides and tiles. + + This class provides a factory method to load and concatenate datasets from different + sources: local storage, preloaded data, or artifacts stored in MLFlow. + + Attributes: + slides (HFDataset): Dataset containing slide metadata. + tiles (HFDataset): Dataset containing tile metadata. + """ + + def __init__( + self, + *, + paths: Iterable[Path | str] | None = None, + uris: Iterable[str] | None = None, + slides_and_tiles: tuple[HFDataset, HFDataset] | None = None, + hf_kwargs: dict[str, Any] | None = None, + ) -> None: + """Load slides and tiles from MLFlow artifacts. + + Args: + paths: List of directories to load slides and tiles from. Each + directory must include either single files (`slides.parquet` + and `tiles.parquet`) or subdirectories (`slides/` and `tiles/`) + containing chunked Parquet files. 
+ uris: List of MLFlow artifact URIs pointing to folders containing + either single files (`slides.parquet` and `tiles.parquet`) or + subdirectories (`slides/` and `tiles/`) containing chunked + Parquet files. + slides_and_tiles: Tuple containing the slides and tiles Datasets. + hf_kwargs: Additional keyword arguments to pass to HuggingFace's + `load_dataset` function. Defaults to `{"path": "parquet", "split": "train"}`. + """ + self._meta = SlidesTilesLoader( + paths=paths, + uris=uris, + slides_and_tiles=slides_and_tiles, + hf_kwargs=hf_kwargs, + ) + self.slides = self._meta.slides + self.tiles = self._meta.tiles + super().__init__(self.generate_datasets()) + + def filter_tiles_by_slide(self, slide_id: str | bytes) -> HFDataset: + """Returns a view of the dataset using a slice or indices. + + This function creates a view of the `self.tiles` dataset that contains only + the tiles belonging to the specified slide. It uses the precomputed + `_slide_id_to_indices` mapping to efficiently retrieve the relevant tiles + without copying data. + + Args: + slide_id: The ID of the slide to filter tiles. + + Returns: + A view of the tiles dataset containing only the tiles for the specified slide. + """ + return self._meta.filter_tiles_by_slide(slide_id) + + @abstractmethod + def generate_datasets(self) -> Iterable[Dataset[T]]: + """Factory method to generate datasets from slides and tiles. 
+ + Example: + ```python + return ( + SlideTiles( + slide_path=slide["path"], + level=slide["level"], + tile_extent_x=slide["tile_extent_x"], + tile_extent_y=slide["tile_extent_y"], + tiles=self.filter_tiles_by_slide(slide.id), + ) + for slide in self.slides + ) + ``` + """ From 0c18e5e520aed7d4fd6ccddaa53011f5393b1e79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Kuku=C4=8Dka?= Date: Mon, 20 Apr 2026 16:41:37 +0200 Subject: [PATCH 2/8] docs: update outdated docs Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 7f01179..b7878db 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -209,7 +209,7 @@ def __init__( slides_and_tiles: tuple[HFDataset, HFDataset] | None = None, hf_kwargs: dict[str, Any] | None = None, ) -> None: - """Load slides and tiles from MLFlow artifacts. + """Load slides and tiles from local paths, MLFlow URIs, or preloaded datasets. Args: paths: List of directories to load slides and tiles from. Each From a9af3a4193cc9cc74da187b5302d28ba26e8dc77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Kuku=C4=8Dka?= Date: Mon, 20 Apr 2026 16:44:42 +0200 Subject: [PATCH 3/8] fix: misleading error --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index b7878db..fbc69a5 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -143,7 +143,7 @@ def load_slides_and_tiles( `load_dataset` function. Raises: - FileNotFoundError: If the data cannot be loaded from the specified URIs. 
+ RuntimeError: If the data cannot be loaded from the specified URIs. Returns: A tuple containing the slides and tiles Datasets. @@ -187,7 +187,7 @@ def resolve_search_path(partition: str) -> list[dict[str, str]]: except Exception as e: msg = "Failed to load Parquet files." - raise FileNotFoundError(msg) from e + raise RuntimeError(msg) from e class MetaTiledSlides(ConcatDataset[T], ABC): @@ -209,7 +209,7 @@ def __init__( slides_and_tiles: tuple[HFDataset, HFDataset] | None = None, hf_kwargs: dict[str, Any] | None = None, ) -> None: - """Load slides and tiles from local paths, MLFlow URIs, or preloaded datasets. + """Load slides and tiles from MLFlow artifacts. Args: paths: List of directories to load slides and tiles from. Each From 977b0c9819f645ba6d1c090673c0acf9c9f01c7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Kuku=C4=8Dka?= Date: Mon, 20 Apr 2026 18:54:01 +0200 Subject: [PATCH 4/8] fix: export class --- rationai/mlkit/data/datasets/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rationai/mlkit/data/datasets/__init__.py b/rationai/mlkit/data/datasets/__init__.py index 453129e..4499911 100644 --- a/rationai/mlkit/data/datasets/__init__.py +++ b/rationai/mlkit/data/datasets/__init__.py @@ -1,5 +1,7 @@ -from rationai.mlkit.data.datasets.meta_tiled_slides import MetaTiledSlides +from rationai.mlkit.data.datasets.meta_tiled_slides import ( + MetaTiledSlides, SlidesTilesLoader +) from rationai.mlkit.data.datasets.openslide_tiles_dataset import OpenSlideTilesDataset -__all__ = ["MetaTiledSlides", "OpenSlideTilesDataset"] +__all__ = ["MetaTiledSlides", "SlidesTilesLoader", "OpenSlideTilesDataset"] From 0123e12b96232dc06d7e44fd42b35c77b66386ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Kuku=C4=8Dka?= Date: Mon, 20 Apr 2026 19:57:21 +0200 Subject: [PATCH 5/8] refactor: move SlidesTilesLoader to its own module --- rationai/mlkit/data/datasets/__init__.py | 7 +- .../mlkit/data/datasets/meta_tiled_slides.py | 182 
+---------------- .../data/datasets/slides_tiles_loader.py | 185 ++++++++++++++++++ 3 files changed, 190 insertions(+), 184 deletions(-) create mode 100644 rationai/mlkit/data/datasets/slides_tiles_loader.py diff --git a/rationai/mlkit/data/datasets/__init__.py b/rationai/mlkit/data/datasets/__init__.py index 4499911..bf56e5d 100644 --- a/rationai/mlkit/data/datasets/__init__.py +++ b/rationai/mlkit/data/datasets/__init__.py @@ -1,7 +1,6 @@ -from rationai.mlkit.data.datasets.meta_tiled_slides import ( - MetaTiledSlides, SlidesTilesLoader -) +from rationai.mlkit.data.datasets.meta_tiled_slides import MetaTiledSlides from rationai.mlkit.data.datasets.openslide_tiles_dataset import OpenSlideTilesDataset +from rationai.mlkit.data.datasets.slides_tiles_loader import SlidesTilesLoader -__all__ = ["MetaTiledSlides", "SlidesTilesLoader", "OpenSlideTilesDataset"] +__all__ = ["MetaTiledSlides", "OpenSlideTilesDataset", "SlidesTilesLoader"] diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index fbc69a5..1b80259 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -1,193 +1,15 @@ from abc import ABC, abstractmethod from collections.abc import Iterable -from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Any, TypeVar -import numpy as np -import pyarrow as pa from datasets import Dataset as HFDataset -from datasets import concatenate_datasets, load_dataset -from mlflow.artifacts import download_artifacts from torch.utils.data import ConcatDataset, Dataset +from rationai.mlkit.data.datasets.slides_tiles_loader import SlidesTilesLoader -T = TypeVar("T", covariant=True) - - -class SlidesTilesLoader: - """Loads and concatenates slides/tiles metadata.""" - - def __init__( - self, - *, - paths: Iterable[Path | str] | None = None, - uris: Iterable[str] | None = None, - slides_and_tiles: tuple[HFDataset, 
HFDataset] | None = None, - hf_kwargs: dict[str, Any] | None = None, - ) -> None: - """Load slides and tiles from local paths, MLFlow URIs, or preloaded datasets. - - Args: - paths: List of directories to load slides and tiles from. Each - directory must include either single files (`slides.parquet` - and `tiles.parquet`) or subdirectories (`slides/` and `tiles/`) - containing chunked Parquet files. - uris: List of MLFlow artifact URIs pointing to folders containing - either single files (`slides.parquet` and `tiles.parquet`) or - subdirectories (`slides/` and `tiles/`) containing chunked - Parquet files. - slides_and_tiles: Tuple containing the slides and tiles Datasets. - hf_kwargs: Additional keyword arguments to pass to HuggingFace's - `load_dataset` function. Defaults to `{"path": "parquet", "split": "train"}`. - """ - assert paths or uris or slides_and_tiles, ( - "At least one of paths, uris or slides_and_tiles must be provided." - ) - - if hf_kwargs is None: - hf_kwargs = {"path": "parquet", "split": "train"} - - slides, tiles = self.load_slides_and_tiles(paths or [], uris or [], hf_kwargs) - - if slides_and_tiles is not None: - slides = concatenate_datasets([slides, slides_and_tiles[0]]) - tiles = concatenate_datasets([tiles, slides_and_tiles[1]]) - - self.slides = slides - self.tiles = tiles - self._slide_id_to_indices = self._build_tile_index(self.tiles) - - @staticmethod - def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: - """Creates a fast lookup table for slide indices. - - This function builds a mapping from `slide_id` to the list of indices in the - `tiles` dataset that correspond to that slide. - - Args: - tiles: A dataset containing a `slide_id` column. - - Returns: - A dictionary mapping each `slide_id` to a list of indices in the `tiles` dataset. - """ - if len(tiles) == 0: - return {} - - # 1. 
Grab the column directly from the underlying PyArrow Table - slide_ids = tiles.data.column("slide_id") - num_rows = len(slide_ids) - - # 2. Handle the "Large" type conversion - current_type = slide_ids.type - if pa.types.is_string(current_type): - slide_ids = slide_ids.cast(pa.large_string()) - elif pa.types.is_binary(current_type): - slide_ids = slide_ids.cast(pa.large_binary()) - - # 3. Generate sequential row indices - # np.arange is used here because PyArrow can wrap it instantly with zero-copy overhead - row_indices = pa.array(np.arange(num_rows, dtype=np.int64)) - - # 4. Combine them into a lightweight PyArrow Table - table = pa.Table.from_arrays( - [slide_ids, row_indices], names=["slide_id", "idx"] - ) - - # 5. Perform the native Arrow groupby and aggregate - # The "list" function aggregates all indices for a given slide_id into a single Arrow List scalar - grouped = table.group_by("slide_id").aggregate([("idx", "list")]) - # 6. Extract keys to Python, but KEEP values as PyArrow ListScalars - keys = grouped.column("slide_id").to_numpy() - values_array = grouped.column("idx_list") - - # Map the string key to the PyArrow ListScalar - return {key: values_array[i] for i, key in enumerate(keys)} - - def filter_tiles_by_slide(self, slide_id: str | bytes) -> HFDataset: - """Returns a view of the dataset using a slice or indices. - - This function creates a view of the `self.tiles` dataset that contains only - the tiles belonging to the specified slide. It uses the precomputed - `_slide_id_to_indices` mapping to efficiently retrieve the relevant tiles - without copying data. - - Args: - slide_id: The ID of the slide to filter tiles. - - Returns: - A view of the tiles dataset containing only the tiles for the specified slide. 
- """ - tile_indices = self._slide_id_to_indices.get( - slide_id, pa.scalar([], type=pa.list_(pa.int64())) - ) - return self.tiles.select(tile_indices.values.to_numpy()) - - @staticmethod - def load_slides_and_tiles( - paths: Iterable[str | Path], uris: Iterable[str], hf_kwargs: dict[str, Any] - ) -> tuple[HFDataset, HFDataset]: - """Load slides and tiles parquets from local storage and MLFlow artifacts. - - Args: - paths: List of directories to load slides and tiles from. Each - directory must include either single files (`slides.parquet` - and `tiles.parquet`) or subdirectories (`slides/` and `tiles/`) - containing chunked Parquet files. - uris: List of MLFlow artifact URIs pointing to folders containing - either single files (`slides.parquet` and `tiles.parquet`) or - subdirectories (`slides/` and `tiles/`) containing chunked - Parquet files. - hf_kwargs: Additional keyword arguments to pass to HuggingFace's - `load_dataset` function. - - Raises: - RuntimeError: If the data cannot be loaded from the specified URIs. - - Returns: - A tuple containing the slides and tiles Datasets. 
- """ - # Parallelize MLFlow downloads (I/O Bound) - with ThreadPoolExecutor() as executor: - artifacts_paths = list( - executor.map(lambda uri: download_artifacts(artifact_uri=uri), uris) - ) - - search_dirs = [Path(p) for p in (*paths, *artifacts_paths)] - - # Handle empty datasets - if not len(search_dirs): - return HFDataset.from_dict({}), HFDataset.from_dict({}) - - def resolve_search_path(partition: str) -> list[dict[str, str]]: - return [ - {"data_dir": str(path / partition)} - if (path / partition).is_dir() - else {"data_files": str(path / f"{partition}.parquet")} - for path in search_dirs - ] - - try: - slides_ds = concatenate_datasets( - [ - load_dataset(**hf_kwargs, **datasource) - for datasource in resolve_search_path("slides") - ] - ) - - tiles_ds = concatenate_datasets( - [ - load_dataset(**hf_kwargs, **datasource) - for datasource in resolve_search_path("tiles") - ] - ) - - return slides_ds, tiles_ds - - except Exception as e: - msg = "Failed to load Parquet files." - raise RuntimeError(msg) from e +T = TypeVar("T", covariant=True) class MetaTiledSlides(ConcatDataset[T], ABC): diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py new file mode 100644 index 0000000..1aa55ce --- /dev/null +++ b/rationai/mlkit/data/datasets/slides_tiles_loader.py @@ -0,0 +1,185 @@ +from collections.abc import Iterable +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Any + +import numpy as np +import pyarrow as pa +from datasets import Dataset as HFDataset +from datasets import concatenate_datasets, load_dataset +from mlflow.artifacts import download_artifacts + + +class SlidesTilesLoader: + """Loads and concatenates slides/tiles metadata.""" + + def __init__( + self, + *, + paths: Iterable[Path | str] | None = None, + uris: Iterable[str] | None = None, + slides_and_tiles: tuple[HFDataset, HFDataset] | None = None, + hf_kwargs: dict[str, Any] | None = None, + 
) -> None: + """Load slides and tiles from local paths, MLFlow URIs, or preloaded datasets. + + Args: + paths: List of directories to load slides and tiles from. Each + directory must include either single files (`slides.parquet` + and `tiles.parquet`) or subdirectories (`slides/` and `tiles/`) + containing chunked Parquet files. + uris: List of MLFlow artifact URIs pointing to folders containing + either single files (`slides.parquet` and `tiles.parquet`) or + subdirectories (`slides/` and `tiles/`) containing chunked + Parquet files. + slides_and_tiles: Tuple containing the slides and tiles Datasets. + hf_kwargs: Additional keyword arguments to pass to HuggingFace's + `load_dataset` function. Defaults to `{"path": "parquet", "split": "train"}`. + """ + assert paths or uris or slides_and_tiles, ( + "At least one of paths, uris or slides_and_tiles must be provided." + ) + + if hf_kwargs is None: + hf_kwargs = {"path": "parquet", "split": "train"} + + slides, tiles = self.load_slides_and_tiles(paths or [], uris or [], hf_kwargs) + + if slides_and_tiles is not None: + slides = concatenate_datasets([slides, slides_and_tiles[0]]) + tiles = concatenate_datasets([tiles, slides_and_tiles[1]]) + + self.slides = slides + self.tiles = tiles + self._slide_id_to_indices = self._build_tile_index(self.tiles) + + @staticmethod + def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: + """Creates a fast lookup table for slide indices. + + This function builds a mapping from `slide_id` to the list of indices in the + `tiles` dataset that correspond to that slide. + + Args: + tiles: A dataset containing a `slide_id` column. + + Returns: + A dictionary mapping each `slide_id` to a list of indices in the `tiles` dataset. + """ + if len(tiles) == 0: + return {} + + # 1. Grab the column directly from the underlying PyArrow Table + slide_ids = tiles.data.column("slide_id") + num_rows = len(slide_ids) + + # 2. 
Handle the "Large" type conversion + current_type = slide_ids.type + if pa.types.is_string(current_type): + slide_ids = slide_ids.cast(pa.large_string()) + elif pa.types.is_binary(current_type): + slide_ids = slide_ids.cast(pa.large_binary()) + + # 3. Generate sequential row indices + # np.arange is used here because PyArrow can wrap it instantly with zero-copy overhead + row_indices = pa.array(np.arange(num_rows, dtype=np.int64)) + + # 4. Combine them into a lightweight PyArrow Table + table = pa.Table.from_arrays( + [slide_ids, row_indices], names=["slide_id", "idx"] + ) + + # 5. Perform the native Arrow groupby and aggregate + # The "list" function aggregates all indices for a given slide_id into a single Arrow List scalar + grouped = table.group_by("slide_id").aggregate([("idx", "list")]) + + # 6. Extract keys to Python, but KEEP values as PyArrow ListScalars + keys = grouped.column("slide_id").to_numpy() + values_array = grouped.column("idx_list") + + # Map the string key to the PyArrow ListScalar + return {key: values_array[i] for i, key in enumerate(keys)} + + def filter_tiles_by_slide(self, slide_id: str | bytes) -> HFDataset: + """Returns a view of the dataset using a slice or indices. + + This function creates a view of the `self.tiles` dataset that contains only + the tiles belonging to the specified slide. It uses the precomputed + `_slide_id_to_indices` mapping to efficiently retrieve the relevant tiles + without copying data. + + Args: + slide_id: The ID of the slide to filter tiles. + + Returns: + A view of the tiles dataset containing only the tiles for the specified slide. 
+ """ + tile_indices = self._slide_id_to_indices.get( + slide_id, pa.scalar([], type=pa.list_(pa.int64())) + ) + return self.tiles.select(tile_indices.values.to_numpy()) + + @staticmethod + def load_slides_and_tiles( + paths: Iterable[str | Path], uris: Iterable[str], hf_kwargs: dict[str, Any] + ) -> tuple[HFDataset, HFDataset]: + """Load slides and tiles parquets from local storage and MLFlow artifacts. + + Args: + paths: List of directories to load slides and tiles from. Each + directory must include either single files (`slides.parquet` + and `tiles.parquet`) or subdirectories (`slides/` and `tiles/`) + containing chunked Parquet files. + uris: List of MLFlow artifact URIs pointing to folders containing + either single files (`slides.parquet` and `tiles.parquet`) or + subdirectories (`slides/` and `tiles/`) containing chunked + Parquet files. + hf_kwargs: Additional keyword arguments to pass to HuggingFace's + `load_dataset` function. + + Raises: + RuntimeError: If the data cannot be loaded from the specified URIs. + + Returns: + A tuple containing the slides and tiles Datasets. 
+ """ + # Parallelize MLFlow downloads (I/O Bound) + with ThreadPoolExecutor() as executor: + artifacts_paths = list( + executor.map(lambda uri: download_artifacts(artifact_uri=uri), uris) + ) + + search_dirs = [Path(p) for p in (*paths, *artifacts_paths)] + + # Handle empty datasets + if not len(search_dirs): + return HFDataset.from_dict({}), HFDataset.from_dict({}) + + def resolve_search_path(partition: str) -> list[dict[str, str]]: + return [ + {"data_dir": str(path / partition)} + if (path / partition).is_dir() + else {"data_files": str(path / f"{partition}.parquet")} + for path in search_dirs + ] + + try: + slides_ds = concatenate_datasets( + [ + load_dataset(**hf_kwargs, **datasource) + for datasource in resolve_search_path("slides") + ] + ) + + tiles_ds = concatenate_datasets( + [ + load_dataset(**hf_kwargs, **datasource) + for datasource in resolve_search_path("tiles") + ] + ) + + return slides_ds, tiles_ds + + except Exception as e: + msg = "Failed to load Parquet files." + raise RuntimeError(msg) from e From ddf0e65762d46c26eb0234131ad9b4fc6172caa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Kuku=C4=8Dka?= Date: Mon, 20 Apr 2026 20:00:36 +0200 Subject: [PATCH 6/8] docs: fix slide_id access in generate_datasets example --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 1b80259..ff61c79 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -84,7 +84,7 @@ def generate_datasets(self) -> Iterable[Dataset[T]]: level=slide["level"], tile_extent_x=slide["tile_extent_x"], tile_extent_y=slide["tile_extent_y"], - tiles=self.filter_tiles_by_slide(slide.id), + tiles=self.filter_tiles_by_slide(slide["id"]), ) for slide in self.slides ) From 1dd09e7e6b2de512af79f0162aa4ef57cfae217c Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adam=20Kuku=C4=8Dka?= Date: Mon, 20 Apr 2026 22:23:38 +0200 Subject: [PATCH 7/8] fix: concatenating with empty HFDataset --- .../mlkit/data/datasets/slides_tiles_loader.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py index 1aa55ce..f003710 100644 --- a/rationai/mlkit/data/datasets/slides_tiles_loader.py +++ b/rationai/mlkit/data/datasets/slides_tiles_loader.py @@ -43,14 +43,20 @@ def __init__( if hf_kwargs is None: hf_kwargs = {"path": "parquet", "split": "train"} - slides, tiles = self.load_slides_and_tiles(paths or [], uris or [], hf_kwargs) + slides = [] + tiles = [] + + if paths or uris: + s, t = self.load_slides_and_tiles(paths or [], uris or [], hf_kwargs) + slides.append(s) + tiles.append(t) if slides_and_tiles is not None: - slides = concatenate_datasets([slides, slides_and_tiles[0]]) - tiles = concatenate_datasets([tiles, slides_and_tiles[1]]) + slides.append(slides_and_tiles[0]) + tiles.append(slides_and_tiles[1]) - self.slides = slides - self.tiles = tiles + self.slides = concatenate_datasets(slides) if len(slides) > 1 else slides[0] + self.tiles = concatenate_datasets(tiles) if len(tiles) > 1 else tiles[0] self._slide_id_to_indices = self._build_tile_index(self.tiles) @staticmethod From d0959b6962b7eaf0f30538338714df312c8cc223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Kuku=C4=8Dka?= Date: Mon, 20 Apr 2026 22:28:03 +0200 Subject: [PATCH 8/8] fix: replace assert with ValueError Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- rationai/mlkit/data/datasets/slides_tiles_loader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rationai/mlkit/data/datasets/slides_tiles_loader.py b/rationai/mlkit/data/datasets/slides_tiles_loader.py index f003710..7359965 100644 --- a/rationai/mlkit/data/datasets/slides_tiles_loader.py +++ 
b/rationai/mlkit/data/datasets/slides_tiles_loader.py @@ -36,9 +36,10 @@ def __init__( hf_kwargs: Additional keyword arguments to pass to HuggingFace's `load_dataset` function. Defaults to `{"path": "parquet", "split": "train"}`. """ - assert paths or uris or slides_and_tiles, ( - "At least one of paths, uris or slides_and_tiles must be provided." - ) + if not (paths or uris or slides_and_tiles): + raise ValueError( + "At least one of paths, uris or slides_and_tiles must be provided." + ) if hf_kwargs is None: hf_kwargs = {"path": "parquet", "split": "train"}