From 186d943392d39e0d439e15253cf3be74afa39383 Mon Sep 17 00:00:00 2001 From: Jakub Pekar <46449289+JakubPekar@users.noreply.github.com> Date: Sun, 5 Apr 2026 21:52:56 +0200 Subject: [PATCH 01/17] feat: Fix offset overflow by casting slide_ids to large_string Cast slide_ids to large_string to prevent offset overflow. --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index c65a6b0..02cf3fd 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -75,14 +75,16 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: if len(tiles) == 0: return {} - # Get the underlying Arrow table (zero-copy) + # Get the underlying Arrow table table = tiles.data.table - # Since it's sorted, we only care about where 'slide_id' changes. slide_ids = table.column("slide_id") - # Since the dataset is sorted by 'slide_id', we can use - # run-end encoding to find group boundaries efficiently. 
- run_ends = pc.run_end_encode(slide_ids.combine_chunks()) # pyright: ignore[reportAttributeAccessIssue] + # FIX: Cast to large_string to prevent 32-bit offset overflow (2GB limit) + # we use pc.cast because slide_ids is a ChunkedArray + large_slide_ids = pc.cast(slide_ids, pa.large_string()) + + # Now combine_chunks will work because it uses 64-bit offsets + run_ends = pc.run_end_encode(large_slide_ids.combine_chunks()) values = run_ends.values ends = run_ends.run_ends From bd72a95eb8f148e12689c6716ca7f417dd5548ce Mon Sep 17 00:00:00 2001 From: Jakub Pekar <46449289+JakubPekar@users.noreply.github.com> Date: Sun, 5 Apr 2026 23:06:48 +0200 Subject: [PATCH 02/17] fix: Apply suggestions from code review Co-authored-by: Jakub Pekar <46449289+JakubPekar@users.noreply.github.com> --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 02cf3fd..8056e47 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -9,7 +9,7 @@ from datasets import concatenate_datasets, load_dataset from mlflow.artifacts import download_artifacts from torch.utils.data import ConcatDataset, Dataset - +import pyarrow as pa T = TypeVar("T", covariant=True) From 5a7c9d53bcb08a6015829242338da6cd6711d98a Mon Sep 17 00:00:00 2001 From: Jakub Pekar <46449289+JakubPekar@users.noreply.github.com> Date: Sun, 5 Apr 2026 23:30:25 +0200 Subject: [PATCH 03/17] feat: Apply suggestions from code review Co-authored-by: Jakub Pekar <46449289+JakubPekar@users.noreply.github.com> --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 8056e47..2395bf3 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py 
+++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -81,7 +81,7 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: # FIX: Cast to large_string to prevent 32-bit offset overflow (2GB limit) # we use pc.cast because slide_ids is a ChunkedArray - large_slide_ids = pc.cast(slide_ids, pa.large_string()) + large_slide_ids = pc.cast(slide_ids, pa.large_binary()) # Now combine_chunks will work because it uses 64-bit offsets run_ends = pc.run_end_encode(large_slide_ids.combine_chunks()) From a00ad1ba93520aa0955e92d0f49931ec43d90283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Tue, 7 Apr 2026 15:03:56 +0000 Subject: [PATCH 04/17] feat: handle large files --- .../mlkit/data/datasets/meta_tiled_slides.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 2395bf3..ed9a89c 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -75,16 +75,21 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: if len(tiles) == 0: return {} - # Get the underlying Arrow table - table = tiles.data.table - slide_ids = table.column("slide_id") - - # FIX: Cast to large_string to prevent 32-bit offset overflow (2GB limit) - # we use pc.cast because slide_ids is a ChunkedArray - large_slide_ids = pc.cast(slide_ids, pa.large_binary()) - - # Now combine_chunks will work because it uses 64-bit offsets - run_ends = pc.run_end_encode(large_slide_ids.combine_chunks()) + # 1. Access the column while respecting the Sort Indices + # This returns a ChunkedArray + slide_ids = tiles.with_format("arrow")["slide_id"] + + # 2. 
Handle the "Large" type conversion dynamically + current_type = slide_ids.type + type_map = { + pa.string(): pa.large_string(), + pa.binary(): pa.large_binary() + } + target_type = type_map.get(current_type, current_type) + + # 3. Perform the Run-End Encoding + # Cast to avoid 2GB limits and combine chunks for the encoder + run_ends = pc.run_end_encode(pc.cast(slide_ids, target_type).combine_chunks()) values = run_ends.values ends = run_ends.run_ends From ced613cde3853ffa2c3bed2a4f96dd6e91a42988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Thu, 9 Apr 2026 09:35:47 +0000 Subject: [PATCH 05/17] feat: fix tiles loading --- .../mlkit/data/datasets/meta_tiled_slides.py | 36 +++++-------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index ed9a89c..3e91bff 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -10,6 +10,7 @@ from mlflow.artifacts import download_artifacts from torch.utils.data import ConcatDataset, Dataset import pyarrow as pa +import numpy as np T = TypeVar("T", covariant=True) @@ -52,7 +53,7 @@ def __init__( tiles = concatenate_datasets([tiles, slides_and_tiles[1]]) self.slides = slides - self.tiles = tiles.sort("slide_id") + self.tiles = tiles self._slide_id_to_indices = self._build_tile_index(self.tiles) super().__init__(self.generate_datasets()) @@ -75,34 +76,15 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: if len(tiles) == 0: return {} - # 1. Access the column while respecting the Sort Indices - # This returns a ChunkedArray - slide_ids = tiles.with_format("arrow")["slide_id"] + slide_ids = tiles.data.column("slide_id") - # 2. 
Handle the "Large" type conversion dynamically - current_type = slide_ids.type - type_map = { - pa.string(): pa.large_string(), - pa.binary(): pa.large_binary() - } - target_type = type_map.get(current_type, current_type) + # FIX 3: Group indices without sorting the underlying dataset. + # Converting the single PyArrow column to Pandas is extremely fast. + df = slide_ids.to_pandas(name="slide_id").to_frame() + df['idx'] = np.arange(len(df)) - # 3. Perform the Run-End Encoding - # Cast to avoid 2GB limits and combine chunks for the encoder - run_ends = pc.run_end_encode(pc.cast(slide_ids, target_type).combine_chunks()) - - values = run_ends.values - ends = run_ends.run_ends - - index_map = {} - current_offset = 0 - - for sid, end in zip(values, ends, strict=True): - end_py = end.as_py() - index_map[sid.as_py()] = range(current_offset, end_py) - current_offset = end_py - - return index_map + # Group by slide_id and aggregate the physical indices into lists + return df.groupby("slide_id")['idx'].apply(list).to_dict() @abstractmethod def generate_datasets(self) -> Iterable[Dataset[T]]: From 0818342368530eaa002f97a8d153ca96d13087ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Thu, 9 Apr 2026 09:39:07 +0000 Subject: [PATCH 06/17] feat: update build index --- .../mlkit/data/datasets/meta_tiled_slides.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 3e91bff..ea54532 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -76,15 +76,30 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: if len(tiles) == 0: return {} + # 1. 
Grab the column directly from the underlying PyArrow Table slide_ids = tiles.data.column("slide_id") + num_rows = len(slide_ids) - # FIX 3: Group indices without sorting the underlying dataset. - # Converting the single PyArrow column to Pandas is extremely fast. - df = slide_ids.to_pandas(name="slide_id").to_frame() - df['idx'] = np.arange(len(df)) + # 2. Generate sequential row indices + # np.arange is used here because PyArrow can wrap it instantly with zero-copy overhead + row_indices = pa.array(np.arange(num_rows, dtype=np.int64)) - # Group by slide_id and aggregate the physical indices into lists - return df.groupby("slide_id")['idx'].apply(list).to_dict() + # 3. Combine them into a lightweight PyArrow Table + table = pa.Table.from_arrays( + [slide_ids, row_indices], + names=["slide_id", "idx"] + ) + + # 4. Perform the native Arrow groupby and aggregate + # The "list" function aggregates all indices for a given slide_id into a single Arrow List scalar + grouped = table.group_by("slide_id").aggregate([("idx", "list")]) + + # 5. 
Extract to a Python dictionary + # PyArrow automatically names the aggregated column "idx_list" (pattern: {column}_{agg_func}) + keys = grouped.column("slide_id").to_pylist() + values = grouped.column("idx_list").to_pylist() + + return dict(zip(keys, values, strict=True)) @abstractmethod def generate_datasets(self) -> Iterable[Dataset[T]]: From 0c93ea65ed97dc38003fce02128a1198e2dc2b68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 11:03:26 +0000 Subject: [PATCH 07/17] feat: chunk loading --- .../mlkit/data/datasets/meta_tiled_slides.py | 53 +++++++++++-------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index ea54532..622321b 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -2,15 +2,15 @@ from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import TypeVar, cast +from typing import TypeVar -import pyarrow.compute as pc +import numpy as np +import pyarrow as pa from datasets import Dataset as HFDataset from datasets import concatenate_datasets, load_dataset from mlflow.artifacts import download_artifacts from torch.utils.data import ConcatDataset, Dataset -import pyarrow as pa -import numpy as np + T = TypeVar("T", covariant=True) @@ -63,9 +63,7 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: """Creates a fast lookup table for slide indices. This function builds a mapping from `slide_id` to the range of indices in the - `tiles` dataset that correspond to that slide. It assumes that the `tiles` dataset - is sorted by `slide_id`, which allows for efficient retrieval of tile indices - for each slide without needing to scan the entire dataset for each slide. + `tiles` dataset that correspond to that slide. 
Args: tiles: A dataset containing a `slide_id` column, sorted by `slide_id`. @@ -86,8 +84,7 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: # 3. Combine them into a lightweight PyArrow Table table = pa.Table.from_arrays( - [slide_ids, row_indices], - names=["slide_id", "idx"] + [slide_ids, row_indices], names=["slide_id", "idx"] ) # 4. Perform the native Arrow groupby and aggregate @@ -147,7 +144,7 @@ def load_slides_and_tiles( paths: List of directories to load slides and tiles from. Each directory must include two files: `slides.parquet` and tiles.parquet`. uris: List of MLFlow artifact URIs pointing to folders containing - `slides.parquet` and `tiles.parquet`. + `slides.parquet` and `tiles.parquet` or chunks. Raises: FileNotFoundError: If the data cannot be loaded from the specified URIs. @@ -163,26 +160,36 @@ def load_slides_and_tiles( search_dirs = [Path(p) for p in (*paths, *artifacts_paths)] - # Extract existing file paths - slide_files = [ - str(s) for p in search_dirs if (s := p / "slides.parquet").exists() - ] - tile_files = [ - str(t) for p in search_dirs if (t := p / "tiles.parquet").exists() - ] + def resolve_search_path(partition: str) -> list[dict[str, str]]: + return [ + {"data_dir": str(path / partition)} + if (path / partition).is_dir() + else {"data_files": str(path / f"{partition}.parquet")} + for path in search_dirs + ] - # Handle empty datasets - if not (slide_files and tile_files): - return HFDataset.from_dict({}), HFDataset.from_dict({}) + slide_files = MetaTiledSlides.resolve_search_path(search_dirs, "slides") + tile_files = MetaTiledSlides.resolve_search_path(search_dirs, "tiles") try: # Load datasets with memory mapping (lazy) loader_kwargs = {"path": "parquet", "split": "train"} - slides_ds = load_dataset(**loader_kwargs, data_files=slide_files) # pyright: ignore[reportArgumentType, reportCallIssue] - tiles_ds = load_dataset(**loader_kwargs, data_files=tile_files) # pyright: ignore[reportArgumentType, reportCallIssue] + 
slides_ds = concatenate_datasets( + [ + load_dataset(**loader_kwargs, **datasource) + for datasource in slide_files + ] + ) + + tiles_ds = concatenate_datasets( + [ + load_dataset(**loader_kwargs, **datasource) + for datasource in tile_files + ] + ) - return cast("HFDataset", slides_ds), cast("HFDataset", tiles_ds) + return slides_ds, tiles_ds except Exception as e: msg = f"Failed to load Parquet files. Found {len(slide_files)} slides and {len(tile_files)} tiles." From a6db6c37d96f77fd3c3a37064c7d4732dbdc9264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 11:21:32 +0000 Subject: [PATCH 08/17] feat: fixes --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 622321b..8a08858 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -168,9 +168,6 @@ def resolve_search_path(partition: str) -> list[dict[str, str]]: for path in search_dirs ] - slide_files = MetaTiledSlides.resolve_search_path(search_dirs, "slides") - tile_files = MetaTiledSlides.resolve_search_path(search_dirs, "tiles") - try: # Load datasets with memory mapping (lazy) loader_kwargs = {"path": "parquet", "split": "train"} @@ -178,19 +175,19 @@ def resolve_search_path(partition: str) -> list[dict[str, str]]: slides_ds = concatenate_datasets( [ load_dataset(**loader_kwargs, **datasource) - for datasource in slide_files + for datasource in resolve_search_path("slides") ] ) tiles_ds = concatenate_datasets( [ load_dataset(**loader_kwargs, **datasource) - for datasource in tile_files + for datasource in resolve_search_path("tiles") ] ) return slides_ds, tiles_ds except Exception as e: - msg = f"Failed to load Parquet files. Found {len(slide_files)} slides and {len(tile_files)} tiles." 
+ msg = "Failed to load Parquet files." raise FileNotFoundError(msg) from e From e50c256ce42843f61f1b080ab957e3f4f30eb030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 12:45:06 +0000 Subject: [PATCH 09/17] feat: type conversion --- .../mlkit/data/datasets/meta_tiled_slides.py | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 8a08858..ebbbd78 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -66,7 +66,7 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: `tiles` dataset that correspond to that slide. Args: - tiles: A dataset containing a `slide_id` column, sorted by `slide_id`. + tiles: A dataset containing a `slide_id` column. Returns: A dictionary mapping each `slide_id` to a range of indices in the `tiles` dataset. @@ -78,25 +78,32 @@ def _build_tile_index(tiles: HFDataset) -> dict[str, range]: slide_ids = tiles.data.column("slide_id") num_rows = len(slide_ids) - # 2. Generate sequential row indices + # 2. Handle the "Large" type conversion + current_type = slide_ids.type + if pa.types.is_string(current_type): + slide_ids = slide_ids.cast(pa.large_string()) + elif pa.types.is_binary(current_type): + slide_ids = slide_ids.cast(pa.large_binary()) + + # 3. Generate sequential row indices # np.arange is used here because PyArrow can wrap it instantly with zero-copy overhead row_indices = pa.array(np.arange(num_rows, dtype=np.int64)) - # 3. Combine them into a lightweight PyArrow Table + # 4. Combine them into a lightweight PyArrow Table table = pa.Table.from_arrays( [slide_ids, row_indices], names=["slide_id", "idx"] ) - # 4. Perform the native Arrow groupby and aggregate + # 5. 
Perform the native Arrow groupby and aggregate # The "list" function aggregates all indices for a given slide_id into a single Arrow List scalar grouped = table.group_by("slide_id").aggregate([("idx", "list")]) - # 5. Extract to a Python dictionary - # PyArrow automatically names the aggregated column "idx_list" (pattern: {column}_{agg_func}) + # 6. Extract keys to Python, but KEEP values as PyArrow ListScalars keys = grouped.column("slide_id").to_pylist() - values = grouped.column("idx_list").to_pylist() + values_array = grouped.column("idx_list") - return dict(zip(keys, values, strict=True)) + # Map the string key to the PyArrow ListScalar + return {key: values_array[i] for i, key in enumerate(keys)} @abstractmethod def generate_datasets(self) -> Iterable[Dataset[T]]: @@ -131,8 +138,9 @@ def filter_tiles_by_slide(self, slide_id: str) -> HFDataset: Returns: A view of the tiles dataset containing only the tiles for the specified slide. """ - tile_range = self._slide_id_to_indices.get(slide_id, range(0)) - return self.tiles.select(tile_range) + # We consruct it only once + tile_indices = self._slide_id_to_indices.get(slide_id, pa.scalar([])) + return self.tiles.select(tile_indices.as_py()) @staticmethod def load_slides_and_tiles( @@ -160,6 +168,10 @@ def load_slides_and_tiles( search_dirs = [Path(p) for p in (*paths, *artifacts_paths)] + # Handle empty datasets + if not len(search_dirs): + return HFDataset.from_dict({}), HFDataset.from_dict({}) + def resolve_search_path(partition: str) -> list[dict[str, str]]: return [ {"data_dir": str(path / partition)} From 6ad4c4ce4c2b50bd78c96ec878b3f479199d0738 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 13:15:04 +0000 Subject: [PATCH 10/17] feat: hf kwargs --- .../mlkit/data/datasets/meta_tiled_slides.py | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py 
b/rationai/mlkit/data/datasets/meta_tiled_slides.py index ebbbd78..43c6426 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -2,7 +2,7 @@ from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import TypeVar +from typing import Any, TypeVar import numpy as np import pyarrow as pa @@ -32,21 +32,31 @@ def __init__( paths: Iterable[Path | str] | None = None, uris: Iterable[str] | None = None, slides_and_tiles: tuple[HFDataset, HFDataset] | None = None, + hf_kwargs: dict[str, Any] | None = None ) -> None: """Load slides and tiles from MLFlow artifacts. Args: paths: List of directories to load slides and tiles from. Each - directory must include two files: `slides.parquet` and tiles.parquet`. + directory must include either single files (`slides.parquet` + and `tiles.parquet`) or subdirectories (`slides/` and `tiles/`) + containing chunked Parquet files. uris: List of MLFlow artifact URIs pointing to folders containing - `slides.parquet` and `tiles.parquet`. + either single files (`slides.parquet` and `tiles.parquet`) or + subdirectories (`slides/` and `tiles/`) containing chunked + Parquet files. slides_and_tiles: Tuple containing the slides and tiles Datasets. + hf_kwargs: Additional keyword arguments to pass to HuggingFace's + `load_dataset` function. Defaults to `{"path": "parquet", "split": "train"}`. """ assert paths or uris or slides_and_tiles, ( "At least one of paths, uris or slides_and_tiles must be provided." 
) - slides, tiles = self.load_slides_and_tiles(paths or [], uris or []) + if hf_kwargs is None: + hf_kwargs = {"path": "parquet", "split": "train"} + + slides, tiles = self.load_slides_and_tiles(paths or [], uris or [], hf_kwargs) if slides_and_tiles is not None: slides = concatenate_datasets([slides, slides_and_tiles[0]]) @@ -144,15 +154,21 @@ def filter_tiles_by_slide(self, slide_id: str) -> HFDataset: @staticmethod def load_slides_and_tiles( - paths: Iterable[str | Path], uris: Iterable[str] + paths: Iterable[str | Path], uris: Iterable[str], hf_kwargs: dict[str, Any] ) -> tuple[HFDataset, HFDataset]: """Load slides and tiles parquets from local storage and MLFlow artifacts. Args: paths: List of directories to load slides and tiles from. Each - directory must include two files: `slides.parquet` and tiles.parquet`. + directory must include either single files (`slides.parquet` + and `tiles.parquet`) or subdirectories (`slides/` and `tiles/`) + containing chunked Parquet files. uris: List of MLFlow artifact URIs pointing to folders containing - `slides.parquet` and `tiles.parquet` or chunks. + either single files (`slides.parquet` and `tiles.parquet`) or + subdirectories (`slides/` and `tiles/`) containing chunked + Parquet files. + hf_kwargs: Additional keyword arguments to pass to HuggingFace's + `load_dataset` function. Raises: FileNotFoundError: If the data cannot be loaded from the specified URIs. 
@@ -181,19 +197,16 @@ def resolve_search_path(partition: str) -> list[dict[str, str]]: ] try: - # Load datasets with memory mapping (lazy) - loader_kwargs = {"path": "parquet", "split": "train"} - slides_ds = concatenate_datasets( [ - load_dataset(**loader_kwargs, **datasource) + load_dataset(**hf_kwargs, **datasource) for datasource in resolve_search_path("slides") ] ) tiles_ds = concatenate_datasets( [ - load_dataset(**loader_kwargs, **datasource) + load_dataset(**hf_kwargs, **datasource) for datasource in resolve_search_path("tiles") ] ) From cd3426587befb009bb3a7932ac32334b4b14cc11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 13:17:58 +0000 Subject: [PATCH 11/17] feat: ruff fix --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 43c6426..1f3324e 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -32,7 +32,7 @@ def __init__( paths: Iterable[Path | str] | None = None, uris: Iterable[str] | None = None, slides_and_tiles: tuple[HFDataset, HFDataset] | None = None, - hf_kwargs: dict[str, Any] | None = None + hf_kwargs: dict[str, Any] | None = None, ) -> None: """Load slides and tiles from MLFlow artifacts. 
From 16637a7b66607a6ca768194a717771caa06529f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 14:22:39 +0000 Subject: [PATCH 12/17] feat: fixes --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 1f3324e..ec2edde 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -69,17 +69,17 @@ def __init__( super().__init__(self.generate_datasets()) @staticmethod - def _build_tile_index(tiles: HFDataset) -> dict[str, range]: + def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: """Creates a fast lookup table for slide indices. - This function builds a mapping from `slide_id` to the range of indices in the + This function builds a mapping from `slide_id` to the list of indices in the `tiles` dataset that correspond to that slide. Args: tiles: A dataset containing a `slide_id` column. Returns: - A dictionary mapping each `slide_id` to a range of indices in the `tiles` dataset. + A dictionary mapping each `slide_id` to a list of indices in the `tiles` dataset. """ if len(tiles) == 0: return {} @@ -134,7 +134,7 @@ def generate_datasets(self) -> Iterable[Dataset[T]]: ``` """ - def filter_tiles_by_slide(self, slide_id: str) -> HFDataset: + def filter_tiles_by_slide(self, slide_id: str | bytes) -> HFDataset: """Returns a view of the dataset using a slice or indices. This function creates a view of the `self.tiles` dataset that contains only @@ -148,7 +148,6 @@ def filter_tiles_by_slide(self, slide_id: str) -> HFDataset: Returns: A view of the tiles dataset containing only the tiles for the specified slide. 
""" - # We consruct it only once tile_indices = self._slide_id_to_indices.get(slide_id, pa.scalar([])) return self.tiles.select(tile_indices.as_py()) From c7e11cfbd4e5fc4036ca0827af67390517c0208e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 14:30:57 +0000 Subject: [PATCH 13/17] feat: py list to numpy --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index ec2edde..b5642a9 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -149,7 +149,7 @@ def filter_tiles_by_slide(self, slide_id: str | bytes) -> HFDataset: A view of the tiles dataset containing only the tiles for the specified slide. """ tile_indices = self._slide_id_to_indices.get(slide_id, pa.scalar([])) - return self.tiles.select(tile_indices.as_py()) + return self.tiles.select(tile_indices.to_numpy()) @staticmethod def load_slides_and_tiles( From fa0697db94ee7b14c2ca6cb984075a83b3afc609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 14:32:11 +0000 Subject: [PATCH 14/17] feat: to numpy --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index b5642a9..798e396 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -109,7 +109,7 @@ def _build_tile_index(tiles: HFDataset) -> dict[str | bytes, pa.ListScalar]: grouped = table.group_by("slide_id").aggregate([("idx", "list")]) # 6. 
Extract keys to Python, but KEEP values as PyArrow ListScalars - keys = grouped.column("slide_id").to_pylist() + keys = grouped.column("slide_id").to_numpy() values_array = grouped.column("idx_list") # Map the string key to the PyArrow ListScalar From 8943c0bf77a834531584df9112b943ff32c83859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 15:15:42 +0000 Subject: [PATCH 15/17] feat: fix error --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 798e396..4b8ac7e 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -148,8 +148,8 @@ def filter_tiles_by_slide(self, slide_id: str | bytes) -> HFDataset: Returns: A view of the tiles dataset containing only the tiles for the specified slide. """ - tile_indices = self._slide_id_to_indices.get(slide_id, pa.scalar([])) - return self.tiles.select(tile_indices.to_numpy()) + tile_indices = self._slide_id_to_indices.get(slide_id, pa.scalar([], type=pa.list_(pa.int64()))) + return self.tiles.select(tile_indices.values.to_numpy()) @staticmethod def load_slides_and_tiles( From 7e49e87b6151684bf6e463ea2a388316499f843f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 15:18:33 +0000 Subject: [PATCH 16/17] feat: format --- rationai/mlkit/data/datasets/meta_tiled_slides.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rationai/mlkit/data/datasets/meta_tiled_slides.py b/rationai/mlkit/data/datasets/meta_tiled_slides.py index 4b8ac7e..8437e41 100644 --- a/rationai/mlkit/data/datasets/meta_tiled_slides.py +++ b/rationai/mlkit/data/datasets/meta_tiled_slides.py @@ -148,7 +148,9 @@ def filter_tiles_by_slide(self, slide_id: str | bytes) -> HFDataset: Returns: A 
view of the tiles dataset containing only the tiles for the specified slide. """ - tile_indices = self._slide_id_to_indices.get(slide_id, pa.scalar([], type=pa.list_(pa.int64()))) + tile_indices = self._slide_id_to_indices.get( + slide_id, pa.scalar([], type=pa.list_(pa.int64())) + ) return self.tiles.select(tile_indices.values.to_numpy()) @staticmethod From e6131aab3c40b5027bfc9163cdeb628585d26369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz> Date: Mon, 13 Apr 2026 15:28:36 +0000 Subject: [PATCH 17/17] feat: version update --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9c9a4b3..8124b29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "rationai-mlkit" -version = "0.4.0" +version = "0.4.1" description = "" authors = [ { name = "Matěj Pekár", email = "matejpekar@mail.muni.cz" },