nauticalab · kurodo3 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py
@@ -40,3 +40,5 @@
     "streams",
     "types",
 ]
+
+
diff --git a/src/orcapod/core/data_function.py b/src/orcapod/core/data_function.py
@@ -4,6 +4,7 @@
 import logging
 import re
 import sys
+import uuid
 from abc import abstractmethod
 from collections.abc import Callable, Iterable, Sequence
 import typing
@@ -549,13 +550,13 @@ def combine(*components: tuple[str, ...]) -> str:
             inner_parsed = [":".join(component) for component in components]
             return "::".join(inner_parsed)
 
-        record_id = str(uuid7())
-        source_info = {k: combine(self.uri, (record_id,), (k,)) for k in output_data}
+        new_record_uuid = uuid.UUID(bytes=uuid7().bytes)
+        source_info = {k: combine(self.uri, (new_record_uuid.hex,), (k,)) for k in output_data}
 
         return Data(
             output_data,
             source_info=source_info,
-            record_id=record_id,
+            record_uuid=new_record_uuid,
             python_schema=self.output_data_schema,
             data_context=self.data_context,
         )

diff --git a/src/orcapod/core/datagrams/datagram.py b/src/orcapod/core/datagrams/datagram.py
@@ -20,6 +20,7 @@
 from __future__ import annotations
 
 import logging
+import uuid
 from collections.abc import Collection, Iterator, Mapping
 from typing import TYPE_CHECKING, Any, Self, cast
 
@@ -66,18 +67,18 @@ def __init__(
         data: Mapping[str, DataValue] | pa.Table | pa.RecordBatch,
         python_schema: SchemaLike | None = None,
         meta_info: Mapping[str, DataValue] | None = None,
-        record_id: str | None = None,
+        record_uuid: uuid.UUID | None = None,
         data_context: str | contexts.DataContext | None = None,
         config: OrcapodConfig | None = None,
     ) -> None:
         if isinstance(data, pa.RecordBatch):
             data = pa.Table.from_batches([data])
 
         if isinstance(data, pa.Table):
-            self._init_from_table(data, meta_info, data_context, record_id)
+            self._init_from_table(data, meta_info, data_context, record_uuid)
         else:
             self._init_from_dict(
-                data, python_schema, meta_info, data_context, record_id
+                data, python_schema, meta_info, data_context, record_uuid
             )
 
     def _init_from_dict(
@@ -86,7 +87,7 @@ def _init_from_dict(
         python_schema: SchemaLike | None,
         meta_info: Mapping[str, DataValue] | None,
         data_context: str | contexts.DataContext | None,
-        record_id: str | None,
+        record_uuid: uuid.UUID | None,
     ) -> None:
         data_columns: dict[str, DataValue] = {}
         meta_columns: dict[str, DataValue] = {}
@@ -102,7 +103,7 @@ def _init_from_dict(
                 data_columns[k] = v
 
         super().__init__(data_context=data_context or extracted_context)
-        self._datagram_id = record_id
+        self._datagram_uuid = record_uuid
 
         self._data_dict: dict[str, DataValue] | None = data_columns
         self._data_table: pa.Table | None = None
@@ -134,7 +135,7 @@ def _init_from_table(
         table: pa.Table,
         meta_info: Mapping[str, DataValue] | None,
         data_context: str | contexts.DataContext | None,
-        record_id: str | None,
+        record_uuid: uuid.UUID | None,
     ) -> None:
         if len(table) != 1:
             raise ValueError(
@@ -150,7 +151,7 @@ def _init_from_table(
         context_cols = [c for c in table.column_names if c == constants.CONTEXT_KEY]
 
         super().__init__(data_context=data_context)
-        self._datagram_id = record_id
+        self._datagram_uuid = record_uuid
 
         meta_col_names = [
             c for c in table.column_names if c.startswith(constants.META_PREFIX)
@@ -426,11 +427,19 @@ def identity_structure(self) -> Any:
         return self._ensure_data_table()
 
     @property
-    def datagram_id(self) -> str:
-        """Return (or lazily generate) the datagram's unique ID."""
-        if self._datagram_id is None:
-            self._datagram_id = str(uuid7())
-        return self._datagram_id
+    def datagram_uuid(self) -> uuid.UUID:
+        """Return (or lazily generate) the datagram's unique UUID."""
+        if self._datagram_uuid is None:
+            # uuid_utils is used here because it provides UUIDv7 (time-ordered,
+            # monotonic) generation, which is not available in the stdlib before
+            # Python 3.14.  However, uuid_utils.UUID and stdlib uuid.UUID are
+            # distinct types that do not compare equal even for identical bit
+            # patterns.  We therefore normalise to stdlib uuid.UUID immediately
+            # so that the public API always returns a consistent type and
+            # equality / hashing work correctly regardless of how a UUID was
+            # originally produced.
+            self._datagram_uuid = uuid.UUID(bytes=uuid7().bytes)
+        return self._datagram_uuid
 
     @property
     def converter(self) -> TypeConverterProtocol:
@@ -763,7 +772,7 @@ def copy(self, include_cache: bool = True, preserve_id: bool = True) -> Self:
         new_d._cached_int_hash = None
 
         # Datagram identity
-        new_d._datagram_id = self._datagram_id if preserve_id else None
+        new_d._datagram_uuid = self._datagram_uuid if preserve_id else None
 
         # Data representations — Arrow table is immutable so a ref copy is fine
         new_d._data_table = self._data_table

diff --git a/src/orcapod/core/datagrams/tag_data.py b/src/orcapod/core/datagrams/tag_data.py
@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 import logging
+import uuid
 from collections.abc import Mapping
 from typing import TYPE_CHECKING, Any, Self
 
@@ -61,7 +62,7 @@ def __init__(
         meta_info: "Mapping[str, DataValue] | None" = None,
         python_schema: "SchemaLike | None" = None,
         data_context: "str | contexts.DataContext | None" = None,
-        record_id: "str | None" = None,
+        record_uuid: "uuid.UUID | None" = None,
         **kwargs,
     ) -> None:
         import pyarrow as _pa
@@ -78,7 +79,7 @@ def __init__(
                 data,
                 meta_info=meta_info,
                 data_context=data_context,
-                record_id=record_id,
+                record_uuid=record_uuid,
                 **kwargs,
             )
             sys_tag_cols = [
@@ -114,7 +115,7 @@ def __init__(
                 python_schema=python_schema,
                 meta_info=meta_info,
                 data_context=data_context,
-                record_id=record_id,
+                record_uuid=record_uuid,
                 **kwargs,
             )
 
@@ -256,7 +257,7 @@ def __init__(
         source_info: "Mapping[str, str | None] | None" = None,
         python_schema: "SchemaLike | None" = None,
         data_context: "str | contexts.DataContext | None" = None,
-        record_id: "str | None" = None,
+        record_uuid: "uuid.UUID | None" = None,
         **kwargs,
     ) -> None:
         import pyarrow as _pa
@@ -287,7 +288,7 @@ def __init__(
                 data_table,
                 meta_info=meta_info,
                 data_context=data_context,
-                record_id=record_id,
+                record_uuid=record_uuid,
                 **kwargs,
             )
             si_table = prefixed_tables[constants.SOURCE_PREFIX]
@@ -315,7 +316,7 @@ def __init__(
                 python_schema=python_schema,
                 meta_info=meta_info,
                 data_context=data_context,
-                record_id=record_id,
+                record_uuid=record_uuid,
                 **kwargs,
             )
             self._source_info = {**contained_source_info, **(source_info or {})}

diff --git a/src/orcapod/core/nodes/function_node.py b/src/orcapod/core/nodes/function_node.py
@@ -18,6 +18,7 @@
 
 import asyncio
 import logging
+import uuid
 from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any, Literal, NamedTuple, cast
 
@@ -696,7 +697,7 @@ def __init__(
 
         # stream-level caching state
         self._cached_output_datas: dict[
-            str, tuple[TagProtocol, DataProtocol | None]
+            bytes, tuple[TagProtocol, DataProtocol | None]
         ] = {}
         self._cached_output_table: "pa.Table | None" = None
         self._cached_content_hash_column: "pa.Array | None" = None
@@ -1042,7 +1043,7 @@ def execute(
         ctx_obs.on_node_start(node_label, node_hash, tag_schema=tag_schema)
 
         # Collect upstream entries and resolve entry_ids
-        upstream_entries: list[tuple[TagProtocol, DataProtocol, str]] = [
+        upstream_entries: list[tuple[TagProtocol, DataProtocol, bytes]] = [
             (tag, data, self.compute_pipeline_entry_id(tag, data))
             for tag, data in input_stream.iter_data()
         ]
@@ -1125,7 +1126,7 @@ def _process_data_internal(
                 self.add_pipeline_record(
                     tag,
                     data,
-                    data_record_id=output_data.datagram_id,
+                    data_record_id=output_data.datagram_uuid,
                     computed=result_computed,
                 )
         else:
@@ -1142,8 +1143,8 @@ def _process_data_internal(
         return tag_out, output_data
 
     def get_cached_results(
-        self, entry_ids: list[str]
-    ) -> dict[str, tuple[TagProtocol, DataProtocol]]:
+        self, entry_ids: list[bytes]
+    ) -> dict[bytes, tuple[TagProtocol, DataProtocol]]:
         """Public cache façade: return already-computed results for the given entry IDs.
 
         Serves hits directly from the in-memory cache (``_cached_output_datas``).
@@ -1213,7 +1214,7 @@ async def _async_process_data_internal(
                 self.add_pipeline_record(
                     tag,
                     data,
-                    data_record_id=output_data.datagram_id,
+                    data_record_id=output_data.datagram_uuid,
                     computed=result_computed,
                 )
         else:
@@ -1233,7 +1234,7 @@ async def _async_process_data_internal(
 
     def compute_pipeline_entry_id(
         self, tag: TagProtocol, input_data: DataProtocol
-    ) -> str:
+    ) -> bytes:
         """Compute a unique pipeline entry ID from tag + system tags + input data hash.
 
         ``NODE_CONTENT_HASH_COL`` is always included so that two runs processing
@@ -1246,8 +1247,9 @@ def compute_pipeline_entry_id(
             input_data: The input data.
 
         Returns:
-            A hash string uniquely identifying this (tag, input_data, node run)
-            combination.
+            Method-prefixed raw bytes (``b"{method}:{digest}"``) uniquely
+            identifying this (tag, input_data, node run) combination.  Suitable
+            for storage in a ``pa.large_binary()`` column.
         """
         tag_with_hash = (
             tag.as_table(columns={"system_tags": True})
@@ -1260,13 +1262,13 @@ def compute_pipeline_entry_id(
                 pa.array([self.content_hash().to_string()], type=pa.large_string()),
             )
         )
-        return self.data_context.arrow_hasher.hash_table(tag_with_hash).to_string()
+        return self.data_context.arrow_hasher.hash_table(tag_with_hash).to_prefixed_digest()
 
     def add_pipeline_record(
         self,
         tag: TagProtocol,
         input_data: DataProtocol,
-        data_record_id: str,
+        data_record_id: uuid.UUID,
         computed: bool,
         skip_cache_lookup: bool = False,
     ) -> None:
@@ -1309,7 +1311,7 @@ def add_pipeline_record(
         meta_table = pa.table(
             {
                 constants.DATA_RECORD_ID: pa.array(
-                    [data_record_id], type=pa.large_string()
+                    [data_record_id.bytes], type=pa.large_binary()
                 ),
                 constants.NODE_CONTENT_HASH_COL: pa.array(
                     [self.content_hash().to_string()], type=pa.large_string()
@@ -1435,7 +1437,7 @@ def as_source(self):
 
     def _fetch_joined_records(
         self,
-        entry_ids: list[str] | None = None,
+        entry_ids: list[bytes] | None = None,
     ) -> _JoinedRecords | None:
         """Internal primitive: fetch both DBs, content-hash-filter, and inner-join.
 
@@ -1503,8 +1505,8 @@ def _fetch_joined_records(
 
     def _load_cached_entries(
         self,
-        entry_ids: list[str] | None = None,
-    ) -> "dict[str, tuple[TagProtocol, DataProtocol]]":
+        entry_ids: list[bytes] | None = None,
+    ) -> "dict[bytes, tuple[TagProtocol, DataProtocol]]":
         """DB loader: fetch ``(tag, data)`` pairs from the pipeline and result databases.
 
         Calls ``_fetch_joined_records`` to obtain the raw joined table, then
@@ -1565,7 +1567,7 @@ def _load_cached_entries(
         data_table = joined.drop([c for c in drop_cols if c in joined.column_names])
         stream = ArrowTableStream(data_table, tag_columns=tag_keys)
 
-        loaded: dict[str, tuple[TagProtocol, DataProtocol]] = {}
+        loaded: dict[bytes, tuple[TagProtocol, DataProtocol]] = {}
         for eid, (tag, data) in zip(entry_ids_col, stream.iter_data()):
             loaded[eid] = (tag, data)
         return loaded
@@ -1758,7 +1760,7 @@ async def async_execute(
                 if loaded:
                     self._cached_output_table = None
                     self._cached_content_hash_column = None
-                cached_by_entry_id: dict[str, tuple[TagProtocol, DataProtocol]] = dict(loaded)
+                cached_by_entry_id: dict[bytes, tuple[TagProtocol, DataProtocol]] = dict(loaded)
 
                 # Phase 2: drive output from input channel — cached or compute
                 async def _process_one_db(

diff --git a/src/orcapod/core/operators/join.py b/src/orcapod/core/operators/join.py
@@ -101,13 +101,13 @@ def _predict_system_tag_schema(self, *streams: StreamProtocol) -> Schema:
         system_tag_fields: dict[str, type] = {}
         for idx, stream in enumerate(ordered_streams):
             stream_tag_schema, _ = stream.output_schema(columns={"system_tags": True})
-            for col_name in stream_tag_schema:
+            for col_name, col_type in stream_tag_schema.items():
                 if col_name.startswith(constants.SYSTEM_TAG_PREFIX):
                     new_name = (
                         f"{col_name}{constants.BLOCK_SEPARATOR}"
                         f"{stream.pipeline_hash().to_hex(n_char)}:{idx}"
                     )
-                    system_tag_fields[new_name] = str
+                    system_tag_fields[new_name] = col_type
         return Schema(system_tag_fields)
 
     def static_process(self, *streams: StreamProtocol) -> StreamProtocol:
@@ -699,7 +699,7 @@ def _sort_merged_system_tags(merged_sys: dict) -> dict:
                 sid_val = merged_sys.get(fmap.get(sid_field, ""))
                 rid_val = merged_sys.get(fmap.get(rid_field, ""))
                 vals = {ft: merged_sys[k] for ft, k in fmap.items()}
-                entries.append(((sid_val or "", rid_val or ""), vals))
+                entries.append(((sid_val or "", rid_val or b""), vals))
 
             entries.sort(key=lambda e: e[0])
 

diff --git a/src/orcapod/core/operators/merge_join.py b/src/orcapod/core/operators/merge_join.py
@@ -144,13 +144,13 @@ def _predict_system_tag_schema(
         for stream, orig_idx in canonical:
             canon_pos = canonical.index((stream, orig_idx))
             stream_tag_schema, _ = stream.output_schema(columns={"system_tags": True})
-            for col_name in stream_tag_schema:
+            for col_name, col_type in stream_tag_schema.items():
                 if col_name.startswith(constants.SYSTEM_TAG_PREFIX):
                     new_name = (
                         f"{col_name}{constants.BLOCK_SEPARATOR}"
                         f"{stream.pipeline_hash().to_hex(n_char)}:{canon_pos}"
                     )
-                    system_tag_fields[new_name] = str
+                    system_tag_fields[new_name] = col_type
         return Schema(system_tag_fields)
 
     def binary_static_process(