diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index f8fe34dd..9d30caa7 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -40,3 +40,5 @@ "streams", "types", ] + + diff --git a/src/orcapod/core/data_function.py b/src/orcapod/core/data_function.py index 4e3a7c64..73575d1f 100644 --- a/src/orcapod/core/data_function.py +++ b/src/orcapod/core/data_function.py @@ -4,6 +4,7 @@ import logging import re import sys +import uuid from abc import abstractmethod from collections.abc import Callable, Iterable, Sequence import typing @@ -549,13 +550,13 @@ def combine(*components: tuple[str, ...]) -> str: inner_parsed = [":".join(component) for component in components] return "::".join(inner_parsed) - record_id = str(uuid7()) - source_info = {k: combine(self.uri, (record_id,), (k,)) for k in output_data} + new_record_uuid = uuid.UUID(bytes=uuid7().bytes) + source_info = {k: combine(self.uri, (new_record_uuid.hex,), (k,)) for k in output_data} return Data( output_data, source_info=source_info, - record_id=record_id, + record_uuid=new_record_uuid, python_schema=self.output_data_schema, data_context=self.data_context, ) diff --git a/src/orcapod/core/datagrams/datagram.py b/src/orcapod/core/datagrams/datagram.py index 77a0c679..8fa2b48b 100644 --- a/src/orcapod/core/datagrams/datagram.py +++ b/src/orcapod/core/datagrams/datagram.py @@ -20,6 +20,7 @@ from __future__ import annotations import logging +import uuid from collections.abc import Collection, Iterator, Mapping from typing import TYPE_CHECKING, Any, Self, cast @@ -66,7 +67,7 @@ def __init__( data: Mapping[str, DataValue] | pa.Table | pa.RecordBatch, python_schema: SchemaLike | None = None, meta_info: Mapping[str, DataValue] | None = None, - record_id: str | None = None, + record_uuid: uuid.UUID | None = None, data_context: str | contexts.DataContext | None = None, config: OrcapodConfig | None = None, ) -> None: @@ -74,10 +75,10 @@ def __init__( data = pa.Table.from_batches([data]) if isinstance(data, pa.Table): - self._init_from_table(data, meta_info, data_context, record_id) + self._init_from_table(data, meta_info, data_context, record_uuid) else: self._init_from_dict( - data, python_schema, meta_info, data_context, record_id + data, python_schema, meta_info, data_context, record_uuid ) def _init_from_dict( @@ -86,7 +87,7 @@ def _init_from_dict( python_schema: SchemaLike | None, meta_info: Mapping[str, DataValue] | None, data_context: str | contexts.DataContext | None, - record_id: str | None, + record_uuid: uuid.UUID | None, ) -> None: data_columns: dict[str, DataValue] = {} meta_columns: dict[str, DataValue] = {} @@ -102,7 +103,7 @@ def _init_from_dict( data_columns[k] = v super().__init__(data_context=data_context or extracted_context) - self._datagram_id = record_id + self._datagram_uuid = record_uuid self._data_dict: dict[str, DataValue] | None = data_columns self._data_table: pa.Table | None = None @@ -134,7 +135,7 @@ def _init_from_table( table: pa.Table, meta_info: Mapping[str, DataValue] | None, data_context: str | contexts.DataContext | None, - record_id: str | None, + record_uuid: uuid.UUID | None, ) -> None: if len(table) != 1: raise ValueError( @@ -150,7 +151,7 @@ def _init_from_table( context_cols = [c for c in table.column_names if c == constants.CONTEXT_KEY] super().__init__(data_context=data_context) - self._datagram_id = record_id + self._datagram_uuid = record_uuid meta_col_names = [ c for c in table.column_names if c.startswith(constants.META_PREFIX) @@ -426,11 +427,19 @@ def identity_structure(self) -> Any: return self._ensure_data_table() @property - def datagram_id(self) -> str: - """Return (or lazily generate) the datagram's unique ID.""" - if self._datagram_id is None: - self._datagram_id = str(uuid7()) - return self._datagram_id + def datagram_uuid(self) -> uuid.UUID: + """Return (or lazily generate) the datagram's unique UUID.""" + if self._datagram_uuid is None: + # uuid_utils is used here because it provides UUIDv7 (time-ordered, + # monotonic) generation, which is not available in the stdlib before + # Python 3.12. However, uuid_utils.UUID and stdlib uuid.UUID are + # distinct types that do not compare equal even for identical bit + # patterns. We therefore normalise to stdlib uuid.UUID immediately + # so that the public API always returns a consistent type and + # equality / hashing work correctly regardless of how a UUID was + # originally produced. + self._datagram_uuid = uuid.UUID(bytes=uuid7().bytes) + return self._datagram_uuid @property def converter(self) -> TypeConverterProtocol: @@ -763,7 +772,7 @@ def copy(self, include_cache: bool = True, preserve_id: bool = True) -> Self: new_d._cached_int_hash = None # Datagram identity - new_d._datagram_id = self._datagram_id if preserve_id else None + new_d._datagram_uuid = self._datagram_uuid if preserve_id else None # Data representations — Arrow table is immutable so a ref copy is fine new_d._data_table = self._data_table diff --git a/src/orcapod/core/datagrams/tag_data.py b/src/orcapod/core/datagrams/tag_data.py index f6d0c4c5..e14be9fa 100644 --- a/src/orcapod/core/datagrams/tag_data.py +++ b/src/orcapod/core/datagrams/tag_data.py @@ -17,6 +17,7 @@ from __future__ import annotations import logging +import uuid from collections.abc import Mapping from typing import TYPE_CHECKING, Any, Self @@ -61,7 +62,7 @@ def __init__( meta_info: "Mapping[str, DataValue] | None" = None, python_schema: "SchemaLike | None" = None, data_context: "str | contexts.DataContext | None" = None, - record_id: "str | None" = None, + record_uuid: "uuid.UUID | None" = None, **kwargs, ) -> None: import pyarrow as _pa @@ -78,7 +79,7 @@ def __init__( data, meta_info=meta_info, data_context=data_context, - record_id=record_id, + record_uuid=record_uuid, **kwargs, ) sys_tag_cols = [ @@ -114,7 +115,7 @@ def __init__( python_schema=python_schema, meta_info=meta_info, data_context=data_context, - record_id=record_id, + record_uuid=record_uuid, **kwargs, ) @@ -256,7 +257,7 @@ def __init__( source_info: "Mapping[str, str | None] | None" = None, python_schema: "SchemaLike | None" = None, data_context: "str | contexts.DataContext | None" = None, - record_id: "str | None" = None, + record_uuid: "uuid.UUID | None" = None, **kwargs, ) -> None: import pyarrow as _pa @@ -287,7 +288,7 @@ def __init__( data_table, meta_info=meta_info, data_context=data_context, - record_id=record_id, + record_uuid=record_uuid, **kwargs, ) si_table = prefixed_tables[constants.SOURCE_PREFIX] @@ -315,7 +316,7 @@ def __init__( python_schema=python_schema, meta_info=meta_info, data_context=data_context, - record_id=record_id, + record_uuid=record_uuid, **kwargs, ) self._source_info = {**contained_source_info, **(source_info or {})} diff --git a/src/orcapod/core/nodes/function_node.py b/src/orcapod/core/nodes/function_node.py index 7f6f97dd..01865986 100644 --- a/src/orcapod/core/nodes/function_node.py +++ b/src/orcapod/core/nodes/function_node.py @@ -18,6 +18,7 @@ import asyncio import logging +import uuid from collections.abc import Iterator from typing import TYPE_CHECKING, Any, Literal, NamedTuple, cast @@ -696,7 +697,7 @@ def __init__( # stream-level caching state self._cached_output_datas: dict[ - str, tuple[TagProtocol, DataProtocol | None] + bytes, tuple[TagProtocol, DataProtocol | None] ] = {} self._cached_output_table: "pa.Table | None" = None self._cached_content_hash_column: "pa.Array | None" = None @@ -1042,7 +1043,7 @@ def execute( ctx_obs.on_node_start(node_label, node_hash, tag_schema=tag_schema) # Collect upstream entries and resolve entry_ids - upstream_entries: list[tuple[TagProtocol, DataProtocol, str]] = [ + upstream_entries: list[tuple[TagProtocol, DataProtocol, bytes]] = [ (tag, data, self.compute_pipeline_entry_id(tag, data)) for tag, data in input_stream.iter_data() ] @@ -1125,7 +1126,7 @@ def _process_data_internal( self.add_pipeline_record( tag, data, - data_record_id=output_data.datagram_id, + data_record_id=output_data.datagram_uuid, computed=result_computed, ) else: @@ -1142,8 +1143,8 @@ def _process_data_internal( return tag_out, output_data def get_cached_results( - self, entry_ids: list[str] - ) -> dict[str, tuple[TagProtocol, DataProtocol]]: + self, entry_ids: list[bytes] + ) -> dict[bytes, tuple[TagProtocol, DataProtocol]]: """Public cache façade: return already-computed results for the given entry IDs. Serves hits directly from the in-memory cache (``_cached_output_datas``). @@ -1213,7 +1214,7 @@ async def _async_process_data_internal( self.add_pipeline_record( tag, data, - data_record_id=output_data.datagram_id, + data_record_id=output_data.datagram_uuid, computed=result_computed, ) else: @@ -1233,7 +1234,7 @@ async def _async_process_data_internal( def compute_pipeline_entry_id( self, tag: TagProtocol, input_data: DataProtocol - ) -> str: + ) -> bytes: """Compute a unique pipeline entry ID from tag + system tags + input data hash. ``NODE_CONTENT_HASH_COL`` is always included so that two runs processing @@ -1246,8 +1247,9 @@ def compute_pipeline_entry_id( input_data: The input data. Returns: - A hash string uniquely identifying this (tag, input_data, node run) - combination. + Method-prefixed raw bytes (``b"{method}:{digest}"``) uniquely + identifying this (tag, input_data, node run) combination. Suitable + for storage in a ``pa.large_binary()`` column. """ tag_with_hash = ( tag.as_table(columns={"system_tags": True}) @@ -1260,13 +1262,13 @@ def compute_pipeline_entry_id( pa.array([self.content_hash().to_string()], type=pa.large_string()), ) ) - return self.data_context.arrow_hasher.hash_table(tag_with_hash).to_string() + return self.data_context.arrow_hasher.hash_table(tag_with_hash).to_prefixed_digest() def add_pipeline_record( self, tag: TagProtocol, input_data: DataProtocol, - data_record_id: str, + data_record_id: uuid.UUID, computed: bool, skip_cache_lookup: bool = False, ) -> None: @@ -1309,7 +1311,7 @@ def add_pipeline_record( meta_table = pa.table( { constants.DATA_RECORD_ID: pa.array( - [data_record_id], type=pa.large_string() + [data_record_id.bytes], type=pa.large_binary() ), constants.NODE_CONTENT_HASH_COL: pa.array( [self.content_hash().to_string()], type=pa.large_string() @@ -1435,7 +1437,7 @@ def as_source(self): def _fetch_joined_records( self, - entry_ids: list[str] | None = None, + entry_ids: list[bytes] | None = None, ) -> _JoinedRecords | None: """Internal primitive: fetch both DBs, content-hash-filter, and inner-join. @@ -1503,8 +1505,8 @@ def _fetch_joined_records( def _load_cached_entries( self, - entry_ids: list[str] | None = None, - ) -> "dict[str, tuple[TagProtocol, DataProtocol]]": + entry_ids: list[bytes] | None = None, + ) -> "dict[bytes, tuple[TagProtocol, DataProtocol]]": """DB loader: fetch ``(tag, data)`` pairs from the pipeline and result databases. Calls ``_fetch_joined_records`` to obtain the raw joined table, then @@ -1565,7 +1567,7 @@ def _load_cached_entries( data_table = joined.drop([c for c in drop_cols if c in joined.column_names]) stream = ArrowTableStream(data_table, tag_columns=tag_keys) - loaded: dict[str, tuple[TagProtocol, DataProtocol]] = {} + loaded: dict[bytes, tuple[TagProtocol, DataProtocol]] = {} for eid, (tag, data) in zip(entry_ids_col, stream.iter_data()): loaded[eid] = (tag, data) return loaded @@ -1758,7 +1760,7 @@ async def async_execute( if loaded: self._cached_output_table = None self._cached_content_hash_column = None - cached_by_entry_id: dict[str, tuple[TagProtocol, DataProtocol]] = dict(loaded) + cached_by_entry_id: dict[bytes, tuple[TagProtocol, DataProtocol]] = dict(loaded) # Phase 2: drive output from input channel — cached or compute async def _process_one_db( diff --git a/src/orcapod/core/nodes/operator_node.py b/src/orcapod/core/nodes/operator_node.py index fc3f63b2..566e09c0 100644 --- a/src/orcapod/core/nodes/operator_node.py +++ b/src/orcapod/core/nodes/operator_node.py @@ -795,7 +795,7 @@ def _store_output_stream(self, stream: StreamProtocol) -> None: output_table = output_table.add_column( 0, self.HASH_COLUMN_NAME, - pa.array(record_hashes, type=pa.large_string()), + pa.array([h.encode() for h in record_hashes], type=pa.large_binary()), ) # Store — record IDs are run-scoped (NODE_CONTENT_HASH_COL is included in diff --git a/src/orcapod/core/operators/join.py b/src/orcapod/core/operators/join.py index 0fbe3998..a6d628e4 100644 --- a/src/orcapod/core/operators/join.py +++ b/src/orcapod/core/operators/join.py @@ -101,13 +101,13 @@ def _predict_system_tag_schema(self, *streams: StreamProtocol) -> Schema: system_tag_fields: dict[str, type] = {} for idx, stream in enumerate(ordered_streams): stream_tag_schema, _ = stream.output_schema(columns={"system_tags": True}) - for col_name in stream_tag_schema: + for col_name, col_type in stream_tag_schema.items(): if col_name.startswith(constants.SYSTEM_TAG_PREFIX): new_name = ( f"{col_name}{constants.BLOCK_SEPARATOR}" f"{stream.pipeline_hash().to_hex(n_char)}:{idx}" ) - system_tag_fields[new_name] = str + system_tag_fields[new_name] = col_type return Schema(system_tag_fields) def static_process(self, *streams: StreamProtocol) -> StreamProtocol: @@ -699,7 +699,7 @@ def _sort_merged_system_tags(merged_sys: dict) -> dict: sid_val = merged_sys.get(fmap.get(sid_field, "")) rid_val = merged_sys.get(fmap.get(rid_field, "")) vals = {ft: merged_sys[k] for ft, k in fmap.items()} - entries.append(((sid_val or "", rid_val or ""), vals)) + entries.append(((sid_val or "", rid_val or b""), vals)) entries.sort(key=lambda e: e[0]) diff --git a/src/orcapod/core/operators/merge_join.py b/src/orcapod/core/operators/merge_join.py index 36240f32..43324f7d 100644 --- a/src/orcapod/core/operators/merge_join.py +++ b/src/orcapod/core/operators/merge_join.py @@ -144,13 +144,13 @@ def _predict_system_tag_schema( for stream, orig_idx in canonical: canon_pos = canonical.index((stream, orig_idx)) stream_tag_schema, _ = stream.output_schema(columns={"system_tags": True}) - for col_name in stream_tag_schema: + for col_name, col_type in stream_tag_schema.items(): if col_name.startswith(constants.SYSTEM_TAG_PREFIX): new_name = ( f"{col_name}{constants.BLOCK_SEPARATOR}" f"{stream.pipeline_hash().to_hex(n_char)}:{canon_pos}" ) - system_tag_fields[new_name] = str + system_tag_fields[new_name] = col_type return Schema(system_tag_fields) def binary_static_process( diff --git a/src/orcapod/core/result_cache.py b/src/orcapod/core/result_cache.py index c4cf7225..065730de 100644 --- a/src/orcapod/core/result_cache.py +++ b/src/orcapod/core/result_cache.py @@ -8,6 +8,7 @@ from __future__ import annotations import logging +import uuid from datetime import datetime, timezone from typing import TYPE_CHECKING @@ -124,7 +125,9 @@ def lookup( [(constants.POD_TIMESTAMP, "descending")] ).take([0]) - record_id = result_table.to_pylist()[0][RECORD_ID_COL] + record_id_bytes = result_table.to_pylist()[0][RECORD_ID_COL] + # Convert bytes back to uuid.UUID (stored as binary(16) in the DB) + record_uuid = uuid.UUID(bytes=bytes(record_id_bytes)) if record_id_bytes is not None else None # Drop lookup columns from the returned data drop_cols = [RECORD_ID_COL] + [ c for c in constraints if c in result_table.column_names @@ -133,7 +136,7 @@ def lookup( return Data( result_table, - record_id=record_id, + record_uuid=record_uuid, meta_info={self.RESULT_COMPUTED_FLAG: False}, ) @@ -201,7 +204,7 @@ def store( self._result_database.add_record( self._record_path, - output_data.datagram_id, + output_data.datagram_uuid.bytes, data_table, skip_duplicates=skip_duplicates, ) diff --git a/src/orcapod/core/sources/cached_source.py b/src/orcapod/core/sources/cached_source.py index b924e0cd..2a2e4f18 100644 --- a/src/orcapod/core/sources/cached_source.py +++ b/src/orcapod/core/sources/cached_source.py @@ -10,6 +10,7 @@ from orcapod.core.streams.arrow_table_stream import ArrowTableStream from orcapod.protocols.core_protocols import DataProtocol, SourceProtocol, TagProtocol from orcapod.protocols.database_protocols import ArrowDatabaseProtocol +from orcapod.system_constants import constants from orcapod.types import ColumnConfig, Schema from orcapod.utils.lazy_module import LazyModule @@ -219,7 +220,11 @@ def _ingest_live_data(self) -> None: columns={"source": True, "system_tags": True} ) - # Compute per-row record hashes for dedup: hash(full row) + # Compute per-row record hashes for dedup: hash the full row including + # all system tag columns. The record_id system tag encodes row origin + # (source_id + provenance token), so two rows that are byte-identical in + # their data columns but originate from different positions in the source + # will have different record_ids and must be stored as distinct entries. arrow_hasher = self.data_context.arrow_hasher record_hashes: list[str] = [] for batch in live_table.to_batches(): @@ -232,7 +237,7 @@ def _ingest_live_data(self) -> None: live_with_hash = live_table.add_column( 0, self.HASH_COLUMN_NAME, - pa.array(record_hashes, type=pa.large_string()), + pa.array([h.encode() for h in record_hashes], type=pa.large_binary()), ) self._cache_database.add_records( self.cache_path, diff --git a/src/orcapod/core/sources/stream_builder.py b/src/orcapod/core/sources/stream_builder.py index 1a4f704c..622e7cbc 100644 --- a/src/orcapod/core/sources/stream_builder.py +++ b/src/orcapod/core/sources/stream_builder.py @@ -8,6 +8,7 @@ from __future__ import annotations +import uuid from collections.abc import Collection from dataclasses import dataclass from typing import TYPE_CHECKING @@ -27,9 +28,20 @@ else: pa = LazyModule("pyarrow") +# Namespace UUID for Orcapod source record IDs. +# Derived as UUID v5 of NAMESPACE_URL + a stable Orcapod-specific URL so the +# value is principled rather than an opaque hex literal. This value is fixed for +# the lifetime of the project — changing it would invalidate all existing stored +# record IDs. +# Computed value: uuid.UUID('877ec89b-6645-5852-ba37-a94604043f5e') +_SOURCE_RECORD_ID_NAMESPACE = uuid.uuid5( + uuid.NAMESPACE_URL, + "https://orcapod.org/namespaces/source-record-id", +) -def _make_record_id(record_id_column: str | None, row_index: int, row: dict) -> str: - """Build the record-ID token for a single row. + +def _make_provenance_token(record_id_column: str | None, row_index: int, row: dict) -> str: + """Build the human-readable provenance token for a single row. When *record_id_column* is given the token is ``"{column}={value}"``, giving a stable, human-readable key that survives row reordering. @@ -40,6 +52,28 @@ def _make_record_id(record_id_column: str | None, row_index: int, row: dict) -> return f"row_{row_index}" +def _make_record_id(source_id: str, provenance_token: str) -> uuid.UUID: + """Build a stable record ID UUID from source_id and provenance token. + + Uses UUID v5 (name-based, SHA-1) to produce a deterministic UUID from + the source identity and row provenance token. The result is stable across + runs for the same source_id and provenance_token. + + Callers that need raw bytes for Arrow storage (``pa.binary(16)``) + should call ``.bytes`` on the returned UUID. + + Args: + source_id: The canonical source identifier. + provenance_token: Human-readable per-row provenance key + (e.g. ``"row_0"`` or ``"col=value"``). + + Returns: + A deterministic ``uuid.UUID`` identifying this (source, row) pair. + """ + name = f"{source_id}::{provenance_token}" + return uuid.uuid5(_SOURCE_RECORD_ID_NAMESPACE, name) + + @dataclass(frozen=True) class SourceStreamResult: """Artifacts produced by ``SourceStreamBuilder.build()``.""" @@ -141,13 +175,22 @@ def build( if source_id is None: source_id = table_hash.to_hex(char_count=self._config.hashing.path_n_char) - # 7. Build per-row source-info strings. + # 7. Build per-row source-info strings and deterministic UUID record IDs. rows_as_dicts = table.to_pylist() - source_info = [ - f"{source_id}{constants.BLOCK_SEPARATOR}" - f"{_make_record_id(record_id_column, i, row)}" + provenance_tokens = [ + _make_provenance_token(record_id_column, i, row) for i, row in enumerate(rows_as_dicts) ] + source_info = [ + f"{source_id}{constants.BLOCK_SEPARATOR}{token}" + for token in provenance_tokens + ] + # Record IDs stored in the system tag column are stable UUID v5 bytes — + # deterministically derived from source_id and per-row provenance token. + # Same source_id + same row position/content → same record_id bytes. + record_id_values = [ + _make_record_id(source_id, token).bytes for token in provenance_tokens + ] # 8. Add source-info provenance columns. table = arrow_utils.add_source_info( @@ -155,10 +198,6 @@ def build( ) # 9. Add system tag columns. - record_id_values = [ - _make_record_id(record_id_column, i, row) - for i, row in enumerate(rows_as_dicts) - ] table = arrow_utils.add_system_tag_columns( table, schema_hash, diff --git a/src/orcapod/databases/connector_arrow_database.py b/src/orcapod/databases/connector_arrow_database.py index c930fe98..a7ed5a46 100644 --- a/src/orcapod/databases/connector_arrow_database.py +++ b/src/orcapod/databases/connector_arrow_database.py @@ -12,7 +12,7 @@ connector = SQLiteConnector(":memory:") # PLT-1076 db = ConnectorArrowDatabase(connector) - db.add_record(("results", "my_fn"), record_id="abc", record=table) + db.add_record(("results", "my_fn"), record_id=b"sha256:abc", record=table) db.flush() """ from __future__ import annotations @@ -22,6 +22,7 @@ from collections.abc import Collection, Mapping from typing import TYPE_CHECKING, Any, cast +from orcapod.databases.utils import coerce_record_id from orcapod.protocols.db_connector_protocol import ColumnInfo, DBConnectorProtocol from orcapod.utils.lazy_module import LazyModule @@ -64,7 +65,7 @@ def __init__( max_hierarchy_depth: int = 10, _path_prefix: tuple[str, ...] = (), _shared_pending_batches: dict[str, pa.Table] | None = None, - _shared_pending_record_ids: dict[str, set[str]] | None = None, + _shared_pending_record_ids: dict[str, set[bytes]] | None = None, _shared_pending_skip_existing: dict[str, bool] | None = None, _root: ConnectorArrowDatabase | None = None, _scoped_path: tuple[str, ...] = (), @@ -75,7 +76,7 @@ def __init__( self._root = _root self._scoped_path = _scoped_path self._pending_batches: dict[str, pa.Table] = _shared_pending_batches if _shared_pending_batches is not None else {} - self._pending_record_ids: dict[str, set[str]] = _shared_pending_record_ids if _shared_pending_record_ids is not None else defaultdict(set) + self._pending_record_ids: dict[str, set[bytes]] = _shared_pending_record_ids if _shared_pending_record_ids is not None else defaultdict(set) # Per-batch flag: True when the batch was added with skip_duplicates=True, # so flush() can pass skip_existing=True to the connector and let it use # native INSERT-OR-IGNORE semantics rather than Python-side prefiltering. @@ -125,11 +126,12 @@ def _validate_record_path(self, record_path: tuple[str, ...]) -> None: # ── Record-ID column helpers ────────────────────────────────────────────── def _ensure_record_id_column( - self, arrow_data: pa.Table, record_id: str + self, arrow_data: pa.Table, record_id: str | bytes ) -> pa.Table: + record_id = coerce_record_id(record_id) if self.RECORD_ID_COLUMN not in arrow_data.column_names: key_array = pa.array( - [record_id] * len(arrow_data), type=pa.large_string() + [record_id] * len(arrow_data), type=pa.large_binary() ) arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) return arrow_data @@ -190,7 +192,7 @@ def _get_committed_table( def add_record( self, record_path: tuple[str, ...], - record_id: str, + record_id: str | bytes, record: pa.Table, skip_duplicates: bool = False, flush: bool = False, @@ -233,9 +235,18 @@ def add_records( [rename_map.get(c, c) for c in records.column_names] ) + # Enforce binary record-id type — string columns are rejected to prevent + # silent deduplication failures (set[str] & set[bytes] is always empty). + rid_type = records[self.RECORD_ID_COLUMN].type + if not pa.types.is_large_binary(rid_type) and not pa.types.is_binary(rid_type): + raise TypeError( + f"Record-id column must be pa.large_binary() or pa.binary(), " + f"got {rid_type}. Encode the column to bytes before calling add_records()." + ) + records = self._deduplicate_within_table(records) record_key = self._get_record_key(record_path) - input_ids = set(cast(list[str], records[self.RECORD_ID_COLUMN].to_pylist())) + input_ids = set(cast(list[bytes], records[self.RECORD_ID_COLUMN].to_pylist())) if skip_duplicates: # Only filter records that conflict with the in-flight pending batch. @@ -272,7 +283,7 @@ def add_records( [existing_pending, records] ) self._pending_record_ids[record_key].update( - cast(list[str], records[self.RECORD_ID_COLUMN].to_pylist()) + cast(list[bytes], records[self.RECORD_ID_COLUMN].to_pylist()) ) if flush: @@ -365,12 +376,13 @@ def flush(self) -> None: def get_record_by_id( self, record_path: tuple[str, ...], - record_id: str, + record_id: str | bytes, record_id_column: str | None = None, flush: bool = False, ) -> pa.Table | None: if flush: self.flush() + record_id = coerce_record_id(record_id) record_key = self._get_record_key(record_path) # Check pending first @@ -412,13 +424,13 @@ def get_all_records( def get_records_by_ids( self, record_path: tuple[str, ...], - record_ids: Collection[str], + record_ids: Collection[str | bytes], record_id_column: str | None = None, flush: bool = False, ) -> pa.Table | None: if flush: self.flush() - ids_list = list(record_ids) + ids_list = [coerce_record_id(r) for r in record_ids] if not ids_list: return None all_records = self.get_all_records( diff --git a/src/orcapod/databases/delta_lake_databases.py b/src/orcapod/databases/delta_lake_databases.py index bec09181..abb597ee 100644 --- a/src/orcapod/databases/delta_lake_databases.py +++ b/src/orcapod/databases/delta_lake_databases.py @@ -6,9 +6,10 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, cast +from orcapod.databases.utils import coerce_record_id +from orcapod.databases.storage_utils import is_cloud_uri, parse_base_path from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.databases.storage_utils import is_cloud_uri, parse_base_path if TYPE_CHECKING: import deltalake @@ -55,7 +56,7 @@ def __init__( _root: "DeltaTableDatabase | None" = None, _scoped_path: tuple[str, ...] = (), _shared_pending_batches: "dict[str, pa.Table] | None" = None, - _shared_pending_record_ids: "defaultdict[str, set[str]] | None" = None, + _shared_pending_record_ids: "defaultdict[str, set[bytes]] | None" = None, ): self._root_uri, self._storage_options = parse_base_path(base_path, storage_options) self._is_cloud: bool = is_cloud_uri(self._root_uri) @@ -88,7 +89,7 @@ def __init__( if _shared_pending_record_ids is not None: self._pending_record_ids = _shared_pending_record_ids else: - self._pending_record_ids: dict[str, set[str]] = defaultdict(set) + self._pending_record_ids: dict[str, set[bytes]] = defaultdict(set) self._existing_ids_cache: dict[str, set[str]] = defaultdict(set) self._cache_dirty: dict[str, bool] = defaultdict(lambda: True) @@ -211,12 +212,13 @@ def _get_delta_table(self, record_path: tuple[str, ...]) -> deltalake.DeltaTable raise def _ensure_record_id_column( - self, arrow_data: pa.Table, record_id: str + self, arrow_data: pa.Table, record_id: str | bytes ) -> pa.Table: """Ensure the table has an record id column.""" + record_id = coerce_record_id(record_id) if self.RECORD_ID_COLUMN not in arrow_data.column_names: # Add record_id column at the beginning - key_array = pa.array([record_id] * len(arrow_data), type=pa.large_string()) + key_array = pa.array([record_id] * len(arrow_data), type=pa.large_binary()) arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) return arrow_data @@ -254,7 +256,7 @@ def _handle_record_id_column( f"Record ID column '{self.RECORD_ID_COLUMN}' not found in the table and cannot be renamed." ) - def _create_record_id_filter(self, record_id: str) -> list: + def _create_record_id_filter(self, record_id: str | bytes) -> list: """ Create a proper filter expression for Delta Lake. @@ -264,9 +266,9 @@ def _create_record_id_filter(self, record_id: str) -> list: Returns: List containing the filter expression for Delta Lake """ - return [(self.RECORD_ID_COLUMN, "=", record_id)] + return [(self.RECORD_ID_COLUMN, "=", coerce_record_id(record_id))] - def _create_record_ids_filter(self, record_ids: list[str]) -> list: + def _create_record_ids_filter(self, record_ids: list[bytes]) -> list: """ Create a proper filter expression for multiple entry IDs. @@ -333,7 +335,7 @@ def _invalidate_cache(self, record_path: tuple[str, ...]) -> None: def add_record( self, record_path: tuple[str, ...], - record_id: str, + record_id: str | bytes, record: pa.Table, skip_duplicates: bool = False, flush: bool = False, @@ -698,7 +700,7 @@ def get_records_with_column_value( def get_record_by_id( self, record_path: tuple[str, ...], - record_id: str, + record_id: str | bytes, record_id_column: str | None = None, flush: bool = False, ) -> "pa.Table | None": @@ -716,6 +718,7 @@ def get_record_by_id( if flush: self.flush_batch(record_path) + record_id = coerce_record_id(record_id) # check if record_id is found in pending batches record_key = self._get_record_key(record_path) if record_id in self._pending_record_ids[record_key]: @@ -756,7 +759,7 @@ def get_record_by_id( def get_records_by_ids( self, record_path: tuple[str, ...], - record_ids: Collection[str] | pl.Series | pa.Array, + record_ids: Collection[str | bytes] | pl.Series | pa.Array, record_id_column: str | None = None, flush: bool = False, ) -> "pa.Table | None": @@ -776,17 +779,17 @@ def get_records_by_ids( if flush: self.flush_batch(record_path) - # Convert input to list of strings for consistency + # Convert input to a list of bytes for consistency if isinstance(record_ids, pl.Series): - record_ids_list = cast(list[str], record_ids.to_list()) + record_ids_list = [coerce_record_id(r) for r in record_ids.to_list()] elif isinstance(record_ids, (pa.Array, pa.ChunkedArray)): - record_ids_list = cast(list[str], record_ids.to_pylist()) + record_ids_list = [coerce_record_id(r) for r in record_ids.to_pylist()] elif isinstance(record_ids, Collection): - record_ids_list = list(record_ids) + record_ids_list = [coerce_record_id(r) for r in record_ids] else: raise TypeError( - f"record_ids must be list[str], pl.Series, or pa.Array, got {type(record_ids)}" + f"record_ids must be Collection[str | bytes], pl.Series, or pa.Array, got {type(record_ids)}" ) if len(record_ids) == 0: return None diff --git a/src/orcapod/databases/in_memory_databases.py b/src/orcapod/databases/in_memory_databases.py index d6b3fcc8..a51b2f37 100644 --- a/src/orcapod/databases/in_memory_databases.py +++ b/src/orcapod/databases/in_memory_databases.py @@ -5,6 +5,7 @@ from collections.abc import Collection, Mapping from typing import TYPE_CHECKING, Any, cast +from orcapod.databases.utils import coerce_record_id from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: @@ -38,7 +39,7 @@ def __init__( _path_prefix: tuple[str, ...] = (), _shared_tables: "dict[str, pa.Table] | None" = None, _shared_pending_batches: "dict[str, pa.Table] | None" = None, - _shared_pending_record_ids: "dict[str, set[str]] | None" = None, + _shared_pending_record_ids: "dict[str, set[bytes]] | None" = None, _root: InMemoryArrowDatabase | None = None, _scoped_path: tuple[str, ...] = (), ): @@ -48,7 +49,7 @@ def __init__( self.max_hierarchy_depth = max_hierarchy_depth self._tables: dict[str, pa.Table] = _shared_tables if _shared_tables is not None else {} self._pending_batches: dict[str, pa.Table] = _shared_pending_batches if _shared_pending_batches is not None else {} - self._pending_record_ids: dict[str, set[str]] = _shared_pending_record_ids if _shared_pending_record_ids is not None else defaultdict(set) + self._pending_record_ids: dict[str, set[bytes]] = _shared_pending_record_ids if _shared_pending_record_ids is not None else defaultdict(set) # ------------------------------------------------------------------ # Path helpers @@ -86,10 +87,11 @@ def _validate_record_path(self, record_path: tuple[str, ...]) -> None: # ------------------------------------------------------------------ def _ensure_record_id_column( - self, arrow_data: "pa.Table", record_id: str + self, arrow_data: "pa.Table", record_id: str | bytes ) -> "pa.Table": + record_id = coerce_record_id(record_id) if self.RECORD_ID_COLUMN not in arrow_data.column_names: - key_array = pa.array([record_id] * len(arrow_data), type=pa.large_string()) + key_array = pa.array([record_id] * len(arrow_data), type=pa.large_binary()) arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) return arrow_data @@ -136,14 +138,13 @@ def _deduplicate_within_table(self, table: "pa.Table") -> "pa.Table": # Internal helpers for duplicate detection # ------------------------------------------------------------------ - def _committed_ids(self, record_key: str) -> set[str]: + def _committed_ids(self, record_key: str) -> "set[bytes]": committed = self._tables.get(record_key) if committed is None or committed.num_rows == 0: return set() existing_ids = committed[self.RECORD_ID_COLUMN].to_pylist() - existing_ids = [str(id) for id in existing_ids if id is not None] # TODO: evaluate the efficiency of this implementation - return set(existing_ids) + return {id for id in existing_ids if id is not None} def _filter_existing_records( self, record_key: str, table: "pa.Table" @@ -167,7 +168,7 @@ def _filter_existing_records( def add_record( self, record_path: tuple[str, ...], - record_id: str, + record_id: str | bytes, record: "pa.Table", skip_duplicates: bool = False, flush: bool = False, @@ -210,6 +211,15 @@ def add_records( [rename_map.get(c, c) for c in records.column_names] ) + # Enforce binary record-id type — string columns are rejected to prevent + # silent deduplication failures (set[str] & set[bytes] is always empty). + rid_type = records[self.RECORD_ID_COLUMN].type + if not pa.types.is_large_binary(rid_type) and not pa.types.is_binary(rid_type): + raise TypeError( + f"Record-id column must be pa.large_binary() or pa.binary(), " + f"got {rid_type}. Encode the column to bytes before calling add_records()." + ) + # Deduplicate within the incoming batch (keep last) records = self._deduplicate_within_table(records) @@ -237,7 +247,7 @@ def add_records( self._pending_batches[record_key] = pa.concat_tables( [existing_pending, records] ) - pending_ids = cast(list[str], records[self.RECORD_ID_COLUMN].to_pylist()) + pending_ids = cast(list[bytes], records[self.RECORD_ID_COLUMN].to_pylist()) self._pending_record_ids[record_key].update(pending_ids) if flush: @@ -348,13 +358,14 @@ def _combined_table(self, record_key: str) -> "pa.Table | None": def get_record_by_id( self, record_path: tuple[str, ...], - record_id: str, + record_id: str | bytes, record_id_column: str | None = None, flush: bool = False, ) -> "pa.Table | None": if flush: self.flush() + record_id = coerce_record_id(record_id) record_key = self._get_record_key(record_path) # Check pending first @@ -387,14 +398,14 @@ def get_all_records( def get_records_by_ids( self, record_path: tuple[str, ...], - record_ids: "Collection[str]", + record_ids: "Collection[str | bytes]", record_id_column: str | None = None, flush: bool = False, ) -> "pa.Table | None": if flush: self.flush() - record_ids_list = list(record_ids) + record_ids_list = [coerce_record_id(r) for r in record_ids] if not record_ids_list: return None diff --git a/src/orcapod/databases/noop_database.py b/src/orcapod/databases/noop_database.py index a63ab77f..433be8a6 100644 --- a/src/orcapod/databases/noop_database.py +++ b/src/orcapod/databases/noop_database.py @@ -35,7 +35,7 @@ def __init__( def add_record( self, record_path: tuple[str, ...], - record_id: str, + record_id: str | bytes, record: "pa.Table", skip_duplicates: bool = False, flush: bool = False, @@ -55,7 +55,7 @@ def add_records( def get_record_by_id( self, record_path: tuple[str, ...], - record_id: str, + record_id: str | bytes, record_id_column: str | None = None, flush: bool = False, ) -> "pa.Table | None": @@ -71,7 +71,7 @@ def get_all_records( def get_records_by_ids( self, record_path: tuple[str, ...], - record_ids: Collection[str], + record_ids: Collection[str | bytes], record_id_column: str | None = None, flush: bool = False, ) -> "pa.Table | None": diff --git a/src/orcapod/databases/postgresql_connector.py b/src/orcapod/databases/postgresql_connector.py index 4f98810e..0ae53cf2 100644 --- a/src/orcapod/databases/postgresql_connector.py +++ b/src/orcapod/databases/postgresql_connector.py @@ -16,6 +16,7 @@ import logging import re import threading +import uuid from collections.abc import Iterator from typing import TYPE_CHECKING, Any @@ -74,13 +75,13 @@ def _pg_type_to_arrow(pg_type_name: str, udt_name: str) -> pa.DataType: return _pa.float32() if t in ("float8", "double precision", "numeric", "decimal"): return _pa.float64() + if t == "uuid": + return _pa.binary(16) + if t in ("text", "varchar", "character varying", "char", "bpchar", - "name", "uuid", "json", "jsonb", "time", "timetz"): + "name", "json", "jsonb", "time", "timetz"): if t in ("time", "timetz"): logger.warning("PostgreSQL type %r mapped to pa.large_string() (known gap)", t) - if t == "uuid": - # TODO: revisit mapping once PLT-1162 decides on a canonical UUID Arrow type - pass return _pa.large_string() if t == "bytea": return _pa.large_binary() @@ -150,6 +151,26 @@ def _arrow_type_to_pg_sql(arrow_type: pa.DataType) -> str: return "TEXT" +def _coerce_pg_value(value: Any, arrow_type: "pa.DataType") -> Any: + """Coerce a psycopg driver value to match the target Arrow type. + + Args: + value: Raw value from the psycopg cursor. + arrow_type: The Arrow type the value will be stored as. + + Returns: + A Python value compatible with ``pa.array(..., type=arrow_type)``. + """ + import pyarrow as _pa + + if value is None: + return None + # psycopg returns uuid columns as uuid.UUID objects; binary(16) needs raw bytes + if arrow_type == _pa.binary(16) and isinstance(value, uuid.UUID): + return value.bytes + return value + + def _resolve_column_type_lookup( query: str, connector: "PostgreSQLConnector", @@ -360,7 +381,7 @@ def iter_batches( while rows: arrays = [ - _pa.array([r[i] for r in rows], type=t) + _pa.array([_coerce_pg_value(r[i], t) for r in rows], type=t) for i, t in enumerate(arrow_types) ] yield _pa.RecordBatch.from_arrays(arrays, schema=schema) diff --git a/src/orcapod/databases/utils.py b/src/orcapod/databases/utils.py new file mode 100644 index 00000000..22da2422 --- /dev/null +++ b/src/orcapod/databases/utils.py @@ -0,0 +1,22 @@ +"""Utilities shared across database implementations.""" + +from __future__ import annotations + + +def coerce_record_id(record_id: str | bytes) -> bytes: + """Encode ``record_id`` to bytes if it is a ``str``. + + All database implementations store record IDs as ``bytes`` + (``pa.large_binary()``). This helper lets callers pass plain strings + without the caller needing to know the encoding — UTF-8 is used, which + is a lossless round-trip for any ASCII or text-based ID. + + Args: + record_id: A record identifier as either ``bytes`` or a ``str``. + + Returns: + The record identifier as ``bytes``. + """ + if isinstance(record_id, str): + return record_id.encode() + return record_id diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 51c257b9..1b66d039 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -6,7 +6,7 @@ - PathContentHandler -- pathlib.Path: returns ContentHash of file content - UPathContentHandler -- upath.UPath: returns ContentHash of file content (remote-aware) - - UUIDHandler -- uuid.UUID: canonical string representation + - UUIDHandler -- uuid.UUID: raw 16-byte binary representation - BytesHandler -- bytes / bytearray: hex string representation - FunctionHandler -- callable with __code__: via FunctionInfoExtractorProtocol - TypeObjectHandler -- type objects (classes): stable "type:" string @@ -139,16 +139,17 @@ def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: class UUIDHandler: - """ - Handler for uuid.UUID objects. + """Handler for ``uuid.UUID`` objects. - Converts the UUID to its canonical hyphenated string representation - (e.g. ``"550e8400-e29b-41d4-a716-446655440000"``), which is stable, - human-readable, and unambiguous. + Returns the raw 16-byte binary representation of the UUID. + The binary form is compact, unambiguous, and independent of string + formatting conventions. UUID values in data columns are stored as + ``pa.binary(16)`` (fixed-size) within the struct type used by + ``UUIDStructConverter``; database record IDs use ``pa.large_binary()``. """ def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: - return str(obj) + return obj.bytes class BytesHandler: diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index f736293b..1e7b7255 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -126,7 +126,10 @@ def get_versioned_semantic_arrow_hasher( from orcapod.hashing.arrow_hashers import StarfixArrowHasher from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry - from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter + from orcapod.semantic_types.semantic_struct_converters import ( + PythonPathStructConverter, + UUIDStructConverter, + ) # Build a default semantic registry populated with the standard converters. # We use Any-typed locals here to side-step type-checker false positives @@ -136,6 +139,8 @@ def get_versioned_semantic_arrow_hasher( file_hasher = BasicFileHasher(algorithm="sha256") path_converter: Any = PythonPathStructConverter(file_hasher=file_hasher) registry.register_converter("path", path_converter) + uuid_converter: Any = UUIDStructConverter() + registry.register_converter("uuid", uuid_converter) logger.debug( "get_versioned_semantic_arrow_hasher: creating StarfixArrowHasher " diff --git a/src/orcapod/hashing/visitors.py b/src/orcapod/hashing/visitors.py index 2ceb8dd3..f3a6fe50 100644 --- a/src/orcapod/hashing/visitors.py +++ b/src/orcapod/hashing/visitors.py @@ -181,7 +181,7 @@ def visit_struct( if converter: # This is a semantic type - hash it try: - hash_string = converter.hash_struct_dict(data, add_prefix=True) + hash_string = converter.hash_struct_dict(data) return pa.large_string(), hash_string except Exception as e: field_path = ( diff --git a/src/orcapod/pipeline/logging_observer.py b/src/orcapod/pipeline/logging_observer.py index 07d1f1d5..1b8931c1 100644 --- a/src/orcapod/pipeline/logging_observer.py +++ b/src/orcapod/pipeline/logging_observer.py @@ -22,7 +22,7 @@ Fixed columns are prefixed with ``_log_`` to follow system column conventions and avoid collision with user-defined tag column names. - - ``_log_id`` (large_utf8): UUID unique to this log entry. + - ``_log_id`` (binary(16)): UUID unique to this log entry. - ``_log_run_id`` (large_utf8): UUID of the pipeline run (from ``on_run_start``). - ``_log_stdout_log`` (large_utf8): Captured standard output. - ``_log_stderr_log`` (large_utf8): Captured standard error. @@ -100,12 +100,12 @@ def record(self, **kwargs: Any) -> None: """ import pyarrow as pa - log_id = str(uuid7()) + log_id = uuid7().bytes timestamp = datetime.now(timezone.utc).isoformat() # Context columns — prefixed with "_log_" to follow system column conventions columns: dict[str, pa.Array] = { - "_log_id": pa.array([log_id], type=pa.large_utf8()), + "_log_id": pa.array([log_id], type=pa.binary(16)), "_log_run_id": pa.array([self._run_id], type=pa.large_utf8()), "_log_timestamp": pa.array([timestamp], type=pa.large_utf8()), } diff --git a/src/orcapod/pipeline/status_observer.py b/src/orcapod/pipeline/status_observer.py index 768bbde4..50cc3f44 100644 --- a/src/orcapod/pipeline/status_observer.py +++ b/src/orcapod/pipeline/status_observer.py @@ -21,7 +21,7 @@ Fixed columns are prefixed with ``_status_`` to follow system column conventions and avoid collision with user-defined tag column names. - - ``_status_id`` (large_utf8): UUID7 unique to this status event. + - ``_status_id`` (binary(16)): UUID7 unique to this status event. - ``_status_run_id`` (large_utf8): UUID of the pipeline run (from ``on_run_start``). - ``_status_pipeline_uri`` (large_utf8): Opaque URI identifying the pipeline version that produced this event (e.g. ``"my_pipeline@a1b2c3d4e5f6a1b2"``). Set to ``""`` @@ -270,11 +270,11 @@ def _write_event( tag_schema = self._tag_schema_per_node.get(node_label, {}) - status_id = str(uuid7()) + status_id = uuid7().bytes timestamp = datetime.now(timezone.utc).isoformat() columns: dict[str, pa.Array] = { - "_status_id": pa.array([status_id], type=pa.large_utf8()), + "_status_id": pa.array([status_id], type=pa.binary(16)), "_status_run_id": pa.array([self._current_run_id], type=pa.large_utf8()), "_status_pipeline_uri": pa.array([self._current_pipeline_uri], type=pa.large_utf8()), "_status_state": pa.array([state], type=pa.large_utf8()), @@ -397,11 +397,11 @@ def _write_event( """Build and write a single status event row to the scoped database.""" import pyarrow as pa - status_id = str(uuid7()) + status_id = uuid7().bytes timestamp = datetime.now(timezone.utc).isoformat() columns: dict[str, pa.Array] = { - "_status_id": pa.array([status_id], type=pa.large_utf8()), + "_status_id": pa.array([status_id], type=pa.binary(16)), "_status_run_id": pa.array([self._current_run_id], type=pa.large_utf8()), "_status_pipeline_uri": pa.array([self._current_pipeline_uri], type=pa.large_utf8()), "_status_state": pa.array([state], type=pa.large_utf8()), diff --git a/src/orcapod/protocols/core_protocols/datagrams.py b/src/orcapod/protocols/core_protocols/datagrams.py index d3908d90..831574b7 100644 --- a/src/orcapod/protocols/core_protocols/datagrams.py +++ b/src/orcapod/protocols/core_protocols/datagrams.py @@ -1,5 +1,6 @@ from __future__ import annotations +import uuid from collections.abc import Iterator, Mapping from typing import ( TYPE_CHECKING, @@ -47,12 +48,12 @@ class DatagramProtocol(ContentIdentifiableProtocol, DataContextAwareProtocol, Pr """ @property - def datagram_id(self) -> str: + def datagram_uuid(self) -> uuid.UUID: """ - Return the UUID of this datagram. + Return the UUID of this datagram (UUID v7). Returns: - UUID: The unique identifier for this instance of datagram. + uuid.UUID: The UUID for this datagram instance. """ ... diff --git a/src/orcapod/protocols/database_protocols.py b/src/orcapod/protocols/database_protocols.py index c4b0cb42..3e87ee1b 100644 --- a/src/orcapod/protocols/database_protocols.py +++ b/src/orcapod/protocols/database_protocols.py @@ -14,7 +14,7 @@ class ArrowDatabaseProtocol(Protocol): def add_record( self, record_path: tuple[str, ...], - record_id: str, + record_id: bytes, record: pa.Table, skip_duplicates: bool = False, flush: bool = False, @@ -32,7 +32,7 @@ def add_records( def get_record_by_id( self, record_path: tuple[str, ...], - record_id: str, + record_id: bytes, record_id_column: str | None = None, flush: bool = False, ) -> pa.Table | None: ... @@ -48,7 +48,7 @@ def get_all_records( def get_records_by_ids( self, record_path: tuple[str, ...], - record_ids: Collection[str], + record_ids: Collection[bytes], record_id_column: str | None = None, flush: bool = False, ) -> pa.Table | None: ... diff --git a/src/orcapod/protocols/semantic_types_protocols.py b/src/orcapod/protocols/semantic_types_protocols.py index 5b968813..002e2686 100644 --- a/src/orcapod/protocols/semantic_types_protocols.py +++ b/src/orcapod/protocols/semantic_types_protocols.py @@ -82,18 +82,16 @@ def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: """Check if this converter can handle the given struct type.""" ... - def hash_struct_dict( - self, struct_dict: dict[str, Any], add_prefix: bool = False - ) -> str: + def hash_struct_dict(self, struct_dict: dict[str, Any]) -> str: """ Compute hash of the semantic type from its struct dictionary representation. Args: struct_dict: Arrow struct dictionary representation - add_prefix: If True, prefix with semantic type and algorithm info Returns: - Hash string, optionally prefixed like "path:sha256:abc123..." + Hash string of the form ``"{type}:sha256:"``, + e.g. ``"path:sha256:abc123"`` Raises: Exception: If hashing fails (e.g., file not found for path types) diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index 4f60bd80..54be49a2 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -7,6 +7,7 @@ from __future__ import annotations +import uuid as _uuid_module from abc import ABC, abstractmethod from pathlib import Path from typing import TYPE_CHECKING, Any @@ -46,37 +47,33 @@ def hasher_id(self) -> str: """Default hasher ID based on semantic type name""" return self._hasher_id - def _format_hash_string(self, hash_bytes: bytes, add_prefix: bool = False) -> str: - """ - Format hash bytes into the standard hash string format. + def _compute_content_hash(self, content: bytes) -> ContentHash: + """Compute SHA-256 hash of content bytes. Args: - hash_bytes: Raw hash bytes - add_prefix: Whether to add semantic type and algorithm prefix + content: Content to hash. Returns: - Formatted hash string + ``ContentHash`` with ``method="sha256"`` and the raw digest. """ - hash_hex = hash_bytes.hex() - if add_prefix: - return f"{self.semantic_type_name}:sha256:{hash_hex}" - else: - return hash_hex + import hashlib - def _compute_content_hash(self, content: bytes) -> ContentHash: - """ - Compute SHA-256 hash of content bytes. + digest = hashlib.sha256(content).digest() + return ContentHash(method="sha256", digest=digest) + + def _format_semantic_hash(self, content_hash: ContentHash) -> str: + """Format a ``ContentHash`` into the standard semantic hash string. + + Always returns ``"{semantic_type_name}:{method}:{hex}"``, + e.g. ``"uuid:sha256:abc123"``. Args: - content: Content to hash + content_hash: Hash to format. Returns: - SHA-256 hash bytes + Formatted hash string with semantic type and algorithm prefix. """ - import hashlib - - digest = hashlib.sha256(content).digest() - return ContentHash(method=f"{self.semantic_type_name}:sha256", digest=digest) + return f"{self.semantic_type_name}:{content_hash.to_string(prefix_method=True)}" class PathStructConverterBase(SemanticStructConverterBase, ABC): @@ -148,17 +145,17 @@ def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: and isinstance(struct_dict[self._field_name], str) ) - def hash_struct_dict( - self, struct_dict: dict[str, Any], add_prefix: bool = False - ) -> str: + def hash_struct_dict(self, struct_dict: dict[str, Any]) -> str: """Compute hash of a path semantic type by hashing the file content. + Returns a string of the form ``"{type}:{algorithm}:{hex}"``, + e.g. ``"path:sha256:abc123"``. + Args: struct_dict: Dict with the path field containing a file path string. - add_prefix: If True, prefix with semantic type and algorithm info. Returns: - Hash string of the file content. + Hash string of the file content with semantic type and algorithm prefix. Raises: FileNotFoundError: If the path does not exist. @@ -174,8 +171,8 @@ def hash_struct_dict( if path.is_dir(): raise IsADirectoryError(f"Path is a directory: {path}") - content_hash = self._file_hasher.hash_file(path) - return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) + file_hash = self._file_hasher.hash_file(path) + return self._format_semantic_hash(file_hash) class PythonPathStructConverter(PathStructConverterBase): @@ -218,3 +215,119 @@ def __init__(self, file_hasher: "FileContentHasherProtocol"): def _make_path(self, path_str: str) -> UPath: return UPath(path_str) + + +class UUIDStructConverter(SemanticStructConverterBase): + """Converter for ``uuid.UUID`` objects to/from Arrow semantic structs. + + Stores UUIDs as fixed 16-byte binary values inside a single-field struct, + following the same pattern as ``PythonPathStructConverter`` and + ``UPathStructConverter``. + + Note: + ``uuid_utils.UUID`` objects (e.g. from ``uuid7()``) are accepted via + duck typing because they expose a ``.bytes`` attribute but do not + inherit from ``uuid.UUID``. + """ + + def __init__(self) -> None: + super().__init__("uuid") + self._python_type = _uuid_module.UUID + self._arrow_struct_type = pa.struct([pa.field("uuid", pa.binary(16))]) + + @property + def python_type(self) -> type: + """The Python type this converter handles (``uuid.UUID``).""" + return self._python_type + + @property + def arrow_struct_type(self) -> "pa.StructType": + """The Arrow struct type used for serialisation.""" + return self._arrow_struct_type + + def python_to_struct_dict(self, value: Any) -> dict[str, bytes]: + """Convert a UUID to a struct dictionary with a single ``uuid`` field. + + Accepts both ``uuid.UUID`` instances and duck-typed UUID-compatible + objects (e.g. ``uuid_utils.UUID``) that expose a ``.bytes`` attribute + returning 16 raw bytes. + + Args: + value: A ``uuid.UUID`` instance or compatible UUID-like object. + + Returns: + A dict with a single key ``"uuid"`` whose value is 16 raw bytes. + + Raises: + TypeError: If ``value`` is not a ``uuid.UUID`` instance or + compatible duck-typed UUID object. + """ + if isinstance(value, _uuid_module.UUID): + return {"uuid": value.bytes} + # Accept uuid_utils.UUID and other duck-typed UUID objects + raw = getattr(value, "bytes", None) + if isinstance(raw, bytes) and len(raw) == 16: + return {"uuid": raw} + raise TypeError( + f"Expected uuid.UUID or compatible UUID object, got {type(value)}" + ) + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> _uuid_module.UUID: + """Convert a struct dictionary back to a ``uuid.UUID`` instance. + + Args: + struct_dict: Dict with a ``"uuid"`` key containing 16 raw bytes + (``bytes`` or ``bytearray``). + + Returns: + A ``uuid.UUID`` constructed from the raw bytes. + + Raises: + ValueError: If the ``"uuid"`` key is absent from ``struct_dict``. + """ + raw = struct_dict.get("uuid") + if raw is None: + raise ValueError("Missing 'uuid' field in struct dict") + return _uuid_module.UUID(bytes=bytes(raw)) + + def can_handle_python_type(self, python_type: type) -> bool: + """Check if this converter can handle the given Python type. + + Args: + python_type: The Python type to check. + + Returns: + ``True`` if ``python_type`` is ``uuid.UUID`` or a subclass of it. + """ + return issubclass(python_type, self._python_type) + + def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: + """Check if this converter can handle the given Arrow struct type. + + Args: + struct_type: The Arrow struct type to check. + + Returns: + ``True`` if ``struct_type`` equals the UUID Arrow struct type. + """ + return struct_type == self._arrow_struct_type + + def hash_struct_dict(self, struct_dict: dict[str, Any]) -> str: + """Compute a SHA-256 hash of the UUID from its struct dictionary representation. + + Hashes the raw 16 UUID bytes directly. + + Args: + struct_dict: Dict with a ``"uuid"`` key containing 16 raw bytes. + + Returns: + Hash string of the form ``"uuid:sha256:"``. + + Raises: + ValueError: If the ``"uuid"`` key is absent from ``struct_dict``. + """ + raw = struct_dict.get("uuid") + if raw is None: + raise ValueError("Missing 'uuid' field in struct dict") + content_hash = self._compute_content_hash(bytes(raw)) + return self._format_semantic_hash(content_hash) diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index 18a8975c..765745df 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -501,7 +501,11 @@ def _convert_arrow_to_python(self, arrow_type: pa.DataType) -> type | Any: return str elif pa.types.is_boolean(arrow_type): return bool - elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type): + elif ( + pa.types.is_binary(arrow_type) + or pa.types.is_large_binary(arrow_type) + or pa.types.is_fixed_size_binary(arrow_type) + ): return bytes # Handle struct types @@ -828,6 +832,7 @@ def _create_arrow_to_python_converter( or pa.types.is_large_string(arrow_type) or pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type) + or pa.types.is_fixed_size_binary(arrow_type) ): return lambda value: value diff --git a/src/orcapod/types.py b/src/orcapod/types.py index bfe05695..9a2588fe 100644 --- a/src/orcapod/types.py +++ b/src/orcapod/types.py @@ -74,7 +74,6 @@ _T = TypeVar("_T") - class Schema(Mapping[str, DataType]): """Immutable schema representing a mapping of field names to Python types. @@ -585,6 +584,25 @@ def to_string( return f"{self.method}:{self.to_hex(hexdigits)}" return self.to_hex(hexdigits) + def to_prefixed_digest(self) -> bytes: + """Return the hash as method-prefixed raw bytes: ``b"{method}:{digest}"``. + + Encodes the method name as ASCII, appends a literal ``:`` byte, then + appends the raw digest bytes. The result is suitable for storage in a + ``pa.large_binary()`` column and preserves the hash method for + introspection without the hex-encoding overhead of ``to_string()``. + + Example:: + + h = ContentHash("arrow_v2.1", b"\\x00" * 32) + h.to_prefixed_digest() + # b"arrow_v2.1:\\x00\\x00...\\x00" + + Returns: + Raw bytes in the form ``b"{method}:{raw_digest}"``. + """ + return self.method.encode("ascii") + b":" + self.digest + def __str__(self) -> str: return self.to_string() @@ -696,3 +714,5 @@ def __post_init__(self) -> None: f"PollingConfig.error_backoff_base must be > 0, " f"got {self.error_backoff_base}" ) + + diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 7d438d2f..df7ba4c4 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -1014,8 +1014,9 @@ def system_tag_column_names(schema_hash: str) -> tuple[str, str]: Returns: Tuple of ``(source_id_column_name, record_id_column_name)``. - Both start with ``constants.SYSTEM_TAG_PREFIX`` and have ``large_string`` - type in the Arrow table. + Both start with ``constants.SYSTEM_TAG_PREFIX``. The source_id column + has ``large_string`` type; the record_id column has ``binary(16)`` + (fixed-size 16-byte binary) type in the Arrow table. """ source_id_col = ( f"{constants.SYSTEM_TAG_SOURCE_ID_PREFIX}{constants.BLOCK_SEPARATOR}{schema_hash}" @@ -1030,7 +1031,7 @@ def add_system_tag_columns( table: "pa.Table", schema_hash: str, source_ids: str | Collection[str], - record_ids: Collection[str], + record_ids: Collection[bytes], ) -> "pa.Table": """Add paired source_id and record_id system tag columns to an Arrow table.""" if not table.column_names: @@ -1053,7 +1054,7 @@ def add_system_tag_columns( source_id_col_name, record_id_col_name = system_tag_column_names(schema_hash) source_id_array = pa.array(source_ids, type=pa.large_string()) - record_id_array = pa.array(record_ids, type=pa.large_string()) + record_id_array = pa.array(record_ids, type=pa.binary(16)) # System tag columns are always computed, never null — declare nullable=False # explicitly so the schema intent is not lost in Polars round-trips. @@ -1061,7 +1062,7 @@ def add_system_tag_columns( pa.field(source_id_col_name, pa.large_string(), nullable=False), source_id_array ) table = table.append_column( - pa.field(record_id_col_name, pa.large_string(), nullable=False), record_id_array + pa.field(record_id_col_name, pa.binary(16), nullable=False), record_id_array ) return table diff --git a/superpowers/plans/2026-06-11-plt-1162-uuid-arrow-type.md b/superpowers/plans/2026-06-11-plt-1162-uuid-arrow-type.md new file mode 100644 index 00000000..8884082e --- /dev/null +++ b/superpowers/plans/2026-06-11-plt-1162-uuid-arrow-type.md @@ -0,0 +1,956 @@ +# UUID Arrow Type Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use sensei:subagent-driven-development (recommended) or sensei:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace all `pa.large_string()` UUID storage in OrcaPod with `pa.binary(16)` (`fixed_size_binary[16]`), introducing two named constants and a semantic struct converter. + +**Architecture:** A single `UUID_ARROW_TYPE = pa.binary(16)` constant in `types.py` is used for all system UUID columns; a `UUID_STRUCT_ARROW_TYPE = pa.struct([pa.field("uuid", pa.binary(16))])` constant enables `uuid.UUID` Python objects to round-trip through the semantic type system. All UUID generation sites switch from `str(uuid7())` to `uuid7().bytes`. + +**Tech Stack:** PyArrow 23, `uuid_utils` (uuid7), `uuid` stdlib, pytest + +--- + +## File Map + +| File | Action | What changes | +|---|---|---| +| `src/orcapod/types.py` | Modify | Add `UUID_ARROW_TYPE`, `UUID_STRUCT_ARROW_TYPE` constants | +| `src/orcapod/__init__.py` | Modify | Export both new constants | +| `src/orcapod/semantic_types/semantic_struct_converters.py` | Modify | Add `UUIDStructConverter` class | +| `src/orcapod/hashing/versioned_hashers.py` | Modify | Register `UUIDStructConverter` in default registry | +| `src/orcapod/hashing/semantic_hashing/builtin_handlers.py` | Modify | `UUIDHandler.handle` returns `obj.bytes` not `str(obj)` | +| `src/orcapod/utils/arrow_data_utils.py` | Modify | System UUID column field types → `UUID_ARROW_TYPE` | +| `src/orcapod/core/datagrams/datagram.py` | Modify | `str(uuid7())` → `uuid7().bytes`; `_datagram_id` type `bytes` | +| `src/orcapod/core/data_function.py` | Modify | `str(uuid7())` → `uuid7().bytes`; `record_id` type `bytes` | +| `src/orcapod/pipeline/logging_observer.py` | Modify | `str(uuid7())` → `uuid7().bytes` | +| `src/orcapod/pipeline/status_observer.py` | Modify | `str(uuid7())` → `uuid7().bytes` | +| `src/orcapod/databases/postgresql_connector.py` | Modify | `uuid` pg type → `UUID_ARROW_TYPE`; decode driver values to bytes | +| `tests/test_types.py` | Modify | Add constant assertions | +| `tests/test_semantic_types/test_uuid_struct_converter.py` | Create | Full converter test suite | +| `tests/test_semantic_types/test_semantic_registry.py` | Modify | Assert UUID type registered | +| `tests/` (various) | Modify | Fix tests asserting `pa.large_string()` for UUID columns | + +--- + +### Task 1: UUID Arrow type constants + +**Files:** +- Modify: `src/orcapod/types.py` +- Modify: `src/orcapod/__init__.py` +- Modify: `tests/test_types.py` + +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_types.py`: + +```python +import pyarrow as pa + +from orcapod.types import UUID_ARROW_TYPE, UUID_STRUCT_ARROW_TYPE + + +def test_uuid_arrow_type_is_binary16(): + assert UUID_ARROW_TYPE == pa.binary(16) + + +def test_uuid_struct_arrow_type_structure(): + assert UUID_STRUCT_ARROW_TYPE == pa.struct([pa.field("uuid", pa.binary(16))]) + + +def test_uuid_struct_inner_type_matches_constant(): + assert UUID_STRUCT_ARROW_TYPE.field("uuid").type == UUID_ARROW_TYPE +``` + +- [ ] **Step 2: Run test to verify it fails** + +```bash +uv run pytest tests/test_types.py::test_uuid_arrow_type_is_binary16 tests/test_types.py::test_uuid_struct_arrow_type_structure -v +``` + +Expected: `ImportError: cannot import name 'UUID_ARROW_TYPE'` + +- [ ] **Step 3: Add constants to `types.py`** + +Find the import block at the top of `src/orcapod/types.py`. After the `if TYPE_CHECKING:` block (around line 22), add: + +```python +if TYPE_CHECKING: + import pyarrow as pa + # ... existing ... +else: + from orcapod.utils.lazy_module import LazyModule + pa = LazyModule("pyarrow") +``` + +Then at module level, after the imports, before the first class definition, add: + +```python +# --------------------------------------------------------------------------- +# UUID Arrow type constants +# --------------------------------------------------------------------------- + +def _make_uuid_types() -> "tuple[pa.DataType, pa.StructType]": + """Build UUID Arrow types (deferred so pyarrow is not imported at module level).""" + import pyarrow as _pa + arrow_type = _pa.binary(16) + struct_type = _pa.struct([_pa.field("uuid", arrow_type)]) + return arrow_type, struct_type + + +# Canonical Arrow type for all UUID values in OrcaPod. +# Stored as fixed_size_binary[16] — 16 raw bytes, no hex encoding, no dashes. +UUID_ARROW_TYPE: "pa.DataType" +# Semantic struct type for Python uuid.UUID round-trips through the type system. +UUID_STRUCT_ARROW_TYPE: "pa.StructType" +UUID_ARROW_TYPE, UUID_STRUCT_ARROW_TYPE = _make_uuid_types() +del _make_uuid_types +``` + +> **Note on lazy import:** The rest of `types.py` uses `LazyModule("pyarrow")` to defer the heavy import. For module-level constants that need the actual `pa` object at definition time, the pattern is to call a helper function that imports `pyarrow` directly. The assignment happens at module import time but is deferred inside the helper. + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +uv run pytest tests/test_types.py::test_uuid_arrow_type_is_binary16 tests/test_types.py::test_uuid_struct_arrow_type_structure tests/test_types.py::test_uuid_struct_inner_type_matches_constant -v +``` + +Expected: all PASS + +- [ ] **Step 5: Export from `src/orcapod/__init__.py`** + +Open `src/orcapod/__init__.py` and find the existing imports from `types`. Add `UUID_ARROW_TYPE` and `UUID_STRUCT_ARROW_TYPE` to the same import line: + +```python +from orcapod.types import ( + # ... existing exports ... + UUID_ARROW_TYPE, + UUID_STRUCT_ARROW_TYPE, +) +``` + +- [ ] **Step 6: Verify import from top-level package** + +```bash +uv run python -c "from orcapod import UUID_ARROW_TYPE, UUID_STRUCT_ARROW_TYPE; import pyarrow as pa; assert UUID_ARROW_TYPE == pa.binary(16); print('OK')" +``` + +Expected: `OK` + +- [ ] **Step 7: Commit** + +```bash +git add src/orcapod/types.py src/orcapod/__init__.py tests/test_types.py +git commit -m "feat(types): add UUID_ARROW_TYPE and UUID_STRUCT_ARROW_TYPE constants (PLT-1162)" +``` + +--- + +### Task 2: UUIDStructConverter + +**Files:** +- Modify: `src/orcapod/semantic_types/semantic_struct_converters.py` +- Create: `tests/test_semantic_types/test_uuid_struct_converter.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/test_semantic_types/test_uuid_struct_converter.py`: + +```python +"""Tests for UUIDStructConverter.""" +import uuid + +import pyarrow as pa +import pytest + +from orcapod.semantic_types.semantic_struct_converters import UUIDStructConverter +from orcapod.types import UUID_ARROW_TYPE, UUID_STRUCT_ARROW_TYPE + + +@pytest.fixture +def converter(): + return UUIDStructConverter() + + +@pytest.fixture +def sample_uuid(): + return uuid.UUID("550e8400-e29b-41d4-a716-446655440000") + + +def test_python_type(converter): + assert converter.python_type is uuid.UUID + + +def test_arrow_struct_type(converter): + assert converter.arrow_struct_type == UUID_STRUCT_ARROW_TYPE + + +def test_semantic_type_name(converter): + assert converter.semantic_type_name == "uuid" + + +def test_python_to_struct_dict(converter, sample_uuid): + result = converter.python_to_struct_dict(sample_uuid) + assert result == {"uuid": sample_uuid.bytes} + assert isinstance(result["uuid"], bytes) + assert len(result["uuid"]) == 16 + + +def test_python_to_struct_dict_rejects_non_uuid(converter): + with pytest.raises(TypeError): + converter.python_to_struct_dict("550e8400-e29b-41d4-a716-446655440000") # type: ignore + + +def test_struct_dict_to_python(converter, sample_uuid): + struct_dict = {"uuid": sample_uuid.bytes} + result = converter.struct_dict_to_python(struct_dict) + assert result == sample_uuid + assert isinstance(result, uuid.UUID) + + +def test_struct_dict_to_python_from_bytearray(converter, sample_uuid): + """Arrow may return binary fields as bytearray — must handle both.""" + struct_dict = {"uuid": bytearray(sample_uuid.bytes)} + result = converter.struct_dict_to_python(struct_dict) + assert result == sample_uuid + + +def test_struct_dict_to_python_missing_field(converter): + with pytest.raises(ValueError, match="Missing 'uuid' field"): + converter.struct_dict_to_python({}) + + +def test_round_trip(converter, sample_uuid): + struct_dict = converter.python_to_struct_dict(sample_uuid) + recovered = converter.struct_dict_to_python(struct_dict) + assert recovered == sample_uuid + + +def test_round_trip_all_versions(): + """Verify round-trip works for uuid4, uuid5, and uuid7 (uuid_utils).""" + from uuid_utils import uuid7 + + converter = UUIDStructConverter() + for u in [uuid.uuid4(), uuid.uuid5(uuid.NAMESPACE_OID, "test"), uuid7()]: + assert converter.struct_dict_to_python(converter.python_to_struct_dict(u)) == u + + +def test_arrow_array_round_trip(converter, sample_uuid): + """Verify UUID survives a PyArrow array round-trip.""" + struct_dict = converter.python_to_struct_dict(sample_uuid) + arr = pa.array([struct_dict], type=UUID_STRUCT_ARROW_TYPE) + recovered_dict = arr[0].as_py() + recovered_uuid = converter.struct_dict_to_python(recovered_dict) + assert recovered_uuid == sample_uuid + + +def test_distinct_uuids_produce_distinct_struct_dicts(converter): + u1, u2 = uuid.uuid4(), uuid.uuid4() + assert converter.python_to_struct_dict(u1) != converter.python_to_struct_dict(u2) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +uv run pytest tests/test_semantic_types/test_uuid_struct_converter.py -v +``` + +Expected: `ImportError: cannot import name 'UUIDStructConverter'` + +- [ ] **Step 3: Add `UUIDStructConverter` to `semantic_struct_converters.py`** + +Add the following imports at the top of `src/orcapod/semantic_types/semantic_struct_converters.py` (after existing imports): + +```python +import uuid as _uuid_module +from typing import Any + +from orcapod.types import UUID_STRUCT_ARROW_TYPE +``` + +Then add the class after `UPathStructConverter`: + +```python +class UUIDStructConverter(SemanticStructConverterBase): + """Converts Python ``uuid.UUID`` objects to/from the OrcaPod UUID struct. + + Stores UUIDs as their raw 16-byte binary representation inside a + single-field struct ``struct``. This follows + the same single-field struct pattern used by ``PythonPathStructConverter`` + and ``UPathStructConverter``. + """ + + def __init__(self) -> None: + super().__init__("uuid") + self._arrow_struct_type = UUID_STRUCT_ARROW_TYPE + + @property + def python_type(self) -> type: + """Python type handled by this converter.""" + return _uuid_module.UUID + + @property + def arrow_struct_type(self) -> "pa.StructType": + """Arrow struct type for UUID values.""" + return self._arrow_struct_type + + def python_to_struct_dict(self, value: Any) -> dict[str, bytes]: + """Convert a ``uuid.UUID`` to a struct dictionary. + + Args: + value: A ``uuid.UUID`` instance. + + Returns: + ``{"uuid": <16 bytes>}`` + + Raises: + TypeError: If ``value`` is not a ``uuid.UUID``. + """ + if not isinstance(value, _uuid_module.UUID): + raise TypeError( + f"Expected uuid.UUID, got {type(value).__name__}" + ) + return {"uuid": value.bytes} + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> _uuid_module.UUID: + """Convert a struct dictionary back to a ``uuid.UUID``. + + Args: + struct_dict: Dict with a ``"uuid"`` key holding 16 bytes. + + Returns: + The reconstructed ``uuid.UUID``. + + Raises: + ValueError: If the ``"uuid"`` field is missing. + """ + raw = struct_dict.get("uuid") + if raw is None: + raise ValueError("Missing 'uuid' field in struct dict") + return _uuid_module.UUID(bytes=bytes(raw)) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +uv run pytest tests/test_semantic_types/test_uuid_struct_converter.py -v +``` + +Expected: all PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/orcapod/semantic_types/semantic_struct_converters.py \ + tests/test_semantic_types/test_uuid_struct_converter.py +git commit -m "feat(semantic-types): add UUIDStructConverter for uuid.UUID Arrow struct round-trips (PLT-1162)" +``` + +--- + +### Task 3: Register UUIDStructConverter in the default semantic registry + +**Files:** +- Modify: `src/orcapod/hashing/versioned_hashers.py` +- Modify: `tests/test_semantic_types/test_semantic_registry.py` + +- [ ] **Step 1: Find the fixture name used in existing registry tests** + +```bash +grep -n "def.*registry\|fixture.*registry" \ + tests/test_semantic_types/test_semantic_registry.py tests/conftest.py 2>/dev/null | head -10 +``` + +Note the fixture name (e.g. `default_registry`, `registry`, `semantic_registry`). Use that exact name in the new tests below. + +- [ ] **Step 1b: Write the failing tests** + +Open `tests/test_semantic_types/test_semantic_registry.py` and add (replacing `default_registry` with the fixture name found above): + +```python +import uuid + +import pyarrow as pa + +from orcapod.types import UUID_STRUCT_ARROW_TYPE + + +def test_uuid_type_registered_in_default_registry(default_registry): + """uuid.UUID should be registered and map to UUID_STRUCT_ARROW_TYPE.""" + struct_type = default_registry.get_struct_for_python_type(uuid.UUID) + assert struct_type == UUID_STRUCT_ARROW_TYPE + + +def test_uuid_struct_resolves_to_python_type(default_registry): + """UUID_STRUCT_ARROW_TYPE should resolve back to uuid.UUID.""" + python_type = default_registry.get_python_type_for_struct(UUID_STRUCT_ARROW_TYPE) + assert python_type is uuid.UUID + + +def test_uuid_semantic_type_name_registered(default_registry): + """Converter registered under the name 'uuid'.""" + converter = default_registry.get_converter_for_semantic_type("uuid") + assert converter is not None + assert converter.python_type is uuid.UUID +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +uv run pytest tests/test_semantic_types/test_semantic_registry.py::test_uuid_type_registered_in_default_registry -v +``` + +Expected: FAIL — `uuid.UUID` not registered + +- [ ] **Step 3: Register the converter in `versioned_hashers.py`** + +Open `src/orcapod/hashing/versioned_hashers.py`. Find the block that registers `PythonPathStructConverter` (around line 125-138): + +```python + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter + + registry: Any = SemanticTypeRegistry() + file_hasher = BasicFileHasher(algorithm="sha256") + path_converter: Any = PythonPathStructConverter(file_hasher=file_hasher) + registry.register_converter("path", path_converter) +``` + +Extend it to also register `UUIDStructConverter`: + +```python + from orcapod.semantic_types.semantic_struct_converters import ( + PythonPathStructConverter, + UUIDStructConverter, + ) + + registry: Any = SemanticTypeRegistry() + file_hasher = BasicFileHasher(algorithm="sha256") + path_converter: Any = PythonPathStructConverter(file_hasher=file_hasher) + registry.register_converter("path", path_converter) + uuid_converter: Any = UUIDStructConverter() + registry.register_converter("uuid", uuid_converter) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +uv run pytest tests/test_semantic_types/test_semantic_registry.py::test_uuid_type_registered_in_default_registry tests/test_semantic_types/test_semantic_registry.py::test_uuid_struct_resolves_to_python_type tests/test_semantic_types/test_semantic_registry.py::test_uuid_semantic_type_name_registered -v +``` + +Expected: all PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/orcapod/hashing/versioned_hashers.py \ + tests/test_semantic_types/test_semantic_registry.py +git commit -m "feat(semantic-types): register UUIDStructConverter in default semantic registry (PLT-1162)" +``` + +--- + +### Task 4: Update UUIDHandler in semantic hashing + +**Files:** +- Modify: `src/orcapod/hashing/semantic_hashing/builtin_handlers.py` +- Modify: (existing test file for builtin handlers — find with `grep -rl "UUIDHandler" tests/`) + +- [ ] **Step 1: Find the existing UUIDHandler test** + +```bash +grep -rl "UUIDHandler\|uuid.*handler\|handler.*uuid" tests/ --include="*.py" -i +``` + +If a file is found, open it and locate the test asserting that handling a `uuid.UUID` returns its string form — update it per Step 2 below. + +If **no file is found**, create `tests/test_hashing/test_uuid_handler.py`: + +```python +"""Tests for UUIDHandler in semantic hashing.""" +``` + +Then proceed to Step 2. + +- [ ] **Step 2: Update the test to expect bytes** + +In the test file found above, update the UUID handler test: + +```python +import uuid as _uuid + +def test_uuid_handler_returns_bytes(): + """UUIDHandler should return the 16-byte binary representation.""" + from orcapod.hashing.semantic_hashing.builtin_handlers import UUIDHandler + + handler = UUIDHandler() + u = _uuid.UUID("550e8400-e29b-41d4-a716-446655440000") + result = handler.handle(u, hasher=None) # type: ignore[arg-type] + assert result == u.bytes + assert isinstance(result, bytes) + assert len(result) == 16 + + +def test_uuid_handler_different_uuids_produce_different_bytes(): + from orcapod.hashing.semantic_hashing.builtin_handlers import UUIDHandler + + handler = UUIDHandler() + u1 = _uuid.uuid4() + u2 = _uuid.uuid4() + assert handler.handle(u1, None) != handler.handle(u2, None) # type: ignore[arg-type] +``` + +- [ ] **Step 3: Run to verify test fails** + +```bash +uv run pytest tests/ -k "uuid_handler" -v +``` + +Expected: FAIL — result is a string, not bytes + +- [ ] **Step 4: Update `UUIDHandler` in `builtin_handlers.py`** + +Open `src/orcapod/hashing/semantic_hashing/builtin_handlers.py`. Find `UUIDHandler` (around line 141): + +```python +class UUIDHandler: + """ + Handler for uuid.UUID objects. + + Converts the UUID to its canonical hyphenated string representation + (e.g. ``"550e8400-e29b-41d4-a716-446655440000"``), which is stable, + human-readable, and unambiguous. + """ + + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + return str(obj) +``` + +Replace with: + +```python +class UUIDHandler: + """Handler for ``uuid.UUID`` objects. + + Returns the raw 16-byte binary representation of the UUID, consistent + with OrcaPod's canonical ``pa.binary(16)`` Arrow storage format. + The binary form is compact, unambiguous, and independent of string + formatting conventions. + """ + + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + return obj.bytes +``` + +- [ ] **Step 5: Run tests to verify they pass** + +```bash +uv run pytest tests/ -k "uuid_handler" -v +``` + +Expected: all PASS + +- [ ] **Step 6: Commit** + +```bash +git add src/orcapod/hashing/semantic_hashing/builtin_handlers.py +git commit -m "fix(hashing): UUIDHandler returns bytes instead of str, consistent with binary Arrow storage (PLT-1162)" +``` + +--- + +### Task 5: Update system UUID column Arrow types in arrow_data_utils.py + +**Files:** +- Modify: `src/orcapod/utils/arrow_data_utils.py` + +- [ ] **Step 1: Locate all UUID column field definitions** + +The UUID columns that need updating are **only** the following — all others (e.g. `source_id`, `pipeline_hash`, string metadata) are NOT UUIDs and must be left as `pa.large_string()`: + +| Column name | File | Contains UUID? | +|---|---|---| +| `datagram_id` | `datagram.py`, `arrow_data_utils.py` | ✅ uuid7 | +| `record_id` / `record_id_col_name` | `arrow_data_utils.py`, `data_function.py` | ✅ uuid7 | +| `_log_id` | `logging_observer.py` | ✅ uuid7 | +| `_status_id` | `status_observer.py` | ✅ uuid7 | +| `source_id` | `arrow_data_utils.py`, `stream_builder.py` | ❌ hash string | +| `_source_*` columns | `arrow_data_utils.py` | ❌ provenance string | +| `pipeline_hash` | various | ❌ hash string | + +Run this to confirm which lines in `arrow_data_utils.py` define UUID-carrying fields: + +```bash +grep -n "record_id\|datagram_id\|_log_id\|_status_id" \ + src/orcapod/utils/arrow_data_utils.py | grep -i "field\|large_string\|array" +``` + +- [ ] **Step 2: Update imports in `arrow_data_utils.py`** + +Add the import for `UUID_ARROW_TYPE` at the top of `src/orcapod/utils/arrow_data_utils.py`, alongside any existing `orcapod` imports: + +```python +from orcapod.types import UUID_ARROW_TYPE +``` + +- [ ] **Step 3: Replace UUID column field types** + +For every `pa.field(record_id_col_name, pa.large_string(), ...)` and `pa.array(..., type=pa.large_string())` that is used for UUID columns (identified in Step 1), replace `pa.large_string()` with `UUID_ARROW_TYPE`. Leave non-UUID `large_string` columns (e.g. `source_id` hash columns, string metadata) unchanged. + +Example — if you see: + +```python +pa.field(record_id_col_name, pa.large_string(), nullable=False) +``` + +Change to: + +```python +pa.field(record_id_col_name, UUID_ARROW_TYPE, nullable=False) +``` + +And if you see an array constructor for record IDs: + +```python +pa.array(record_ids, type=pa.large_string()) +``` + +Change to: + +```python +pa.array(record_ids, type=UUID_ARROW_TYPE) +``` + +- [ ] **Step 4: Run the relevant test suite to surface failures** + +```bash +uv run pytest tests/test_core/ tests/test_utils/ -x -v 2>&1 | head -60 +``` + +Fix any type assertion failures by updating tests that check `pa.large_string()` for UUID columns to check `pa.binary(16)` instead. + +- [ ] **Step 5: Commit** + +```bash +git add src/orcapod/utils/arrow_data_utils.py tests/ +git commit -m "fix(arrow): update system UUID column Arrow types to pa.binary(16) (PLT-1162)" +``` + +--- + +### Task 6: Update UUID generation sites to produce bytes + +**Files:** +- Modify: `src/orcapod/core/datagrams/datagram.py` +- Modify: `src/orcapod/core/data_function.py` +- Modify: `src/orcapod/pipeline/logging_observer.py` +- Modify: `src/orcapod/pipeline/status_observer.py` + +- [ ] **Step 1: Update `datagram.py`** + +Open `src/orcapod/core/datagrams/datagram.py`. Find the `datagram_id` property (around line 432): + +```python +@property +def datagram_id(self) -> str: + if self._datagram_id is None: + self._datagram_id = str(uuid7()) + return self._datagram_id +``` + +Change to: + +```python +@property +def datagram_id(self) -> bytes: + if self._datagram_id is None: + self._datagram_id = uuid7().bytes + return self._datagram_id +``` + +Also update the `_datagram_id` instance variable type annotation (if present) from `str | None` to `bytes | None`, and update the `__init__` parameter type annotation for `datagram_id` from `str | None` to `bytes | None`. + +- [ ] **Step 2: Update `data_function.py`** + +Open `src/orcapod/core/data_function.py`. Find the `record_id` generation (around line 555): + +```python +record_id = str(uuid7()) +``` + +Change to: + +```python +record_id = uuid7().bytes +``` + +Update any type annotation for `record_id` in the same method from `str` to `bytes`. + +- [ ] **Step 3: Update `logging_observer.py`** + +Open `src/orcapod/pipeline/logging_observer.py`. Find the `_log_id` generation (around line 103): + +```bash +grep -n "uuid7\|log_id" src/orcapod/pipeline/logging_observer.py +``` + +Change all `str(uuid7())` used for Arrow-stored IDs to `uuid7().bytes`. + +- [ ] **Step 4: Update `status_observer.py`** + +Open `src/orcapod/pipeline/status_observer.py`. Find the `_status_id` generation (lines 273 and 400): + +```bash +grep -n "uuid7\|status_id" src/orcapod/pipeline/status_observer.py +``` + +Change all `str(uuid7())` used for Arrow-stored IDs to `uuid7().bytes`. + +- [ ] **Step 5: Run the core test suite** + +```bash +uv run pytest tests/test_core/datagrams/ tests/test_core/data_function/ \ + tests/test_observability/ -x -v 2>&1 | head -80 +``` + +Fix any type assertion failures — tests that previously checked `isinstance(datagram_id, str)` should now check `isinstance(datagram_id, bytes)` and `len(datagram_id) == 16`. + +- [ ] **Step 6: Commit** + +```bash +git add src/orcapod/core/datagrams/datagram.py \ + src/orcapod/core/data_function.py \ + src/orcapod/pipeline/logging_observer.py \ + src/orcapod/pipeline/status_observer.py \ + tests/ +git commit -m "fix(core): UUID generation sites produce bytes instead of str (PLT-1162)" +``` + +--- + +### Task 7: Update PostgreSQL connector UUID mapping + +**Files:** +- Modify: `src/orcapod/databases/postgresql_connector.py` +- Modify: `tests/test_databases/` (find PostgreSQL connector test) + +- [ ] **Step 1: Locate and write the failing test** + +```bash +grep -rl "PostgreSQL\|postgresql\|pg_type_to_arrow\|uuid" tests/test_databases/ --include="*.py" | head -5 +``` + +Open the relevant test file and add: + +```python +import pyarrow as pa +from orcapod.types import UUID_ARROW_TYPE + + +def test_pg_uuid_maps_to_binary16(): + """PostgreSQL 'uuid' type must map to UUID_ARROW_TYPE = pa.binary(16).""" + from orcapod.databases.postgresql_connector import _pg_type_to_arrow + + result = _pg_type_to_arrow("uuid") + assert result == UUID_ARROW_TYPE + assert result == pa.binary(16) +``` + +- [ ] **Step 2: Run to verify test fails** + +```bash +uv run pytest tests/test_databases/ -k "pg_uuid_maps" -v +``` + +Expected: FAIL — returns `pa.large_string()` + +- [ ] **Step 3: Update `_pg_type_to_arrow` in `postgresql_connector.py`** + +Open `src/orcapod/databases/postgresql_connector.py`. Add the import near the top of the file (or inside the function): + +```python +from orcapod.types import UUID_ARROW_TYPE +``` + +Find the UUID branch in `_pg_type_to_arrow` (around line 95-100): + +```python + if t in ("text", "varchar", "character varying", "char", "bpchar", + "name", "uuid", "json", "jsonb", "time", "timetz"): + if t in ("time", "timetz"): + logger.warning("PostgreSQL type %r mapped to pa.large_string() (known gap)", t) + if t == "uuid": + # TODO: revisit mapping once PLT-1162 decides on a canonical UUID Arrow type + pass + return _pa.large_string() +``` + +Restructure to handle `uuid` separately before the grouped string branch: + +```python + if t == "uuid": + return UUID_ARROW_TYPE + + if t in ("text", "varchar", "character varying", "char", "bpchar", + "name", "json", "jsonb", "time", "timetz"): + if t in ("time", "timetz"): + logger.warning("PostgreSQL type %r mapped to pa.large_string() (known gap)", t) + return _pa.large_string() +``` + +- [ ] **Step 4: Handle bytes conversion in array construction** + +The PostgreSQL driver (`psycopg2`) returns `uuid` column values as Python `uuid.UUID` objects. When building Arrow arrays from fetched rows the connector must convert these to `bytes` before passing to `pa.binary(16)`. + +First, locate the array-construction code: + +```bash +grep -n "pa\.array\|pa\.table\|pa\.record_batch\|pyarrow\|_coerce\|_build\|_rows_to" \ + src/orcapod/databases/postgresql_connector.py | head -30 +``` + +The connector will have a loop that collects column values and calls `pa.array(column_values, type=arrow_type)`. Add the following module-level helper above that loop: + +```python +def _coerce_pg_value(value: Any, arrow_type: "pa.DataType") -> Any: + """Coerce a psycopg2 value to match the target Arrow type. + + Args: + value: Raw value from the psycopg2 cursor. + arrow_type: The Arrow type the value will be stored as. + + Returns: + A Python value compatible with ``pa.array(..., type=arrow_type)``. + """ + import pyarrow as _pa + if value is None: + return None + # psycopg2 returns uuid columns as uuid.UUID objects; binary(16) needs bytes + if arrow_type == _pa.binary(16) and hasattr(value, "bytes"): + return value.bytes + return value +``` + +Then in the column-building loop, wrap each value: + +```python +# Before +column_values = [row[col_idx] for row in rows] + +# After +column_values = [_coerce_pg_value(row[col_idx], arrow_type) for row in rows] +``` + +If the connector uses a different pattern (e.g. a single `pa.Table.from_pydict` call), apply the same coercion on the dict values for any `pa.binary(16)` column. + +- [ ] **Step 5: Run tests to verify they pass** + +```bash +uv run pytest tests/test_databases/ -x -v 2>&1 | head -60 +``` + +Expected: all PASS + +- [ ] **Step 6: Commit** + +```bash +git add src/orcapod/databases/postgresql_connector.py tests/test_databases/ +git commit -m "fix(databases): map PostgreSQL uuid columns to UUID_ARROW_TYPE (pa.binary(16)) (PLT-1162)" +``` + +--- + +### Task 8: Full test suite — fix remaining failures + +- [ ] **Step 1: Run the full test suite** + +```bash +uv run pytest tests/ -x --tb=short 2>&1 | tail -40 +``` + +- [ ] **Step 2: For each failure, apply the fix pattern** + +Most remaining failures will be one of these patterns: + +**Pattern A — test asserts `pa.large_string()` for a UUID column:** +```python +# Before (failing) +assert schema.field("datagram_id").type == pa.large_string() +# After (fixed) +assert schema.field("datagram_id").type == pa.binary(16) +``` + +**Pattern B — test constructs a UUID value as a string:** +```python +# Before (failing) +datagram = Datagram(datagram_id="550e8400-e29b-41d4-a716-446655440000") +# After (fixed) +import uuid +datagram = Datagram(datagram_id=uuid.UUID("550e8400-e29b-41d4-a716-446655440000").bytes) +``` + +**Pattern C — test checks `isinstance(x, str)` for a UUID column value:** +```python +# Before (failing) +assert isinstance(datagram.datagram_id, str) +# After (fixed) +assert isinstance(datagram.datagram_id, bytes) +assert len(datagram.datagram_id) == 16 +``` + +- [ ] **Step 3: Run the full suite again to confirm all pass** + +```bash +uv run pytest tests/ --tb=short 2>&1 | tail -20 +``` + +Expected: all tests PASS, no failures + +- [ ] **Step 4: Commit** + +```bash +git add tests/ +git commit -m "test: update UUID column type assertions from large_string to binary(16) (PLT-1162)" +``` + +--- + +### Task 9: Post-implementation linear comment + +- [ ] **Step 1: Post design note to PLT-1162** + +Post a comment on [PLT-1162](https://linear.app/enigma-metamorphic/issue/PLT-1162) summarising the decision: + +``` +mcp__claude_ai_Linear__save_comment( + issueId: "PLT-1162", + body: "## Decision + +Canonical UUID Arrow type: `pa.binary(16)` (`fixed_size_binary[16]`) for all UUID values in OrcaPod. + +**Two layers:** +- `UUID_ARROW_TYPE = pa.binary(16)` — all system UUID columns (datagram_id, record_id, _log_id, _status_id) +- `UUID_STRUCT_ARROW_TYPE = pa.struct([pa.field(\"uuid\", pa.binary(16))])` — semantic type for uuid.UUID Python round-trips + +**Alternatives rejected:** +- `pa.large_string()` — 2.75× larger, no type distinction from arbitrary strings +- `pa.uuid()` — Polars 1.31 hard incompatibility (ComputeError on conversion) +- struct with large_string inner type — UUID is fundamentally binary, not a string + +**Known limitation:** Polars `write_json()` panics on Binary columns (pola-rs/polars#15410). OrcaPod has zero `write_json()` calls; unaffected in practice. + +Full spec: `superpowers/specs/2026-06-11-plt-1162-uuid-arrow-type-design.md`" +) +``` + +--- + +### Task 10: Push and create PR + +- [ ] **Step 1: Push branch** + +```bash +git push -u origin eywalker/plt-1162-design-spike-uuid-arrow-type-mapping-large_string-vs +``` + +- [ ] **Step 2: Invoke `sensei:create-pr`** + +Use the `sensei:create-pr` skill to create the pull request targeting `main`. diff --git a/superpowers/specs/2026-06-11-plt-1162-uuid-arrow-type-design.md b/superpowers/specs/2026-06-11-plt-1162-uuid-arrow-type-design.md new file mode 100644 index 00000000..cd64ad07 --- /dev/null +++ b/superpowers/specs/2026-06-11-plt-1162-uuid-arrow-type-design.md @@ -0,0 +1,213 @@ +# UUID Arrow Type Mapping Design + +**Date:** 2026-06-11 +**Linear issue:** [PLT-1162](https://linear.app/enigma-metamorphic/issue/PLT-1162/design-spike-uuid-arrow-type-mapping-large_string-vs-dedicated-struct) +**Status:** Approved +**Breaking change:** Yes — intentional and acceptable at pre-v0.1.0 + +--- + +## Overview + +OrcaPod has historically stored UUID values (both internal system identifiers and +user-facing data) as `pa.large_string()`. This spec formalises the canonical Arrow +representation for all UUID values: **`pa.binary(16)` (`fixed_size_binary[16]`)**, +with a named constant for system use and a dedicated semantic struct type for +user-facing `uuid.UUID` round-trips through the type system. + +All UUID values are stored at full 16-byte precision. No truncation or abbreviation +is permitted. + +--- + +## Goals & Success Criteria + +- A single named constant `UUID_ARROW_TYPE = pa.binary(16)` is the authoritative + Arrow type for all internal UUID columns (`datagram_id`, `record_id`, `_log_id`, + `_status_id`, system tag identifier columns). +- A semantic struct type `pa.struct([pa.field("uuid", pa.binary(16))])` enables + `uuid.UUID` Python objects to round-trip through Arrow, registering in the + semantic type registry alongside `path`/`upath`. +- Every UUID stored in Arrow uses the full 16 bytes. No hex strings, no dashes, no + truncated representations. +- The `# TODO: revisit mapping once PLT-1162 decides` comment in + `databases/postgresql_connector.py` is removed and replaced with the correct + binary mapping. +- All DB connectors (`PostgreSQLConnector`, `SQLiteConnector`) decode string UUID + values from the DB driver into `bytes` before constructing Arrow arrays. +- No `pa.large_string()` UUID columns remain in system-managed schemas. + +--- + +## Design + +### Two-layer UUID representation + +| Layer | Arrow type | Used for | +|---|---|---| +| **System** | `pa.binary(16)` via `UUID_ARROW_TYPE` | `datagram_id`, `record_id`, `_log_id`, `_status_id`, system tag ID columns | +| **Semantic** | `pa.struct([pa.field("uuid", pa.binary(16))])` via `UUID_STRUCT_ARROW_TYPE` | User functions returning `uuid.UUID`; DB `uuid` columns mapped through the semantic type system | + +### Named constants + +Place both constants in `src/orcapod/types.py` alongside `Schema`, `ColumnConfig`, +and `ContentHash`: + +```python +import pyarrow as pa + +# Canonical Arrow type for all UUID values in OrcaPod. +# Stored as fixed_size_binary[16] — 16 raw bytes, no hex encoding, no dashes. +UUID_ARROW_TYPE: pa.DataType = pa.binary(16) + +# Semantic struct type for Python uuid.UUID round-trips through the type system. +# Follows the same single-field struct pattern as path/upath. +UUID_STRUCT_ARROW_TYPE: pa.StructType = pa.struct( + [pa.field("uuid", UUID_ARROW_TYPE)] +) +``` + +### UUID generation at all sites + +All sites that generate a UUID for Arrow storage must produce `bytes`, not `str`: + +```python +# Before +self._datagram_id = str(uuid7()) + +# After +self._datagram_id = uuid7().bytes +``` + +`uuid_utils.uuid7()` returns a `uuid.UUID`-compatible object with a `.bytes` +property. Standard `uuid.uuid4()` and `uuid.uuid5()` likewise provide `.bytes`. + +`run_id` in `sync_orchestrator.py` and `async_orchestrator.py` is a **transient +Python string** passed to observer callbacks — it is never stored in an Arrow +column and is explicitly out of scope for this change. It remains `str(uuid.uuid4())`. + +### Semantic struct converter + +Add `UUIDStructConverter` in `src/orcapod/semantic_types/semantic_struct_converters.py`, +following the `PythonPathStructConverter`/`UPathStructConverter` pattern: + +```python +class UUIDStructConverter(SemanticStructConverterBase): + """Converts Python ``uuid.UUID`` objects to/from the OrcaPod UUID struct.""" + + python_type = uuid.UUID + arrow_struct_type = UUID_STRUCT_ARROW_TYPE + + def python_to_struct_dict(self, value: uuid.UUID) -> dict[str, bytes]: + return {"uuid": value.bytes} + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> uuid.UUID: + return uuid.UUID(bytes=struct_dict["uuid"]) +``` + +Register `UUIDStructConverter` in the default semantic type registry alongside +`PythonPathStructConverter` and `UPathStructConverter`. + +### DB connector mapping + +**PostgreSQL** (`databases/postgresql_connector.py`): + +The `_pg_type_to_arrow` function currently maps `uuid` to `pa.large_string()` with +a TODO comment. Replace with `UUID_ARROW_TYPE`: + +```python +if t == "uuid": + return UUID_ARROW_TYPE +``` + +The PostgreSQL driver (`psycopg2`/`asyncpg`) returns UUID column values as Python +`uuid.UUID` objects or strings depending on the driver and registration. During +array construction, extract `.bytes` from each value before passing to the +`pa.binary(16)` array builder. + +**SQLite** (`databases/sqlite_connector.py`): + +SQLite has no native UUID type; all UUID columns are stored as `TEXT` and arrive +as strings from the driver. Because `TEXT` is used for all string-like data, the +connector cannot automatically distinguish UUID columns from other string columns. +`TEXT` columns therefore continue to map to `pa.large_string()` by default. +No change is required to the SQLite connector as part of this spec. The broader +question of how OrcaPod preserves Arrow type fidelity when round-tripping through +loosely-typed backends is tracked in +[PLT-1615](https://linear.app/enigma-metamorphic/issue/PLT-1615). + +### No truncation — enforcement + +The rule "no truncation of UUID values" specifically means: + +- No `uuid4().hex[:N]` patterns for UUID identity columns. +- No `str(uuid7())[:N]` style abbreviations. +- UUID-carrying Arrow columns always use the full 16-byte representation. + +The broader audit of hardcoded hash truncation lengths across the codebase (covering +`job.py`, `serialization.py`, `universal_converter.py`, etc.) is tracked separately +under [PLT-1614](https://linear.app/enigma-metamorphic/issue/PLT-1614). + +--- + +## Alternatives Considered + +### `pa.large_string()` for all UUIDs + +Current state. Simple and Polars-compatible everywhere including `write_json()`. +Rejected because: semantically imprecise (UUID is binary, not a string); 2.75× larger +storage per UUID value; no type-level distinction from arbitrary strings. + +### `pa.uuid()` PyArrow extension type + +Available in PyArrow ≥ 20.0 as `pa.uuid()` (`arrow.uuid` extension over +`fixed_size_binary[16]`). Rejected because Polars 1.31 raises +`ComputeError: cannot create series from Extension(...)` on conversion — a hard +incompatibility. + +### `pa.struct([pa.field("uuid", pa.large_string())])` for semantic type + +Considered for the semantic struct inner type to stay consistent with the `path`/ +`upath` precedent. Rejected in favour of `binary(16)` because: paths are +fundamentally strings (their string form is their canonical representation), while +UUIDs are fundamentally binary (strings are merely a display convention). Using +`binary(16)` also aligns the semantic struct's inner type with the system constant, +giving a single canonical representation. + +--- + +## Known Limitations + +**Polars `write_json()` panics on `Binary`/`BinaryView` columns.** + +Tracked upstream: [pola-rs/polars#15410](https://github.com/pola-rs/polars/issues/15410) +(filed April 2024, open as of Polars 1.31.0). + +This is acceptable because: +- OrcaPod has zero calls to `pl.DataFrame.write_json()` anywhere in its codebase. +- The custom `serialize_pyarrow_table` in `hashing/arrow_utils.py` uses + `column.to_pylist()` + stdlib `json.dumps` and already handles `bytes` values + via base64 encoding — it is unaffected. +- The Polars team has this on their radar; the fix will propagate automatically + once merged. + +--- + +## Implementation Scope + +This spec covers the design decision. Implementation is a follow-on task: + +| File | Change | +|---|---| +| `src/orcapod/types.py` | Add `UUID_ARROW_TYPE`, `UUID_STRUCT_ARROW_TYPE` constants | +| `src/orcapod/semantic_types/semantic_struct_converters.py` | Add `UUIDStructConverter` | +| `src/orcapod/semantic_types/` (registry setup) | Register `UUIDStructConverter` | +| `src/orcapod/utils/arrow_data_utils.py` | Update system column field definitions | +| `src/orcapod/core/datagrams/datagram.py` | `str(uuid7())` → `uuid7().bytes` | +| `src/orcapod/core/data_function.py` | `str(uuid7())` → `uuid7().bytes` | +| `src/orcapod/pipeline/logging_observer.py` | `str(uuid7())` → `uuid7().bytes` | +| `src/orcapod/pipeline/status_observer.py` | `str(uuid7())` → `uuid7().bytes` | +| `src/orcapod/databases/postgresql_connector.py` | Remove TODO; map `uuid` → `UUID_ARROW_TYPE`; decode driver values to bytes | +| `src/orcapod/databases/sqlite_connector.py` | No change — `TEXT` columns remain `pa.large_string()` | +| `src/orcapod/hashing/semantic_hashing/builtin_handlers.py` | Update `UUIDHandler` to accept `bytes` input | +| Tests | Update all tests that assert `pa.large_string()` for UUID columns | diff --git a/test-objective/unit/test_arrow_data_utils.py b/test-objective/unit/test_arrow_data_utils.py index 58da04b9..6527e8ad 100644 --- a/test-objective/unit/test_arrow_data_utils.py +++ b/test-objective/unit/test_arrow_data_utils.py @@ -6,6 +6,8 @@ from __future__ import annotations +import uuid + import pyarrow as pa import pytest @@ -20,6 +22,11 @@ ) +def _make_uuid_bytes() -> bytes: + """Return 16 UUID bytes for use as a record_id.""" + return uuid.uuid4().bytes + + # --------------------------------------------------------------------------- # add_system_tag_columns # --------------------------------------------------------------------------- @@ -35,7 +42,7 @@ def test_adds_system_tag_columns(self): table, schema_hash="abc123", source_ids="src1", - record_ids=["rec1", "rec2"], + record_ids=[_make_uuid_bytes(), _make_uuid_bytes()], ) # Should have original columns plus new system tag columns assert result.num_rows == 2 @@ -61,7 +68,7 @@ def test_length_mismatch_raises(self): table, schema_hash="abc", source_ids=["s1", "s2"], # 2 source_ids for 3 rows - record_ids=["r1", "r2", "r3"], + record_ids=[_make_uuid_bytes(), _make_uuid_bytes(), _make_uuid_bytes()], ) @@ -80,7 +87,7 @@ def test_appends_value_to_system_tags(self): table, schema_hash="abc", source_ids="src1", - record_ids=["r1", "r2"], + record_ids=[_make_uuid_bytes(), _make_uuid_bytes()], ) result = append_to_system_tags(table_with_tags, value="::extra:0") # System tag column names should have changed (appended) @@ -121,7 +128,7 @@ def test_sorts_system_tag_values(self): table, schema_hash="abc", source_ids="src1", - record_ids=["r1", "r2"], + record_ids=[_make_uuid_bytes(), _make_uuid_bytes()], ) result = sort_system_tag_values(table_with_tags) assert result.num_rows == table_with_tags.num_rows diff --git a/test-objective/unit/test_databases.py b/test-objective/unit/test_databases.py index 04def11e..dad67bfc 100644 --- a/test-objective/unit/test_databases.py +++ b/test-objective/unit/test_databases.py @@ -42,8 +42,8 @@ def test_add_and_get_single_record(self) -> None: path = ("test", "table") record = _make_record(42) - db.add_record(path, "rec_1", record, flush=True) - result = db.get_record_by_id(path, "rec_1") + db.add_record(path, b"rec_1", record, flush=True) + result = db.get_record_by_id(path, b"rec_1") assert result is not None assert result.num_rows == 1 @@ -54,8 +54,8 @@ def test_add_and_get_preserves_data(self) -> None: path = ("data",) record = pa.table({"x": [10], "y": ["hello"]}) - db.add_record(path, "id1", record, flush=True) - result = db.get_record_by_id(path, "id1") + db.add_record(path, b"id1", record, flush=True) + result = db.get_record_by_id(path, b"id1") assert result is not None assert result["x"].to_pylist() == [10] @@ -70,7 +70,7 @@ def test_add_records_multiple_rows(self) -> None: path = ("batch",) records = pa.table( { - "__record_id": ["a", "b", "c"], + "__record_id": pa.array([b"a", b"b", b"c"], type=pa.large_binary()), "value": [1, 2, 3], } ) @@ -89,8 +89,8 @@ def test_get_all_records(self) -> None: db = InMemoryArrowDatabase() path = ("multi",) - db.add_record(path, "r1", _make_record(1), flush=True) - db.add_record(path, "r2", _make_record(2), flush=True) + db.add_record(path, b"r1", _make_record(1), flush=True) + db.add_record(path, b"r2", _make_record(2), flush=True) all_records = db.get_all_records(path) assert all_records is not None @@ -105,10 +105,10 @@ def test_get_records_by_ids_subset(self) -> None: path = ("subset",) for i in range(5): - db.add_record(path, f"id_{i}", _make_record(i)) + db.add_record(path, f"id_{i}".encode(), _make_record(i)) db.flush() - result = db.get_records_by_ids(path, ["id_1", "id_3"]) + result = db.get_records_by_ids(path, [b"id_1", b"id_3"]) assert result is not None assert result.num_rows == 2 @@ -120,37 +120,37 @@ def test_skip_duplicates_no_error(self) -> None: db = InMemoryArrowDatabase() path = ("dup",) - db.add_record(path, "same_id", _make_record(1), flush=True) + db.add_record(path, b"same_id", _make_record(1), flush=True) # Adding same ID again with skip_duplicates=True should not raise. - db.add_record(path, "same_id", _make_record(2), skip_duplicates=True, flush=True) + db.add_record(path, b"same_id", _make_record(2), skip_duplicates=True, flush=True) - result = db.get_record_by_id(path, "same_id") + result = db.get_record_by_id(path, b"same_id") assert result is not None # Original record should be preserved (duplicate was skipped). assert result.num_rows == 1 class TestInMemoryArrowDatabasePendingBatch: - """Pending batch semantics: records not visible until flush().""" + """Pending batch semantics: records accessible before flush().""" def test_records_accessible_before_flush(self) -> None: db = InMemoryArrowDatabase() path = ("pending",) - db.add_record(path, "p1", _make_record(1)) + db.add_record(path, b"p1", _make_record(1)) # Records should be accessible via public API even before flush - result = db.get_record_by_id(path, "p1") + result = db.get_record_by_id(path, b"p1") assert result is not None, "Record should be accessible via get_record_by_id before flush" def test_flush_makes_records_visible(self) -> None: db = InMemoryArrowDatabase() path = ("pending",) - db.add_record(path, "p1", _make_record(1)) + db.add_record(path, b"p1", _make_record(1)) db.flush() # After flush, records should still be accessible via public API - result = db.get_record_by_id(path, "p1") + result = db.get_record_by_id(path, b"p1") assert result is not None, "Record should be accessible after flush" all_records = db.get_all_records(path) @@ -165,8 +165,8 @@ def test_flush_commits_pending(self) -> None: db = InMemoryArrowDatabase() path = ("flush_test",) - db.add_record(path, "f1", _make_record(10)) - db.add_record(path, "f2", _make_record(20)) + db.add_record(path, b"f1", _make_record(10)) + db.add_record(path, b"f2", _make_record(20)) db.flush() all_records = db.get_all_records(path) @@ -180,7 +180,7 @@ class TestInMemoryArrowDatabaseInvalidPath: def test_empty_path_raises(self) -> None: db = InMemoryArrowDatabase() with pytest.raises(ValueError, match="cannot be empty"): - db.add_record((), "id", _make_record()) + db.add_record((), b"id", _make_record()) class TestInMemoryArrowDatabaseNonexistentPath: @@ -188,7 +188,7 @@ class TestInMemoryArrowDatabaseNonexistentPath: def test_get_record_by_id_nonexistent(self) -> None: db = InMemoryArrowDatabase() - result = db.get_record_by_id(("no", "such"), "missing_id") + result = db.get_record_by_id(("no", "such"), b"missing_id") assert result is None def test_get_all_records_nonexistent(self) -> None: @@ -198,7 +198,7 @@ def test_get_all_records_nonexistent(self) -> None: def test_get_records_by_ids_nonexistent(self) -> None: db = InMemoryArrowDatabase() - result = db.get_records_by_ids(("no", "such"), ["a", "b"]) + result = db.get_records_by_ids(("no", "such"), [b"a", b"b"]) assert result is None @@ -212,7 +212,7 @@ class TestNoOpArrowDatabaseWrites: def test_add_record_no_error(self) -> None: db = NoOpArrowDatabase() - db.add_record(("path",), "id", _make_record()) + db.add_record(("path",), b"id", _make_record()) def test_add_records_no_error(self) -> None: db = NoOpArrowDatabase() @@ -224,8 +224,8 @@ class TestNoOpArrowDatabaseReads: def test_get_record_by_id_returns_none(self) -> None: db = NoOpArrowDatabase() - db.add_record(("path",), "id", _make_record()) - assert db.get_record_by_id(("path",), "id") is None + db.add_record(("path",), b"id", _make_record()) + assert db.get_record_by_id(("path",), b"id") is None def test_get_all_records_returns_none(self) -> None: db = NoOpArrowDatabase() @@ -233,7 +233,7 @@ def test_get_all_records_returns_none(self) -> None: def test_get_records_by_ids_returns_none(self) -> None: db = NoOpArrowDatabase() - assert db.get_records_by_ids(("path",), ["a"]) is None + assert db.get_records_by_ids(("path",), [b"a"]) is None def test_get_records_with_column_value_returns_none(self) -> None: db = NoOpArrowDatabase() @@ -262,8 +262,8 @@ def test_add_and_get_record(self, tmp_path: object) -> None: path = ("delta", "test") record = _make_record(99) - db.add_record(path, "d1", record, flush=True) - result = db.get_record_by_id(path, "d1") + db.add_record(path, b"d1", record, flush=True) + result = db.get_record_by_id(path, b"d1") assert result is not None assert result.num_rows == 1 @@ -279,12 +279,12 @@ def test_flush_persists_to_disk(self, tmp_path: object) -> None: path = ("persist",) record = _make_record(7) - db.add_record(path, "p1", record) + db.add_record(path, b"p1", record) db.flush() # Create a new database instance pointing at the same path to verify # data was persisted. db2 = DeltaTableDatabase(base_path=tmp_path, create_base_path=False) - result = db2.get_record_by_id(path, "p1") + result = db2.get_record_by_id(path, b"p1") assert result is not None assert result["col_a"].to_pylist() == [7] diff --git a/tests/test_core/data_function/test_cached_data_function.py b/tests/test_core/data_function/test_cached_data_function.py index aa5b35dc..260d5f68 100644 --- a/tests/test_core/data_function/test_cached_data_function.py +++ b/tests/test_core/data_function/test_cached_data_function.py @@ -211,7 +211,7 @@ def test_skip_cache_lookup_still_returns_result(self, cached_pf, input_data): def test_skip_cache_lookup_adds_another_record(self, cached_pf, input_data, db): cached_pf.call(input_data) # first call — inserts record # Second call with skip_cache_lookup=True tries to insert again; - # skip_duplicates=False is the default, but the data has a new datagram_id + # skip_duplicates=False is the default, but the data has a new datagram_uuid # so a second record with the same input_data_hash is inserted. cached_pf.call(input_data, skip_cache_lookup=True) stored = db.get_all_records(cached_pf.record_path) diff --git a/tests/test_core/data_function/test_data_function.py b/tests/test_core/data_function/test_data_function.py index 3eac088e..5cbb791f 100644 --- a/tests/test_core/data_function/test_data_function.py +++ b/tests/test_core/data_function/test_data_function.py @@ -521,12 +521,16 @@ def test_source_info_record_id_is_uuid(self, add_pf, add_data): result = add_pf.call(add_data) source_str = result.source_info()["result"] - # The record_id segment is between the URI components and the key name - # Format: uri_part1:uri_part2:..::record_id::key - uuid_pattern = re.compile( - r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" + # Format: :::: + # Extract the second "::" segment which is always the UUID hex. + parts = source_str.split("::") + assert len(parts) == 3, ( + f"Expected exactly 3 '::'-separated segments in {source_str!r}, got {len(parts)}" + ) + uuid_hex_segment = parts[1] + assert re.fullmatch(r"[0-9a-f]{32}", uuid_hex_segment), ( + f"Record ID segment {uuid_hex_segment!r} is not a 32-char lowercase hex string" ) - assert uuid_pattern.search(source_str), f"No UUID found in {source_str!r}" def test_inactive_returns_none(self, add_pf, add_data): add_pf.set_active(False) diff --git a/tests/test_core/function_pod/test_function_node_caching.py b/tests/test_core/function_pod/test_function_node_caching.py index 735f571a..64f94bf2 100644 --- a/tests/test_core/function_pod/test_function_node_caching.py +++ b/tests/test_core/function_pod/test_function_node_caching.py @@ -85,14 +85,16 @@ def _make_node(stream, db=None): class TestComputePipelineEntryId: - def test_returns_non_empty_string(self): + def test_returns_non_empty_bytes(self): stream = _make_stream([{"id": 0, "x": 10}]) node, _ = _make_node(stream) tag = Tag({"id": 0}) data = Data({"x": 10}) entry_id = node.compute_pipeline_entry_id(tag, data) - assert isinstance(entry_id, str) + assert isinstance(entry_id, bytes) assert len(entry_id) > 0 + # Must be method-prefixed: b"{method}:{raw_digest}" + assert b":" in entry_id def test_same_inputs_produce_same_id(self): stream = _make_stream([{"id": 0, "x": 10}]) diff --git a/tests/test_core/function_pod/test_function_pod_node.py b/tests/test_core/function_pod/test_function_pod_node.py index c8e33f71..df839f49 100644 --- a/tests/test_core/function_pod/test_function_pod_node.py +++ b/tests/test_core/function_pod/test_function_pod_node.py @@ -501,11 +501,11 @@ def test_input_data_hash_values_are_non_empty_strings(self, filled_node): hashes = result.column(constants.INPUT_DATA_HASH_COL).to_pylist() assert all(isinstance(h, str) and len(h) > 0 for h in hashes) - def test_data_record_id_values_are_non_empty_strings(self, filled_node): + def test_data_record_id_values_are_non_empty_bytes(self, filled_node): result = filled_node.get_all_records(columns={"meta": True}) assert result is not None ids = result.column(constants.DATA_RECORD_ID).to_pylist() - assert all(isinstance(rid, str) and len(rid) > 0 for rid in ids) + assert all(isinstance(rid, bytes) and len(rid) == 16 for rid in ids) # --------------------------------------------------------------------------- diff --git a/tests/test_core/nodes/test_function_node_iteration.py b/tests/test_core/nodes/test_function_node_iteration.py index f12db251..d22e2dfa 100644 --- a/tests/test_core/nodes/test_function_node_iteration.py +++ b/tests/test_core/nodes/test_function_node_iteration.py @@ -96,13 +96,14 @@ def test_iter_twice_same_order_db_queried_once(self): assert mock_load.call_count <= 1 # at most one DB query assert first == second - def test_cached_output_datas_keyed_by_entry_id_strings(self): - """After run(), _cached_output_datas keys are entry_id strings, not ints.""" + def test_cached_output_datas_keyed_by_entry_id_bytes(self): + """After run(), _cached_output_datas keys are entry_id bytes (method-prefixed digest).""" node = _make_node() node.run() assert len(node._cached_output_datas) == 3 for key in node._cached_output_datas: - assert isinstance(key, str), f"Expected str key, got {type(key)}: {key!r}" + assert isinstance(key, bytes), f"Expected bytes key, got {type(key)}: {key!r}" + assert b":" in key, f"Entry ID bytes should be method-prefixed, got {key!r}" def test_as_table_fresh_node_returns_empty_no_compute(self): """as_table() on a fresh node with no run() and empty DB returns empty table.""" diff --git a/tests/test_core/operators/test_operators.py b/tests/test_core/operators/test_operators.py index 63d5dcbb..232276f9 100644 --- a/tests/test_core/operators/test_operators.py +++ b/tests/test_core/operators/test_operators.py @@ -761,6 +761,55 @@ def test_predicted_schema_matches_result_stream_schema_with_system_tags(self): assert dict(predicted_tag) == dict(actual_tag) assert dict(predicted_pkt) == dict(actual_pkt) + def test_output_schema_preserves_system_tag_col_types(self): + """_predict_system_tag_schema must carry col_type through unchanged. + + The Schema stores Python types. record_id system-tag columns are + stored as ``pa.binary(16)`` in Arrow, which maps to ``bytes`` in + the Python Schema. source_id columns are ``pa.large_string()`` → + ``str``. After a two-way join both Python types must survive — no + silent coercion to ``str`` for record_id. + """ + from orcapod.core.sources.arrow_table_source import ArrowTableSource + from orcapod.system_constants import constants + + src_a = ArrowTableSource( + pa.table( + { + "id": pa.array([1, 2], type=pa.int64()), + "alpha": pa.array([10, 20], type=pa.int64()), + } + ), + tag_columns=["id"], + infer_nullable=True, + ) + src_b = ArrowTableSource( + pa.table( + { + "id": pa.array([1, 2], type=pa.int64()), + "beta": pa.array([100, 200], type=pa.int64()), + } + ), + tag_columns=["id"], + infer_nullable=True, + ) + + op = Join() + tag_schema, _ = op.output_schema(src_a, src_b, columns={"system_tags": True}) + + for col_name, col_type in tag_schema.items(): + if not col_name.startswith(constants.SYSTEM_TAG_PREFIX): + continue + if col_name.startswith(constants.SYSTEM_TAG_RECORD_ID_PREFIX): + assert col_type == bytes, ( + f"record_id column '{col_name}' should map to bytes, " + f"got {col_type!r}" + ) + else: + assert col_type == str, ( + f"source_id column '{col_name}' should map to str, got {col_type!r}" + ) + class TestSemiJoinBehavior: def test_semijoin_filters_left_by_right(self, left_stream, right_stream): @@ -1382,7 +1431,8 @@ def test_swapped_input_order_produces_identical_system_tags(self, three_sources) def test_system_tag_values_are_per_row_source_provenance(self, three_sources): """System tag column values should reflect the source provenance. - source_id columns contain the source_id, record_id columns contain the record_id.""" + source_id columns contain the source_id (str), record_id columns + contain the record_id (bytes, 16-byte binary).""" from orcapod.system_constants import constants src_a, src_b, src_c = three_sources @@ -1395,8 +1445,19 @@ def test_system_tag_values_are_per_row_source_provenance(self, three_sources): values = result_table.column(col).to_pylist() assert len(values) == result_table.num_rows for val in values: - assert isinstance(val, str) - assert len(val) > 0 + if col.startswith(constants.SYSTEM_TAG_RECORD_ID_PREFIX): + # record_id columns store 16-byte UUID values + assert isinstance(val, bytes), ( + f"record_id column {col!r} should contain bytes, got {type(val)}" + ) + assert len(val) == 16, ( + f"record_id column {col!r} should be 16 bytes" + ) + else: + # source_id and other system tag columns are large_string + assert isinstance(val, str), ( + f"system tag column {col!r} should contain str, got {type(val)}" + ) def test_intermediate_operators_produce_different_stream_hash(self): """When sources pass through intermediate operators before Join, diff --git a/tests/test_core/test_result_cache.py b/tests/test_core/test_result_cache.py index aa581776..4d1add6e 100644 --- a/tests/test_core/test_result_cache.py +++ b/tests/test_core/test_result_cache.py @@ -166,7 +166,7 @@ def test_most_recent_wins(self): # Lookup should return the most recent cached = cache.lookup(input_pkt) assert cached is not None - assert cached.datagram_id == output2.datagram_id + assert cached.datagram_uuid == output2.datagram_uuid # --------------------------------------------------------------------------- diff --git a/tests/test_databases/test_connector_arrow_database.py b/tests/test_databases/test_connector_arrow_database.py index 75e18acb..d87701b3 100644 --- a/tests/test_databases/test_connector_arrow_database.py +++ b/tests/test_databases/test_connector_arrow_database.py @@ -300,13 +300,13 @@ class TestEmptyTable: PATH = ("source", "v1") def test_get_record_by_id_returns_none_when_empty(self, db): - assert db.get_record_by_id(self.PATH, "id-1", flush=True) is None + assert db.get_record_by_id(self.PATH, b"id-1", flush=True) is None def test_get_all_records_returns_none_when_empty(self, db): assert db.get_all_records(self.PATH) is None def test_get_records_by_ids_returns_none_when_empty(self, db): - assert db.get_records_by_ids(self.PATH, ["id-1"], flush=True) is None + assert db.get_records_by_ids(self.PATH, [b"id-1"], flush=True) is None def test_get_records_with_column_value_returns_none_when_empty(self, db): assert ( @@ -325,51 +325,51 @@ class TestAddRecordRoundTrip: def test_added_record_retrievable_from_pending(self, db): record = make_table(value=[42]) - db.add_record(self.PATH, "id-1", record) - result = db.get_record_by_id(self.PATH, "id-1") + db.add_record(self.PATH, b"id-1", record) + result = db.get_record_by_id(self.PATH, b"id-1") assert result is not None assert result.column("value").to_pylist() == [42] def test_added_record_retrievable_after_flush(self, db): record = make_table(value=[99]) - db.add_record(self.PATH, "id-2", record) + db.add_record(self.PATH, b"id-2", record) db.flush() - result = db.get_record_by_id(self.PATH, "id-2", flush=True) + result = db.get_record_by_id(self.PATH, b"id-2", flush=True) assert result is not None assert result.column("value").to_pylist() == [99] def test_record_id_column_not_in_result_by_default(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-3", record) - result = db.get_record_by_id(self.PATH, "id-3") + db.add_record(self.PATH, b"id-3", record) + result = db.get_record_by_id(self.PATH, b"id-3") assert result is not None assert ConnectorArrowDatabase.RECORD_ID_COLUMN not in result.column_names def test_record_id_column_exposed_when_requested(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-4", record) + db.add_record(self.PATH, b"id-4", record) db.flush() result = db.get_record_by_id( - self.PATH, "id-4", record_id_column="my_id", flush=True + self.PATH, b"id-4", record_id_column="my_id", flush=True ) assert result is not None assert "my_id" in result.column_names - assert result.column("my_id").to_pylist() == ["id-4"] + assert result.column("my_id").to_pylist() == [b"id-4"] def test_unknown_record_returns_none(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-5", record) + db.add_record(self.PATH, b"id-5", record) db.flush() - assert db.get_record_by_id(self.PATH, "nonexistent", flush=True) is None + assert db.get_record_by_id(self.PATH, b"nonexistent", flush=True) is None def test_multi_row_record_deduplicates_to_last_row(self, db): # add_record stamps ALL rows with the same __record_id value, so # within-batch deduplication (keep-last) leaves a single row. # This mirrors InMemoryArrowDatabase behaviour by design. record = make_table(x=[1, 2, 3]) - db.add_record(self.PATH, "multi-row", record) + db.add_record(self.PATH, b"multi-row", record) db.flush() - result = db.get_record_by_id(self.PATH, "multi-row", flush=True) + result = db.get_record_by_id(self.PATH, b"multi-row", flush=True) assert result is not None assert result.num_rows == 1 assert result.column("x").to_pylist() == [3] # last row kept @@ -384,7 +384,7 @@ class TestAddRecordsRoundTrip: PATH = ("multi", "v1") def test_add_records_bulk_and_retrieve_all(self, db): - records = make_table(__record_id=["a", "b", "c"], value=[10, 20, 30]) + records = make_table(__record_id=[b"a", b"b", b"c"], value=[10, 20, 30]) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() result = db.get_all_records(self.PATH) @@ -392,14 +392,14 @@ def test_add_records_bulk_and_retrieve_all(self, db): assert result.num_rows == 3 def test_get_all_records_includes_pending(self, db): - records = make_table(__record_id=["x", "y"], value=[1, 2]) + records = make_table(__record_id=[b"x", b"y"], value=[1, 2]) db.add_records(self.PATH, records, record_id_column="__record_id") result = db.get_all_records(self.PATH) assert result is not None assert result.num_rows == 2 def test_first_column_used_as_record_id_by_default(self, db): - records = make_table(id=["r1", "r2"], score=[5, 6]) + records = make_table(id=[b"r1", b"r2"], score=[5, 6]) db.add_records(self.PATH, records) db.flush() result = db.get_all_records(self.PATH) @@ -410,13 +410,13 @@ def test_pending_and_committed_combined(self, db): """Records added before and after flush should both appear in get_all_records.""" db.add_records( self.PATH, - make_table(__record_id=["a"], v=[1]), + make_table(__record_id=[b"a"], v=[1]), record_id_column="__record_id", flush=True, ) db.add_records( self.PATH, - make_table(__record_id=["b"], v=[2]), + make_table(__record_id=[b"b"], v=[2]), record_id_column="__record_id", ) result = db.get_all_records(self.PATH) @@ -438,32 +438,32 @@ class TestDuplicateHandling: PATH = ("dup", "v1") def test_skip_duplicates_true_does_not_raise(self, db): - db.add_record(self.PATH, "dup-id", make_table(value=[1])) + db.add_record(self.PATH, b"dup-id", make_table(value=[1])) db.flush() # same id again with skip_duplicates=True — should silently skip - db.add_record(self.PATH, "dup-id", make_table(value=[2]), skip_duplicates=True) + db.add_record(self.PATH, b"dup-id", make_table(value=[2]), skip_duplicates=True) def test_skip_duplicates_preserves_original_value(self, db): - db.add_record(self.PATH, "dup-id", make_table(value=[1]), flush=True) + db.add_record(self.PATH, b"dup-id", make_table(value=[1]), flush=True) db.add_record( - self.PATH, "dup-id", make_table(value=[99]), skip_duplicates=True, flush=True + self.PATH, b"dup-id", make_table(value=[99]), skip_duplicates=True, flush=True ) - result = db.get_record_by_id(self.PATH, "dup-id", flush=True) + result = db.get_record_by_id(self.PATH, b"dup-id", flush=True) assert result is not None assert result.column("value").to_pylist() == [1] # original preserved def test_skip_duplicates_false_raises_on_pending_duplicate(self, db): - db.add_record(self.PATH, "dup-id2", make_table(value=[1])) + db.add_record(self.PATH, b"dup-id2", make_table(value=[1])) with pytest.raises(ValueError): db.add_records( self.PATH, - make_table(__record_id=["dup-id2"], value=[99]), + make_table(__record_id=[b"dup-id2"], value=[99]), record_id_column="__record_id", skip_duplicates=False, ) def test_within_batch_deduplication_keeps_last(self, db): - records = make_table(__record_id=["same", "same"], value=[1, 2]) + records = make_table(__record_id=[b"same", b"same"], value=[1, 2]) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() result = db.get_all_records(self.PATH) @@ -482,24 +482,24 @@ class TestGetRecordsByIds: @pytest.fixture(autouse=True) def populate(self, db): - records = make_table(__record_id=["a", "b", "c"], value=[10, 20, 30]) + records = make_table(__record_id=[b"a", b"b", b"c"], value=[10, 20, 30]) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() def test_retrieves_subset(self, db): - result = db.get_records_by_ids(self.PATH, ["a", "c"], flush=True) + result = db.get_records_by_ids(self.PATH, [b"a", b"c"], flush=True) assert result is not None assert result.num_rows == 2 def test_returns_none_for_missing_ids(self, db): - result = db.get_records_by_ids(self.PATH, ["z"], flush=True) + result = db.get_records_by_ids(self.PATH, [b"z"], flush=True) assert result is None def test_empty_id_list_returns_none(self, db): assert db.get_records_by_ids(self.PATH, [], flush=True) is None def test_retrieves_single_id(self, db): - result = db.get_records_by_ids(self.PATH, ["b"], flush=True) + result = db.get_records_by_ids(self.PATH, [b"b"], flush=True) assert result is not None assert result.num_rows == 1 @@ -515,7 +515,7 @@ class TestGetRecordsWithColumnValue: @pytest.fixture(autouse=True) def populate(self, db): records = make_table( - __record_id=["p", "q", "r"], category=["A", "B", "A"] + __record_id=[b"p", b"q", b"r"], category=["A", "B", "A"] ) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() @@ -552,41 +552,41 @@ def test_accepts_mapping_and_collection_of_tuples(self, db): class TestHierarchicalPath: def test_deep_path_stores_and_retrieves(self, db): path = ("org", "project", "dataset", "v1") - db.add_record(path, "deep-id", make_table(x=[7])) + db.add_record(path, b"deep-id", make_table(x=[7])) db.flush() - result = db.get_record_by_id(path, "deep-id", flush=True) + result = db.get_record_by_id(path, b"deep-id", flush=True) assert result is not None assert result.column("x").to_pylist() == [7] def test_different_paths_are_independent(self, db): path_a = ("ns", "a") path_b = ("ns", "b") - db.add_record(path_a, "id-1", make_table(v=[1])) - db.add_record(path_b, "id-1", make_table(v=[2])) + db.add_record(path_a, b"id-1", make_table(v=[1])) + db.add_record(path_b, b"id-1", make_table(v=[2])) db.flush() - result_a = db.get_record_by_id(path_a, "id-1", flush=True) - result_b = db.get_record_by_id(path_b, "id-1", flush=True) + result_a = db.get_record_by_id(path_a, b"id-1", flush=True) + result_b = db.get_record_by_id(path_b, b"id-1", flush=True) assert result_a.column("v").to_pylist() == [1] assert result_b.column("v").to_pylist() == [2] def test_invalid_empty_path_raises(self, db): with pytest.raises(ValueError): - db.add_record((), "id-1", make_table(v=[1])) + db.add_record((), b"id-1", make_table(v=[1])) def test_path_exceeding_max_depth_raises(self, db): path = tuple(f"part{i}" for i in range(db.max_hierarchy_depth + 1)) with pytest.raises(ValueError, match="exceeds maximum"): - db.add_record(path, "id-1", make_table(v=[1])) + db.add_record(path, b"id-1", make_table(v=[1])) def test_path_component_with_slash_raises(self, db): # "/" is the _get_record_key separator; allowing it would corrupt # flush()'s record_path reconstruction via split("/"). with pytest.raises(ValueError, match="invalid character"): - db.add_record(("bad/path",), "id-1", make_table(v=[1])) + db.add_record(("bad/path",), b"id-1", make_table(v=[1])) def test_path_component_with_null_byte_raises(self, db): with pytest.raises(ValueError, match="invalid character"): - db.add_record(("bad\x00path",), "id-1", make_table(v=[1])) + db.add_record(("bad\x00path",), b"id-1", make_table(v=[1])) class TestPathToTableName: @@ -624,8 +624,8 @@ class TestFlushBehaviour: PATH = ("flush", "v1") def test_flush_writes_pending_to_connector(self, db, connector): - db.add_record(self.PATH, "f1", make_table(v=[1])) - db.add_record(self.PATH, "f2", make_table(v=[2])) + db.add_record(self.PATH, b"f1", make_table(v=[1])) + db.add_record(self.PATH, b"f2", make_table(v=[2])) # pending key exists before flush record_key = db._get_record_key(self.PATH) assert record_key in db._pending_batches @@ -637,14 +637,14 @@ def test_flush_writes_pending_to_connector(self, db, connector): assert table_name in connector.get_table_names() def test_flush_inline_via_flush_kwarg(self, db, connector): - db.add_record(self.PATH, "x", make_table(v=[5]), flush=True) + db.add_record(self.PATH, b"x", make_table(v=[5]), flush=True) table_name = db._path_to_table_name(self.PATH) assert table_name in connector.get_table_names() def test_multiple_flushes_accumulate_records(self, db): - db.add_record(self.PATH, "m1", make_table(v=[10])) + db.add_record(self.PATH, b"m1", make_table(v=[10])) db.flush() - db.add_record(self.PATH, "m2", make_table(v=[20])) + db.add_record(self.PATH, b"m2", make_table(v=[20])) db.flush() result = db.get_all_records(self.PATH) assert result is not None @@ -652,8 +652,8 @@ def test_multiple_flushes_accumulate_records(self, db): def test_second_flush_on_existing_table_upserts(self, db): """Flushing the same path twice should not duplicate rows.""" - db.add_record(self.PATH, "u1", make_table(v=[1]), flush=True) - db.add_record(self.PATH, "u1", make_table(v=[99]), skip_duplicates=True, flush=True) + db.add_record(self.PATH, b"u1", make_table(v=[1]), flush=True) + db.add_record(self.PATH, b"u1", make_table(v=[99]), skip_duplicates=True, flush=True) result = db.get_all_records(self.PATH) assert result is not None # skip_existing=True means original is preserved, row count stays 1 @@ -669,21 +669,21 @@ def test_flush_with_skip_duplicates_passes_skip_existing_to_connector( """skip_duplicates=True must translate to skip_existing=True at flush, so connectors can use native INSERT-OR-IGNORE without a Python-side full-table read.""" - db.add_record(("t",), "a", make_table(v=[1]), flush=True) + db.add_record(("t",), b"a", make_table(v=[1]), flush=True) # Second add with skip_duplicates=True should not overwrite v=1 - db.add_record(("t",), "a", make_table(v=[99]), skip_duplicates=True, flush=True) - result = db.get_record_by_id(("t",), "a", flush=True) + db.add_record(("t",), b"a", make_table(v=[99]), skip_duplicates=True, flush=True) + result = db.get_record_by_id(("t",), b"a", flush=True) assert result is not None assert result["v"][0].as_py() == 1 # original preserved via skip_existing=True def test_flush_schema_mismatch_raises_value_error(self, db): """flush() must raise ValueError when the pending schema differs from the table already in the connector — before any data is written.""" - db.add_record(("t",), "a", make_table(v=[1]), flush=True) + db.add_record(("t",), b"a", make_table(v=[1]), flush=True) # Now try to flush a batch with a different column name db.add_records( ("t",), - pa.table({"__record_id": pa.array(["b"]), "x": pa.array([2])}), + pa.table({"__record_id": pa.array([b"b"]), "x": pa.array([2])}), record_id_column="__record_id", ) with pytest.raises(ValueError, match="Schema mismatch"): @@ -740,37 +740,37 @@ def test_at_does_not_modify_original(self, db): def test_writes_through_scoped_view_readable_from_same_view(self, db): scoped = db.at("pipeline", "node1") record = pa.table({"value": pa.array([42])}) - scoped.add_record(("outputs",), "id1", record, flush=True) - result = scoped.get_record_by_id(("outputs",), "id1") + scoped.add_record(("outputs",), b"id1", record, flush=True) + result = scoped.get_record_by_id(("outputs",), b"id1") assert result is not None assert result.column("value").to_pylist() == [42] def test_scoped_write_not_visible_via_parent_at_same_path(self, db): scoped = db.at("pipeline", "node1") - scoped.add_record(("outputs",), "id1", pa.table({"v": pa.array([1])}), flush=True) - assert db.get_record_by_id(("outputs",), "id1") is None + scoped.add_record(("outputs",), b"id1", pa.table({"v": pa.array([1])}), flush=True) + assert db.get_record_by_id(("outputs",), b"id1") is None def test_two_scoped_views_share_storage(self, db): view_a = db.at("pipeline", "node1") view_b = db.at("pipeline", "node1") - view_a.add_record(("outputs",), "id1", pa.table({"v": pa.array([99])}), flush=True) - result = view_b.get_record_by_id(("outputs",), "id1") + view_a.add_record(("outputs",), b"id1", pa.table({"v": pa.array([99])}), flush=True) + result = view_b.get_record_by_id(("outputs",), b"id1") assert result is not None assert result.column("v").to_pylist() == [99] def test_prefix_appears_in_sql_table_name(self, db): """Prefix components are included in the SQL table name via _path_to_table_name.""" scoped = db.at("pipeline", "node1") - scoped.add_record(("outputs",), "id1", pa.table({"v": pa.array([1])}), flush=True) + scoped.add_record(("outputs",), b"id1", pa.table({"v": pa.array([1])}), flush=True) # Table name should be pipeline__node1__outputs table_names = db._connector.get_table_names() assert "pipeline__node1__outputs" in table_names def test_validate_record_path_checks_combined_depth(self, db): scoped = db.at("a", "b", "c", "d", "e", "f", "g", "h", "i") # 9 prefix components - scoped.add_record(("z",), "id1", pa.table({"v": pa.array([1])})) + scoped.add_record(("z",), b"id1", pa.table({"v": pa.array([1])})) with pytest.raises(ValueError): - scoped.add_record(("z", "extra"), "id2", pa.table({"v": pa.array([2])})) + scoped.add_record(("z", "extra"), b"id2", pa.table({"v": pa.array([2])})) def test_at_rejects_slash_in_component(self, db): with pytest.raises(ValueError, match="invalid character"): diff --git a/tests/test_databases/test_delta_table_database.py b/tests/test_databases/test_delta_table_database.py index fa19f0e8..7fc9ef91 100644 --- a/tests/test_databases/test_delta_table_database.py +++ b/tests/test_databases/test_delta_table_database.py @@ -98,13 +98,13 @@ class TestEmptyTable: PATH = ("source", "v1") def test_get_record_by_id_returns_none_when_empty(self, db): - assert db.get_record_by_id(self.PATH, "id-1", flush=True) is None + assert db.get_record_by_id(self.PATH, b"id-1", flush=True) is None def test_get_all_records_returns_none_when_empty(self, db): assert db.get_all_records(self.PATH) is None def test_get_records_by_ids_returns_none_when_empty(self, db): - assert db.get_records_by_ids(self.PATH, ["id-1"], flush=True) is None + assert db.get_records_by_ids(self.PATH, [b"id-1"], flush=True) is None def test_get_records_with_column_value_returns_none_when_empty(self, db): assert ( @@ -123,42 +123,42 @@ class TestAddRecordRoundTrip: def test_added_record_retrievable_from_pending(self, db): record = make_table(value=[42]) - db.add_record(self.PATH, "id-1", record) - result = db.get_record_by_id(self.PATH, "id-1") + db.add_record(self.PATH, b"id-1", record) + result = db.get_record_by_id(self.PATH, b"id-1") assert result is not None assert result.column("value").to_pylist() == [42] def test_added_record_retrievable_after_flush(self, db): record = make_table(value=[99]) - db.add_record(self.PATH, "id-2", record) + db.add_record(self.PATH, b"id-2", record) db.flush() - result = db.get_record_by_id(self.PATH, "id-2", flush=True) + result = db.get_record_by_id(self.PATH, b"id-2", flush=True) assert result is not None assert result.column("value").to_pylist() == [99] def test_record_id_column_not_in_result_by_default(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-3", record) - result = db.get_record_by_id(self.PATH, "id-3") + db.add_record(self.PATH, b"id-3", record) + result = db.get_record_by_id(self.PATH, b"id-3") assert result is not None assert DeltaTableDatabase.RECORD_ID_COLUMN not in result.column_names def test_record_id_column_exposed_when_requested(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-4", record) + db.add_record(self.PATH, b"id-4", record) db.flush() result = db.get_record_by_id( - self.PATH, "id-4", record_id_column="my_id", flush=True + self.PATH, b"id-4", record_id_column="my_id", flush=True ) assert result is not None assert "my_id" in result.column_names - assert result.column("my_id").to_pylist() == ["id-4"] + assert result.column("my_id").to_pylist() == [b"id-4"] def test_unknown_record_returns_none(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-5", record) + db.add_record(self.PATH, b"id-5", record) db.flush() - assert db.get_record_by_id(self.PATH, "nonexistent", flush=True) is None + assert db.get_record_by_id(self.PATH, b"nonexistent", flush=True) is None # --------------------------------------------------------------------------- @@ -204,14 +204,14 @@ class TestDuplicateHandling: def test_skip_duplicates_true_does_not_raise(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "dup-id", record) + db.add_record(self.PATH, b"dup-id", record) db.flush() # same id again — should silently skip - db.add_record(self.PATH, "dup-id", make_table(value=[2]), skip_duplicates=True) + db.add_record(self.PATH, b"dup-id", make_table(value=[2]), skip_duplicates=True) def test_skip_duplicates_false_raises_on_pending_duplicate(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "dup-id2", record) + db.add_record(self.PATH, b"dup-id2", record) with pytest.raises(ValueError): db.add_records( self.PATH, @@ -240,19 +240,19 @@ class TestGetRecordsByIds: PATH = ("byids", "v1") def _populate(self, db): - records = make_table(__record_id=["a", "b", "c"], value=[10, 20, 30]) + records = make_table(__record_id=[b"a", b"b", b"c"], value=[10, 20, 30]) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() def test_retrieves_subset(self, db): self._populate(db) - result = db.get_records_by_ids(self.PATH, ["a", "c"], flush=True) + result = db.get_records_by_ids(self.PATH, [b"a", b"c"], flush=True) assert result is not None assert result.num_rows == 2 def test_returns_none_for_missing_ids(self, db): self._populate(db) - result = db.get_records_by_ids(self.PATH, ["z"], flush=True) + result = db.get_records_by_ids(self.PATH, [b"z"], flush=True) assert result is None def test_empty_id_list_returns_none(self, db): @@ -310,30 +310,30 @@ class TestHierarchicalPath: def test_deep_path_stores_and_retrieves(self, db): path = ("org", "project", "dataset", "v1") record = make_table(x=[7]) - db.add_record(path, "deep-id", record) + db.add_record(path, b"deep-id", record) db.flush() - result = db.get_record_by_id(path, "deep-id", flush=True) + result = db.get_record_by_id(path, b"deep-id", flush=True) assert result is not None assert result.column("x").to_pylist() == [7] def test_different_paths_are_independent(self, db): path_a = ("ns", "a") path_b = ("ns", "b") - db.add_record(path_a, "id-1", make_table(v=[1])) - db.add_record(path_b, "id-1", make_table(v=[2])) + db.add_record(path_a, b"id-1", make_table(v=[1])) + db.add_record(path_b, b"id-1", make_table(v=[2])) db.flush() - result_a = db.get_record_by_id(path_a, "id-1", flush=True) - result_b = db.get_record_by_id(path_b, "id-1", flush=True) + result_a = db.get_record_by_id(path_a, b"id-1", flush=True) + result_b = db.get_record_by_id(path_b, b"id-1", flush=True) assert result_a.column("v").to_pylist() == [1] assert result_b.column("v").to_pylist() == [2] def test_invalid_empty_path_raises(self, db): with pytest.raises(ValueError): - db.add_record((), "id-1", make_table(v=[1])) + db.add_record((), b"id-1", make_table(v=[1])) def test_path_with_unsafe_characters_raises(self, db): with pytest.raises(ValueError): - db.add_record(("bad/path",), "id-1", make_table(v=[1])) + db.add_record(("bad/path",), b"id-1", make_table(v=[1])) # --------------------------------------------------------------------------- @@ -345,8 +345,8 @@ class TestFlushBehaviour: PATH = ("flush", "v1") def test_flush_writes_pending_to_delta(self, db): - db.add_record(self.PATH, "f1", make_table(v=[1])) - db.add_record(self.PATH, "f2", make_table(v=[2])) + db.add_record(self.PATH, b"f1", make_table(v=[1])) + db.add_record(self.PATH, b"f2", make_table(v=[2])) assert "flush/v1" in db._pending_batches # records are buffered db.flush() assert "flush/v1" not in db._pending_batches # pending cleared after flush @@ -355,9 +355,9 @@ def test_flush_writes_pending_to_delta(self, db): assert result.num_rows == 2 def test_multiple_flushes_accumulate_records(self, db): - db.add_record(self.PATH, "m1", make_table(v=[10])) + db.add_record(self.PATH, b"m1", make_table(v=[10])) db.flush() - db.add_record(self.PATH, "m2", make_table(v=[20])) + db.add_record(self.PATH, b"m2", make_table(v=[20])) db.flush() result = db.get_all_records(self.PATH, retrieve_pending=False) assert result is not None @@ -371,8 +371,8 @@ def test_multiple_flushes_accumulate_records(self, db): def test_list_sources(tmp_path): db = DeltaTableDatabase(tmp_path) - db.add_record(("alpha",), "r1", make_table(v=[1]), flush=True) - db.add_record(("beta", "sub"), "r1", make_table(v=[2]), flush=True) + db.add_record(("alpha",), b"r1", make_table(v=[1]), flush=True) + db.add_record(("beta", "sub"), b"r1", make_table(v=[2]), flush=True) sources = db.list_sources() assert ("alpha",) in sources @@ -384,17 +384,17 @@ def test_upath_local_accepted(tmp_path): from upath import UPath db = DeltaTableDatabase(UPath(tmp_path)) - db.add_record(("src",), "r1", make_table(v=[1]), flush=True) - result = db.get_record_by_id(("src",), "r1") + db.add_record(("src",), b"r1", make_table(v=[1]), flush=True) + result = db.get_record_by_id(("src",), b"r1") assert result is not None def test_schema_evolution(tmp_path): db = DeltaTableDatabase(tmp_path, allow_schema_evolution=True) - db.add_record(("src",), "r1", make_table(a=[1]), flush=True) + db.add_record(("src",), b"r1", make_table(a=[1]), flush=True) db.add_record( ("src",), - "r2", + b"r2", make_table(a=[2], b=["new-col"]), schema_handling="merge", flush=True, @@ -429,19 +429,19 @@ def test_at_does_not_modify_original(self, db): def test_writes_through_scoped_view_readable_from_same_view(self, db): scoped = db.at("pipeline", "node1") record = make_table(value=[42]) - scoped.add_record(("outputs",), "id1", record, flush=True) - result = scoped.get_record_by_id(("outputs",), "id1", flush=True) + scoped.add_record(("outputs",), b"id1", record, flush=True) + result = scoped.get_record_by_id(("outputs",), b"id1", flush=True) assert result is not None assert result.column("value").to_pylist() == [42] def test_scoped_write_not_visible_via_parent_at_same_path(self, db): scoped = db.at("pipeline", "node1") - scoped.add_record(("outputs",), "id1", make_table(value=[42]), flush=True) - assert db.get_record_by_id(("outputs",), "id1") is None + scoped.add_record(("outputs",), b"id1", make_table(value=[42]), flush=True) + assert db.get_record_by_id(("outputs",), b"id1") is None def test_scoped_write_stored_under_correct_filesystem_path(self, db, tmp_path): scoped = db.at("pipeline", "node1") - scoped.add_record(("outputs",), "id1", make_table(value=[1]), flush=True) + scoped.add_record(("outputs",), b"id1", make_table(value=[1]), flush=True) # Delta table should be at /db/pipeline/node1/outputs/ expected_path = tmp_path / "db" / "pipeline" / "node1" / "outputs" assert expected_path.exists() @@ -450,16 +450,16 @@ def test_validate_record_path_checks_combined_depth(self, tmp_path): db = DeltaTableDatabase(base_path=tmp_path / "db", max_hierarchy_depth=10) scoped = db.at("a", "b", "c", "d", "e", "f", "g", "h", "i") # 9 prefix # 9 + 1 = 10: OK - scoped.add_record(("z",), "id1", make_table(value=[1])) + scoped.add_record(("z",), b"id1", make_table(value=[1])) # 9 + 2 = 11: should raise with pytest.raises(ValueError): - scoped.add_record(("z", "extra"), "id2", make_table(value=[2])) + scoped.add_record(("z", "extra"), b"id2", make_table(value=[2])) def test_list_sources_on_scoped_instance(self, tmp_path): db = DeltaTableDatabase(base_path=tmp_path / "db") scoped = db.at("pipeline") - scoped.add_record(("node1",), "id1", make_table(value=[1]), flush=True) - scoped.add_record(("node2",), "id2", make_table(value=[2]), flush=True) + scoped.add_record(("node1",), b"id1", make_table(value=[1]), flush=True) + scoped.add_record(("node2",), b"id2", make_table(value=[2]), flush=True) sources = scoped.list_sources() assert ("node1",) in sources assert ("node2",) in sources diff --git a/tests/test_databases/test_delta_table_database_s3.py b/tests/test_databases/test_delta_table_database_s3.py index a6443b72..0037ceb3 100644 --- a/tests/test_databases/test_delta_table_database_s3.py +++ b/tests/test_databases/test_delta_table_database_s3.py @@ -23,9 +23,9 @@ def test_s3_add_record_and_retrieve(s3_base_uri, s3_storage_options): db = DeltaTableDatabase(s3_base_uri, storage_options=s3_storage_options) record = pa.table({"x": [1], "y": ["hello"]}) - db.add_record(("run1",), "rec-001", record, flush=True) + db.add_record(("run1",), b"rec-001", record, flush=True) - result = db.get_record_by_id(("run1",), "rec-001") + result = db.get_record_by_id(("run1",), b"rec-001") assert result is not None assert result["x"][0].as_py() == 1 @@ -43,7 +43,7 @@ def test_s3_add_records_batch(s3_base_uri, s3_storage_options): def test_s3_flush_and_read(s3_base_uri, s3_storage_options): db = DeltaTableDatabase(s3_base_uri, storage_options=s3_storage_options) for i in range(3): - db.add_record(("src",), f"r{i}", pa.table({"n": [i]})) + db.add_record(("src",), f"r{i}".encode(), pa.table({"n": [i]})) db.flush() result = db.get_all_records(("src",), retrieve_pending=False) @@ -54,8 +54,8 @@ def test_s3_flush_and_read(s3_base_uri, s3_storage_options): def test_s3_skip_duplicates(s3_base_uri, s3_storage_options): db = DeltaTableDatabase(s3_base_uri, storage_options=s3_storage_options) record = pa.table({"v": [1]}) - db.add_record(("src",), "dup", record, flush=True) - db.add_record(("src",), "dup", record, skip_duplicates=True, flush=True) + db.add_record(("src",), b"dup", record, flush=True) + db.add_record(("src",), b"dup", record, skip_duplicates=True, flush=True) result = db.get_all_records(("src",)) assert result is not None @@ -94,8 +94,8 @@ def test_s3_upath_credentials(s3_base_uri, minio_server): }, ) record = pa.table({"k": ["hello"]}) - db.add_record(("t",), "r1", record, flush=True) - result = db.get_record_by_id(("t",), "r1") + db.add_record(("t",), b"r1", record, flush=True) + result = db.get_record_by_id(("t",), b"r1") assert result is not None assert result["k"][0].as_py() == "hello" diff --git a/tests/test_databases/test_in_memory_database.py b/tests/test_databases/test_in_memory_database.py index 29e4ec80..955068fb 100644 --- a/tests/test_databases/test_in_memory_database.py +++ b/tests/test_databases/test_in_memory_database.py @@ -68,13 +68,13 @@ class TestEmptyTable: PATH = ("source", "v1") def test_get_record_by_id_returns_none_when_empty(self, db): - assert db.get_record_by_id(self.PATH, "id-1", flush=True) is None + assert db.get_record_by_id(self.PATH, b"id-1", flush=True) is None def test_get_all_records_returns_none_when_empty(self, db): assert db.get_all_records(self.PATH) is None def test_get_records_by_ids_returns_none_when_empty(self, db): - assert db.get_records_by_ids(self.PATH, ["id-1"], flush=True) is None + assert db.get_records_by_ids(self.PATH, [b"id-1"], flush=True) is None def test_get_records_with_column_value_returns_none_when_empty(self, db): assert ( @@ -93,42 +93,42 @@ class TestAddRecordRoundTrip: def test_added_record_retrievable_from_pending(self, db): record = make_table(value=[42]) - db.add_record(self.PATH, "id-1", record) - result = db.get_record_by_id(self.PATH, "id-1") + db.add_record(self.PATH, b"id-1", record) + result = db.get_record_by_id(self.PATH, b"id-1") assert result is not None assert result.column("value").to_pylist() == [42] def test_added_record_retrievable_after_flush(self, db): record = make_table(value=[99]) - db.add_record(self.PATH, "id-2", record) + db.add_record(self.PATH, b"id-2", record) db.flush() - result = db.get_record_by_id(self.PATH, "id-2", flush=True) + result = db.get_record_by_id(self.PATH, b"id-2", flush=True) assert result is not None assert result.column("value").to_pylist() == [99] def test_record_id_column_not_in_result_by_default(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-3", record) - result = db.get_record_by_id(self.PATH, "id-3") + db.add_record(self.PATH, b"id-3", record) + result = db.get_record_by_id(self.PATH, b"id-3") assert result is not None assert InMemoryArrowDatabase.RECORD_ID_COLUMN not in result.column_names def test_record_id_column_exposed_when_requested(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-4", record) + db.add_record(self.PATH, b"id-4", record) db.flush() result = db.get_record_by_id( - self.PATH, "id-4", record_id_column="my_id", flush=True + self.PATH, b"id-4", record_id_column="my_id", flush=True ) assert result is not None assert "my_id" in result.column_names - assert result.column("my_id").to_pylist() == ["id-4"] + assert result.column("my_id").to_pylist() == [b"id-4"] def test_unknown_record_returns_none(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "id-5", record) + db.add_record(self.PATH, b"id-5", record) db.flush() - assert db.get_record_by_id(self.PATH, "nonexistent", flush=True) is None + assert db.get_record_by_id(self.PATH, b"nonexistent", flush=True) is None # --------------------------------------------------------------------------- @@ -140,7 +140,7 @@ class TestAddRecordsRoundTrip: PATH = ("multi", "v1") def test_add_records_bulk_and_retrieve_all(self, db): - records = make_table(__record_id=["a", "b", "c"], value=[10, 20, 30]) + records = make_table(__record_id=[b"a", b"b", b"c"], value=[10, 20, 30]) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() result = db.get_all_records(self.PATH) @@ -148,7 +148,7 @@ def test_add_records_bulk_and_retrieve_all(self, db): assert result.num_rows == 3 def test_get_all_records_includes_pending(self, db): - records = make_table(__record_id=["x", "y"], value=[1, 2]) + records = make_table(__record_id=[b"x", b"y"], value=[1, 2]) db.add_records(self.PATH, records, record_id_column="__record_id") # do NOT flush — should still be visible result = db.get_all_records(self.PATH) @@ -156,7 +156,7 @@ def test_get_all_records_includes_pending(self, db): assert result.num_rows == 2 def test_first_column_used_as_record_id_by_default(self, db): - records = make_table(id=["r1", "r2"], score=[5, 6]) + records = make_table(id=[b"r1", b"r2"], score=[5, 6]) db.add_records(self.PATH, records) db.flush() result = db.get_all_records(self.PATH) @@ -174,24 +174,24 @@ class TestDuplicateHandling: def test_skip_duplicates_true_does_not_raise(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "dup-id", record) + db.add_record(self.PATH, b"dup-id", record) db.flush() # same id again — should silently skip - db.add_record(self.PATH, "dup-id", make_table(value=[2]), skip_duplicates=True) + db.add_record(self.PATH, b"dup-id", make_table(value=[2]), skip_duplicates=True) def test_skip_duplicates_false_raises_on_pending_duplicate(self, db): record = make_table(value=[1]) - db.add_record(self.PATH, "dup-id2", record) + db.add_record(self.PATH, b"dup-id2", record) with pytest.raises(ValueError): db.add_records( self.PATH, - make_table(__record_id=["dup-id2"], value=[99]), + make_table(__record_id=[b"dup-id2"], value=[99]), record_id_column="__record_id", skip_duplicates=False, ) def test_within_batch_deduplication_keeps_last(self, db): - records = make_table(__record_id=["same", "same"], value=[1, 2]) + records = make_table(__record_id=[b"same", b"same"], value=[1, 2]) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() result = db.get_all_records(self.PATH) @@ -209,19 +209,19 @@ class TestGetRecordsByIds: PATH = ("byids", "v1") def _populate(self, db): - records = make_table(__record_id=["a", "b", "c"], value=[10, 20, 30]) + records = make_table(__record_id=[b"a", b"b", b"c"], value=[10, 20, 30]) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() def test_retrieves_subset(self, db): self._populate(db) - result = db.get_records_by_ids(self.PATH, ["a", "c"], flush=True) + result = db.get_records_by_ids(self.PATH, [b"a", b"c"], flush=True) assert result is not None assert result.num_rows == 2 def test_returns_none_for_missing_ids(self, db): self._populate(db) - result = db.get_records_by_ids(self.PATH, ["z"], flush=True) + result = db.get_records_by_ids(self.PATH, [b"z"], flush=True) assert result is None def test_empty_id_list_returns_none(self, db): @@ -238,7 +238,7 @@ class TestGetRecordsWithColumnValue: PATH = ("colval", "v1") def _populate(self, db): - records = make_table(__record_id=["p", "q", "r"], category=["A", "B", "A"]) + records = make_table(__record_id=[b"p", b"q", b"r"], category=["A", "B", "A"]) db.add_records(self.PATH, records, record_id_column="__record_id") db.flush() @@ -279,30 +279,30 @@ class TestHierarchicalPath: def test_deep_path_stores_and_retrieves(self, db): path = ("org", "project", "dataset", "v1") record = make_table(x=[7]) - db.add_record(path, "deep-id", record) + db.add_record(path, b"deep-id", record) db.flush() - result = db.get_record_by_id(path, "deep-id", flush=True) + result = db.get_record_by_id(path, b"deep-id", flush=True) assert result is not None assert result.column("x").to_pylist() == [7] def test_different_paths_are_independent(self, db): path_a = ("ns", "a") path_b = ("ns", "b") - db.add_record(path_a, "id-1", make_table(v=[1])) - db.add_record(path_b, "id-1", make_table(v=[2])) + db.add_record(path_a, b"id-1", make_table(v=[1])) + db.add_record(path_b, b"id-1", make_table(v=[2])) db.flush() - result_a = db.get_record_by_id(path_a, "id-1", flush=True) - result_b = db.get_record_by_id(path_b, "id-1", flush=True) + result_a = db.get_record_by_id(path_a, b"id-1", flush=True) + result_b = db.get_record_by_id(path_b, b"id-1", flush=True) assert result_a.column("v").to_pylist() == [1] assert result_b.column("v").to_pylist() == [2] def test_invalid_empty_path_raises(self, db): with pytest.raises(ValueError): - db.add_record((), "id-1", make_table(v=[1])) + db.add_record((), b"id-1", make_table(v=[1])) def test_path_with_unsafe_characters_raises(self, db): with pytest.raises(ValueError): - db.add_record(("bad/path",), "id-1", make_table(v=[1])) + db.add_record(("bad/path",), b"id-1", make_table(v=[1])) # --------------------------------------------------------------------------- @@ -314,8 +314,8 @@ class TestFlushBehaviour: PATH = ("flush", "v1") def test_flush_writes_pending_to_store(self, db): - db.add_record(self.PATH, "f1", make_table(v=[1])) - db.add_record(self.PATH, "f2", make_table(v=[2])) + db.add_record(self.PATH, b"f1", make_table(v=[1])) + db.add_record(self.PATH, b"f2", make_table(v=[2])) assert "flush/v1" in db._pending_batches # records are buffered db.flush() assert "flush/v1" not in db._pending_batches # pending cleared after flush @@ -324,9 +324,9 @@ def test_flush_writes_pending_to_store(self, db): assert result.num_rows == 2 def test_multiple_flushes_accumulate_records(self, db): - db.add_record(self.PATH, "m1", make_table(v=[10])) + db.add_record(self.PATH, b"m1", make_table(v=[10])) db.flush() - db.add_record(self.PATH, "m2", make_table(v=[20])) + db.add_record(self.PATH, b"m2", make_table(v=[20])) db.flush() result = db.get_all_records(self.PATH) assert result is not None @@ -358,23 +358,23 @@ def test_at_does_not_modify_original(self, db): def test_writes_through_scoped_view_readable_from_same_view(self, db): scoped = db.at("pipeline", "node1") record = make_table(value=[42]) - scoped.add_record(("outputs",), "id1", record) - result = scoped.get_record_by_id(("outputs",), "id1") + scoped.add_record(("outputs",), b"id1", record) + result = scoped.get_record_by_id(("outputs",), b"id1") assert result is not None assert result.column("value").to_pylist() == [42] def test_scoped_write_not_visible_via_parent_at_same_path(self, db): scoped = db.at("pipeline", "node1") - scoped.add_record(("outputs",), "id1", make_table(value=[42])) + scoped.add_record(("outputs",), b"id1", make_table(value=[42])) # Parent sees ("outputs",) as a different key from ("pipeline","node1","outputs") - assert db.get_record_by_id(("outputs",), "id1") is None + assert db.get_record_by_id(("outputs",), b"id1") is None def test_two_scoped_views_share_storage(self, db): view_a = db.at("pipeline", "node1") view_b = db.at("pipeline", "node1") - view_a.add_record(("outputs",), "id1", make_table(value=[99])) + view_a.add_record(("outputs",), b"id1", make_table(value=[99])) view_a.flush() - result = view_b.get_record_by_id(("outputs",), "id1") + result = view_b.get_record_by_id(("outputs",), b"id1") assert result is not None assert result.column("value").to_pylist() == [99] @@ -383,10 +383,10 @@ def test_validate_record_path_checks_combined_depth(self, db): deep_db = InMemoryArrowDatabase(max_hierarchy_depth=10) scoped = deep_db.at("a", "b", "c", "d", "e", "f", "g", "h", "i") # 9 prefix + 1 record_path = 10: OK - scoped.add_record(("z",), "id1", make_table(value=[1])) + scoped.add_record(("z",), b"id1", make_table(value=[1])) # 9 prefix + 2 record_path = 11: should raise with pytest.raises(ValueError): - scoped.add_record(("z", "extra"), "id2", make_table(value=[2])) + scoped.add_record(("z", "extra"), b"id2", make_table(value=[2])) def test_at_rejects_slash_in_component(self, db): with pytest.raises(ValueError, match="invalid character"): diff --git a/tests/test_databases/test_noop_database.py b/tests/test_databases/test_noop_database.py index c4e0eff7..45f58891 100644 --- a/tests/test_databases/test_noop_database.py +++ b/tests/test_databases/test_noop_database.py @@ -71,13 +71,13 @@ def test_has_flush(self, db): class TestWriteOperationsAreNoOps: def test_add_record_does_not_raise(self, db): - db.add_record(PATH, "id1", make_table(x=[1, 2])) + db.add_record(PATH, b"id1", make_table(x=[1, 2])) def test_add_record_with_skip_duplicates_does_not_raise(self, db): - db.add_record(PATH, "id1", make_table(x=[1]), skip_duplicates=True) + db.add_record(PATH, b"id1", make_table(x=[1]), skip_duplicates=True) def test_add_record_with_flush_does_not_raise(self, db): - db.add_record(PATH, "id1", make_table(x=[1]), flush=True) + db.add_record(PATH, b"id1", make_table(x=[1]), flush=True) def test_add_records_does_not_raise(self, db): db.add_records(PATH, make_table(x=[1, 2, 3])) @@ -99,7 +99,7 @@ def test_flush_does_not_raise(self, db): db.flush() def test_flush_after_writes_does_not_raise(self, db): - db.add_record(PATH, "id1", make_table(x=[1])) + db.add_record(PATH, b"id1", make_table(x=[1])) db.add_records(PATH, make_table(x=[2, 3])) db.flush() @@ -111,14 +111,14 @@ def test_flush_after_writes_does_not_raise(self, db): class TestReadOperationsReturnNone: def test_get_record_by_id_returns_none_on_empty_db(self, db): - assert db.get_record_by_id(PATH, "id1") is None + assert db.get_record_by_id(PATH, b"id1") is None def test_get_record_by_id_returns_none_after_write(self, db): - db.add_record(PATH, "id1", make_table(x=[42])) - assert db.get_record_by_id(PATH, "id1") is None + db.add_record(PATH, b"id1", make_table(x=[42])) + assert db.get_record_by_id(PATH, b"id1") is None def test_get_record_by_id_with_record_id_column_returns_none(self, db): - assert db.get_record_by_id(PATH, "id1", record_id_column="rid") is None + assert db.get_record_by_id(PATH, b"id1", record_id_column="rid") is None def test_get_all_records_returns_none_on_empty_db(self, db): assert db.get_all_records(PATH) is None @@ -131,11 +131,11 @@ def test_get_all_records_with_record_id_column_returns_none(self, db): assert db.get_all_records(PATH, record_id_column="rid") is None def test_get_records_by_ids_returns_none_on_empty_db(self, db): - assert db.get_records_by_ids(PATH, ["id1", "id2"]) is None + assert db.get_records_by_ids(PATH, [b"id1", b"id2"]) is None def test_get_records_by_ids_returns_none_after_write(self, db): - db.add_record(PATH, "id1", make_table(x=[1])) - assert db.get_records_by_ids(PATH, ["id1"]) is None + db.add_record(PATH, b"id1", make_table(x=[1])) + assert db.get_records_by_ids(PATH, [b"id1"]) is None def test_get_records_by_ids_empty_collection_returns_none(self, db): assert db.get_records_by_ids(PATH, []) is None @@ -198,6 +198,6 @@ def test_at_does_not_modify_original(self): def test_scoped_reads_still_return_none(self): db = NoOpArrowDatabase() scoped = db.at("pipeline", "node1") - scoped.add_record(("outputs",), "id1", pa.table({"v": [1]})) - assert scoped.get_record_by_id(("outputs",), "id1") is None + scoped.add_record(("outputs",), b"id1", pa.table({"v": [1]})) + assert scoped.get_record_by_id(("outputs",), b"id1") is None assert scoped.get_all_records(("outputs",)) is None diff --git a/tests/test_databases/test_postgresql_connector.py b/tests/test_databases/test_postgresql_connector.py index 95afeeaa..71ec2ea6 100644 --- a/tests/test_databases/test_postgresql_connector.py +++ b/tests/test_databases/test_postgresql_connector.py @@ -10,6 +10,7 @@ from orcapod.databases.postgresql_connector import ( PostgreSQLConnector, _arrow_type_to_pg_sql, + _coerce_pg_value, _pg_type_to_arrow, ) from orcapod.protocols.db_connector_protocol import DBConnectorProtocol @@ -53,7 +54,8 @@ def test_bytea(self): assert _pg_type_to_arrow("bytea", "bytea") == pa.large_binary() def test_uuid(self): - assert _pg_type_to_arrow("uuid", "uuid") == pa.large_string() + result = _pg_type_to_arrow("uuid", "uuid") + assert result == pa.binary(16) def test_json(self): assert _pg_type_to_arrow("json", "json") == pa.large_string() @@ -138,6 +140,37 @@ def test_unknown_falls_back_to_text(self): assert result == "TEXT" +class TestCoercePgValue: + """Unit tests for _coerce_pg_value.""" + + def test_none_passthrough(self): + assert _coerce_pg_value(None, pa.binary(16)) is None + + def test_uuid_coerced_to_bytes_for_binary16(self): + import uuid as _uuid + u = _uuid.UUID("12345678-1234-5678-1234-567812345678") + result = _coerce_pg_value(u, pa.binary(16)) + assert result == u.bytes + assert isinstance(result, bytes) + assert len(result) == 16 + + def test_uuid_not_coerced_for_other_types(self): + import uuid as _uuid + u = _uuid.UUID("12345678-1234-5678-1234-567812345678") + # Non-binary(16) target — UUID object passed through unchanged + result = _coerce_pg_value(u, pa.large_string()) + assert result is u + + def test_plain_bytes_passthrough_for_binary16(self): + raw = b"\x00" * 16 + result = _coerce_pg_value(raw, pa.binary(16)) + assert result is raw + + def test_non_uuid_value_passthrough(self): + assert _coerce_pg_value(42, pa.int32()) == 42 + assert _coerce_pg_value("hello", pa.large_string()) == "hello" + + class TestPostgreSQLConnectorScaffold: def test_isinstance_dbconnector_protocol(self) -> None: with patch("psycopg.connect") as mock_connect: diff --git a/tests/test_databases/test_postgresql_connector_integration.py b/tests/test_databases/test_postgresql_connector_integration.py index 42301eb3..346a3f2e 100644 --- a/tests/test_databases/test_postgresql_connector_integration.py +++ b/tests/test_databases/test_postgresql_connector_integration.py @@ -195,7 +195,7 @@ def test_all_pg_types(self, connector: PostgreSQLConnector) -> None: assert infos["f"].arrow_type == pa.float64() assert infos["g"].arrow_type == pa.large_string() assert infos["h"].arrow_type == pa.large_binary() - assert infos["i"].arrow_type == pa.large_string() # TODO: PLT-1162 + assert infos["i"].arrow_type == pa.binary(16) assert infos["j"].arrow_type == pa.large_string() assert infos["k"].arrow_type == pa.date32() assert infos["l"].arrow_type == pa.timestamp("us") diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py index 1ce9886b..e5bd4bbf 100644 --- a/tests/test_hashing/test_file_hashing_consistency.py +++ b/tests/test_hashing/test_file_hashing_consistency.py @@ -181,17 +181,20 @@ def test_arrow_and_semantic_hash_same_file_content( self, path_converter, semantic_hasher, file_hasher, tmp_path ): """The file content hash extracted by PythonPathStructConverter.hash_struct_dict - must match the ContentHash produced by PathContentHandler.handle (which - the semantic hasher uses internally for Path objects). + must embed the same digest as ContentHash produced by PathContentHandler.handle + (which the semantic hasher uses internally for Path objects). - We compare at the file_hasher level: both paths ultimately call - file_hasher.hash_file(path), so the raw digest must be identical. + Both paths ultimately call file_hasher.hash_file(path), so the raw digest + must be identical. hash_struct_dict always returns the fully-prefixed form + "path:sha256:", so we strip the prefix when comparing. """ file = tmp_path / "shared.txt" file.write_text("shared content for both paths") - # Arrow path: PythonPathStructConverter.hash_struct_dict (no prefix) - arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)}) + # Arrow path: PythonPathStructConverter.hash_struct_dict — always prefixed + arrow_hash = path_converter.hash_struct_dict({"path": str(file)}) + # Strip "path:sha256:" prefix to get the raw hex + arrow_hash_hex = arrow_hash.split(":")[-1] # Semantic path: file_hasher.hash_file directly (same as PathContentHandler) semantic_content_hash = file_hasher.hash_file(file) @@ -209,7 +212,8 @@ def test_arrow_and_semantic_same_content_two_files( file1.write_text(content) file2.write_text(content) - arrow_hex = path_converter.hash_struct_dict({"path": str(file1)}) + # hash_struct_dict always returns "path:sha256:" — strip prefix + arrow_hex = path_converter.hash_struct_dict({"path": str(file1)}).split(":")[-1] semantic_hex = file_hasher.hash_file(file2).digest.hex() assert arrow_hex == semantic_hex diff --git a/tests/test_hashing/test_uuid_handler.py b/tests/test_hashing/test_uuid_handler.py new file mode 100644 index 00000000..8b69d78b --- /dev/null +++ b/tests/test_hashing/test_uuid_handler.py @@ -0,0 +1,32 @@ +"""Tests for UUIDHandler low-level handle() method behaviour. + +Verifies that UUIDHandler returns the 16-byte binary representation of a +UUID, consistent with OrcaPod's canonical ``pa.binary(16)`` Arrow storage +format. +""" + +from __future__ import annotations + +import uuid as _uuid + + +def test_uuid_handler_returns_bytes(): + """UUIDHandler should return the 16-byte binary representation.""" + from orcapod.hashing.semantic_hashing.builtin_handlers import UUIDHandler + + handler = UUIDHandler() + u = _uuid.UUID("550e8400-e29b-41d4-a716-446655440000") + result = handler.handle(u, hasher=None) # type: ignore[arg-type] + assert result == u.bytes + assert isinstance(result, bytes) + assert len(result) == 16 + + +def test_uuid_handler_different_uuids_produce_different_bytes(): + """Different UUID values must produce different byte sequences.""" + from orcapod.hashing.semantic_hashing.builtin_handlers import UUIDHandler + + handler = UUIDHandler() + u1 = _uuid.uuid4() + u2 = _uuid.uuid4() + assert handler.handle(u1, None) != handler.handle(u2, None) # type: ignore[arg-type] diff --git a/tests/test_semantic_types/test_path_struct_converter.py b/tests/test_semantic_types/test_path_struct_converter.py index c66ca05d..740b0c16 100644 --- a/tests/test_semantic_types/test_path_struct_converter.py +++ b/tests/test_semantic_types/test_path_struct_converter.py @@ -110,10 +110,10 @@ def test_hash_path_objects_content_based(converter, tmp_path): def test_hash_struct_dict_with_prefix(converter, tmp_path): - """Prefixed hash starts with 'path:sha256:'.""" + """Hash always starts with 'path:sha256:'.""" file = tmp_path / "file.txt" file.write_text("hello") - hash_str = converter.hash_struct_dict({"path": str(file)}, add_prefix=True) + hash_str = converter.hash_struct_dict({"path": str(file)}) assert hash_str.startswith("path:sha256:") diff --git a/tests/test_semantic_types/test_semantic_registry.py b/tests/test_semantic_types/test_semantic_registry.py index eb8d26be..82df93e0 100644 --- a/tests/test_semantic_types/test_semantic_registry.py +++ b/tests/test_semantic_types/test_semantic_registry.py @@ -1,5 +1,7 @@ +import uuid from unittest.mock import Mock +import pyarrow as pa import pytest from orcapod.semantic_types import semantic_registry @@ -130,6 +132,41 @@ def test_integration_with_converter(): assert retrieved is converter +def test_uuid_type_registered_in_default_registry(): + """uuid.UUID should be registered and map to pa.struct([pa.field('uuid', pa.binary(16))]).""" + from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher + + hasher = get_versioned_semantic_arrow_hasher() + registry = hasher.semantic_registry + converter = registry.get_converter_for_python_type(uuid.UUID) + assert converter is not None + assert converter.arrow_struct_type == pa.struct([pa.field("uuid", pa.binary(16))]) + + +def test_uuid_struct_resolves_to_converter(): + """pa.struct([pa.field('uuid', pa.binary(16))]) should resolve back to a converter for uuid.UUID.""" + from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher + + hasher = get_versioned_semantic_arrow_hasher() + registry = hasher.semantic_registry + converter = registry.get_converter_for_struct_signature( + pa.struct([pa.field("uuid", pa.binary(16))]) + ) + assert converter is not None + assert converter.python_type is uuid.UUID + + +def test_uuid_semantic_type_name_registered(): + """Converter registered under the name 'uuid'.""" + from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher + + hasher = get_versioned_semantic_arrow_hasher() + registry = hasher.semantic_registry + converter = registry.get_converter_for_semantic_type("uuid") + assert converter is not None + assert converter.python_type is uuid.UUID + + # Comprehensive unregister tests for future implementation # Uncomment when unregister methods are implemented # diff --git a/tests/test_semantic_types/test_semantic_struct_converters.py b/tests/test_semantic_types/test_semantic_struct_converters.py index 77bdfb1d..168f1a45 100644 --- a/tests/test_semantic_types/test_semantic_struct_converters.py +++ b/tests/test_semantic_types/test_semantic_struct_converters.py @@ -32,7 +32,7 @@ def can_handle_struct_type(self, struct_type): def is_semantic_struct(self, struct_dict): return isinstance(struct_dict, dict) - def hash_struct_dict(self, struct_dict, add_prefix=False): + def hash_struct_dict(self, struct_dict): return "dummyhash" @@ -43,15 +43,6 @@ def test_semantic_struct_converter_base_properties(): assert converter.hasher_id == "dummy_content_sha256" -def test_format_hash_string(): - converter = DummyConverter() - hash_bytes = b"\x01\x02" - assert converter._format_hash_string(hash_bytes, add_prefix=False) == "0102" - assert ( - converter._format_hash_string(hash_bytes, add_prefix=True) - == "dummy:sha256:0102" - ) - def test_compute_content_hash(): converter = DummyConverter() @@ -95,7 +86,7 @@ def can_handle_struct_type(self, struct_type): def is_semantic_struct(self, struct_dict): return "data" in struct_dict - def hash_struct_dict(self, struct_dict, add_prefix=False): + def hash_struct_dict(self, struct_dict): return "newhash" converter = NewConverter() diff --git a/tests/test_semantic_types/test_upath_struct_converter.py b/tests/test_semantic_types/test_upath_struct_converter.py index ee503ae0..ccfe014f 100644 --- a/tests/test_semantic_types/test_upath_struct_converter.py +++ b/tests/test_semantic_types/test_upath_struct_converter.py @@ -79,10 +79,10 @@ def test_hash_struct_dict_content_based(converter, tmp_path): def test_hash_struct_dict_with_prefix(converter, tmp_path): - """Prefixed hash starts with 'upath:sha256:'.""" + """Hash always starts with 'upath:sha256:'.""" file = tmp_path / "file.txt" file.write_text("hello") - hash_str = converter.hash_struct_dict({"upath": str(file)}, add_prefix=True) + hash_str = converter.hash_struct_dict({"upath": str(file)}) assert hash_str.startswith("upath:sha256:") diff --git a/tests/test_semantic_types/test_uuid_struct_converter.py b/tests/test_semantic_types/test_uuid_struct_converter.py new file mode 100644 index 00000000..c8084991 --- /dev/null +++ b/tests/test_semantic_types/test_uuid_struct_converter.py @@ -0,0 +1,134 @@ +"""Tests for UUIDStructConverter.""" +import uuid + +import pyarrow as pa +import pytest + +from orcapod.semantic_types.semantic_struct_converters import UUIDStructConverter + + +@pytest.fixture +def converter(): + return UUIDStructConverter() + + +@pytest.fixture +def sample_uuid(): + return uuid.UUID("550e8400-e29b-41d4-a716-446655440000") + + +def test_python_type(converter): + assert converter.python_type is uuid.UUID + + +def test_arrow_struct_type(converter): + assert converter.arrow_struct_type == pa.struct([pa.field("uuid", pa.binary(16))]) + + +def test_semantic_type_name(converter): + assert converter.semantic_type_name == "uuid" + + +def test_python_to_struct_dict(converter, sample_uuid): + result = converter.python_to_struct_dict(sample_uuid) + assert result == {"uuid": sample_uuid.bytes} + assert isinstance(result["uuid"], bytes) + assert len(result["uuid"]) == 16 + + +def test_python_to_struct_dict_rejects_non_uuid(converter): + with pytest.raises(TypeError): + converter.python_to_struct_dict("550e8400-e29b-41d4-a716-446655440000") # type: ignore + + +def test_struct_dict_to_python(converter, sample_uuid): + struct_dict = {"uuid": sample_uuid.bytes} + result = converter.struct_dict_to_python(struct_dict) + assert result == sample_uuid + assert isinstance(result, uuid.UUID) + + +def test_struct_dict_to_python_from_bytearray(converter, sample_uuid): + """Arrow may return binary fields as bytearray — must handle both.""" + struct_dict = {"uuid": bytearray(sample_uuid.bytes)} + result = converter.struct_dict_to_python(struct_dict) + assert result == sample_uuid + + +def test_struct_dict_to_python_missing_field(converter): + with pytest.raises(ValueError, match="Missing 'uuid' field"): + converter.struct_dict_to_python({}) + + +def test_round_trip(converter, sample_uuid): + struct_dict = converter.python_to_struct_dict(sample_uuid) + recovered = converter.struct_dict_to_python(struct_dict) + assert recovered == sample_uuid + + +def test_round_trip_all_versions(): + """Verify round-trip works for uuid4, uuid5, and uuid7 (uuid_utils). + + ``uuid_utils.UUID`` objects do not inherit from ``uuid.UUID`` and their + ``__eq__`` does not cross-compare with ``uuid.UUID``, so we compare by + the canonical string representation instead of direct equality. + """ + from uuid_utils import uuid7 + + converter = UUIDStructConverter() + for u in [uuid.uuid4(), uuid.uuid5(uuid.NAMESPACE_OID, "test"), uuid7()]: + recovered = converter.struct_dict_to_python(converter.python_to_struct_dict(u)) + assert str(recovered) == str(u) + + +def test_arrow_array_round_trip(converter, sample_uuid): + """Verify UUID survives a PyArrow array round-trip.""" + struct_dict = converter.python_to_struct_dict(sample_uuid) + arr = pa.array([struct_dict], type=pa.struct([pa.field("uuid", pa.binary(16))])) + recovered_dict = arr[0].as_py() + recovered_uuid = converter.struct_dict_to_python(recovered_dict) + assert recovered_uuid == sample_uuid + + +def test_distinct_uuids_produce_distinct_struct_dicts(converter): + u1, u2 = uuid.uuid4(), uuid.uuid4() + assert converter.python_to_struct_dict(u1) != converter.python_to_struct_dict(u2) + + +def test_can_handle_python_type_uuid(converter): + assert converter.can_handle_python_type(uuid.UUID) is True + + +def test_can_handle_python_type_rejects_str(converter): + assert converter.can_handle_python_type(str) is False + + +def test_can_handle_struct_type_uuid(converter): + assert converter.can_handle_struct_type(pa.struct([pa.field("uuid", pa.binary(16))])) is True + + +def test_can_handle_struct_type_rejects_other(converter): + import pyarrow as pa + + assert converter.can_handle_struct_type(pa.struct([pa.field("path", pa.large_string())])) is False + + +def test_hash_struct_dict_returns_string(converter, sample_uuid): + struct_dict = converter.python_to_struct_dict(sample_uuid) + result = converter.hash_struct_dict(struct_dict) + assert isinstance(result, str) + assert len(result) > 0 + + +def test_hash_struct_dict_consistent(converter, sample_uuid): + """Same UUID always produces the same hash.""" + struct_dict = converter.python_to_struct_dict(sample_uuid) + assert converter.hash_struct_dict(struct_dict) == converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_different_uuids(converter): + """Different UUIDs produce different hashes.""" + u1, u2 = uuid.uuid4(), uuid.uuid4() + d1 = converter.python_to_struct_dict(u1) + d2 = converter.python_to_struct_dict(u2) + assert converter.hash_struct_dict(d1) != converter.hash_struct_dict(d2) diff --git a/tests/test_utils/test_arrow_utils.py b/tests/test_utils/test_arrow_utils.py index c80bc03f..68ec445e 100644 --- a/tests/test_utils/test_arrow_utils.py +++ b/tests/test_utils/test_arrow_utils.py @@ -278,12 +278,16 @@ def test_different_hashes_produce_different_names(self): def test_col_names_match_add_system_tag_columns_output(self): """Column names from system_tag_column_names() must match those added to a table by add_system_tag_columns().""" + import uuid + import pyarrow as pa from orcapod.utils.arrow_utils import add_system_tag_columns, system_tag_column_names schema_hash = "testhash" table = pa.table({"id": pa.array([1]), "v": pa.array([1.0])}) - enriched = add_system_tag_columns(table, schema_hash, ["src_a"], ["row_0"]) + enriched = add_system_tag_columns( + table, schema_hash, ["src_a"], [uuid.uuid4().bytes] + ) src_col, rec_col = system_tag_column_names(schema_hash) assert src_col in enriched.column_names assert rec_col in enriched.column_names