From 58dedce62decab8ef83362940a973f6345199dfc Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 20 Jun 2026 19:16:18 +0200 Subject: [PATCH] =?UTF-8?q?fix(writer):=20exclude=20I8/I16=20from=20global?= =?UTF-8?q?=20dict=20=E2=80=94=20reader=20can't=20decode=20narrow-int=20di?= =?UTF-8?q?ct?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mutation coverage (PR #99) surfaced that the writer's global dict admitted I8/U8/I16/U16 columns, but the reader's lazy dict decode only supports I32/I64/F32/F64 — a low-cardinality I16 column wrote a vortex.dict the reader rejected with "unsupported ptype for lazy dict: I16", i.e. an unreadable file. Cross-checked against the Rust reference: the JNI writer does NOT dict-encode a low-cardinality I16 column, and the Java reader reads its output back exactly (new RustWritesJavaReadsIntegrationTest#jniWriter_javaReader_lowCardinalityI16). So the Java reader is already Rust-conformant; the Java writer was the outlier. Fix: exclude I8/U8/I16/U16 from isDictCandidate (alongside F16/F32). Narrow-int dict gives no real benefit (a U8/U16 code is no smaller than the value), matches Rust, and only ever produced files the reader couldn't read — so excluding it is not a regression. A low-card I16 column now encodes via the cascade and round-trips (GlobalDictPrimitiveTest#lowCardinality_i16_notDicted_roundTrips). Co-Authored-By: Claude Opus 4.8 --- .../RustWritesJavaReadsIntegrationTest.java | 57 +++++++++++++++++++ .../dfa1/vortex/writer/VortexWriter.java | 17 ++++-- .../writer/GlobalDictPrimitiveTest.java | 34 +++++++---- .../writer/VortexWriterDictDecisionTest.java | 12 ++++ 4 files changed, 105 insertions(+), 15 deletions(-) diff --git a/integration/src/test/java/io/github/dfa1/vortex/integration/RustWritesJavaReadsIntegrationTest.java b/integration/src/test/java/io/github/dfa1/vortex/integration/RustWritesJavaReadsIntegrationTest.java index 7c028129..1bc12424 100644 --- a/integration/src/test/java/io/github/dfa1/vortex/integration/RustWritesJavaReadsIntegrationTest.java +++ b/integration/src/test/java/io/github/dfa1/vortex/integration/RustWritesJavaReadsIntegrationTest.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.Float2Vector; import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.ipc.ArrowReader; import org.apache.arrow.vector.types.FloatingPointPrecision; @@ -390,6 +391,62 @@ void jniWriter_javaReader_fewUniqueF64Values(@TempDir Path tmp) throws IOExcepti } } + private static final Schema I16_SCHEMA = new Schema(List.of( + Field.notNullable("v", new ArrowType.Int(16, true)) + )); + + private static void writeJniI16(Path file, short[] vals) throws IOException { + String uri = file.toAbsolutePath().toUri().toString(); + try (VortexWriter writer = VortexWriter.create(SESSION, uri, I16_SCHEMA, new HashMap<>(), ALLOCATOR); + VectorSchemaRoot root = VectorSchemaRoot.create(I16_SCHEMA, ALLOCATOR)) { + SmallIntVector vec = (SmallIntVector) root.getVector("v"); + vec.allocateNew(vals.length); + for (int i = 0; i < vals.length; i++) { + vec.setSafe(i, vals[i]); + } + root.setRowCount(vals.length); + try (ArrowArray arr = ArrowArray.allocateNew(ALLOCATOR); + ArrowSchema schema = ArrowSchema.allocateNew(ALLOCATOR)) { + Data.exportVectorSchemaRoot(ALLOCATOR, root, null, arr, schema); + writer.writeBatch(arr.memoryAddress(), schema.memoryAddress()); + } + } + } + + @Test + void jniWriter_javaReader_lowCardinalityI16(@TempDir Path tmp) throws IOException { + // Given — 10_000 rows cycling 3 unique I16 values. Ground-truth cross-check: does the + // Rust/JNI compressor dict-encode a low-cardinality I16 column, and can the Java reader + // read it back? (The Java writer's global dict admitted I16 but the Java reader rejected + // it with "unsupported ptype for lazy dict: I16".) If Rust dicts I16 and Java cannot read + // it, the bug is reader-side and this test fails with that exact message. + int n = 10_000; + short[] unique = {7, 8, 9}; + short[] vals = new short[n]; + for (int i = 0; i < n; i++) { + vals[i] = unique[i % unique.length]; + } + Path file = tmp.resolve("jni_i16_lowcard.vtx"); + writeJniI16(file, vals); + + // When / Then — must round-trip exactly + try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) { + List results = scanAll(vf, io.github.dfa1.vortex.reader.ScanOptions.columns("v")); + long total = results.stream().mapToLong(JavaChunk::rowCount).sum(); + assertThat(total).isEqualTo(n); + var got = new ArrayList(); + for (JavaChunk r : results) { + short[] decoded = (short[]) r.columns().get("v"); + for (short s : decoded) { + got.add(s); + } + } + for (int i = 0; i < n; i++) { + assertThat(got.get(i)).as("row %d", i).isEqualTo(unique[i % unique.length]); + } + } + } + // ── S3 helpers ──────────────────────────────────────────────────────────── @Test diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java index e2298cf7..9a5dd840 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java @@ -927,11 +927,18 @@ static boolean isUtf8DictCandidate(String[] data) { } static boolean isDictCandidate(PType ptype, Object data) { - // F16/F32 excluded: no measured workload; ALP usually wins. F64 admitted: low-card - // F64 columns (taxi mta_tax/Airport_fee/extra) compress better via global dict + - // sparse-coded codes (matches Rust FloatDictScheme). Skip rule (cardinality / 2 - // below) mirrors Rust's >50%-distinct skip. - if (ptype == PType.F16 || ptype == PType.F32) { + // Only the carriers the reader's lazy dict decode supports (I32/I64/F64) are admitted. + // - I8/U8/I16/U16 excluded: dict gives little/no benefit (a U8/U16 code is no smaller + // than the value), the Rust compressor does not dict them either (verified by + // RustWritesJavaReadsIntegrationTest#jniWriter_javaReader_lowCardinalityI16), and the + // reader cannot decode a narrow-int dict — emitting one produced an unreadable file. + // - F16/F32 excluded: no measured workload; ALP usually wins. + // F64 admitted: low-card F64 columns (taxi mta_tax/Airport_fee/extra) compress better via + // global dict + sparse-coded codes (matches Rust FloatDictScheme). The skip rule + // (cardinality / 2 below) mirrors Rust's >50%-distinct skip. + if (ptype == PType.I8 || ptype == PType.U8 + || ptype == PType.I16 || ptype == PType.U16 + || ptype == PType.F16 || ptype == PType.F32) { return false; } int n = primitiveArrayLen(data, ptype); diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java index c4b133da..71b68d1c 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java @@ -143,11 +143,12 @@ void lowCardinality_i32_usesGlobalDict(@TempDir Path tmp) throws IOException { } @Test - void lowCardinality_i16_globalDict_readerRejects(@TempDir Path tmp) throws IOException { - // Documents a real write/read incompatibility surfaced by mutation coverage: the writer's - // global dict admits I8/I16 columns (isDictCandidate), but the reader's lazy dict decode - // only supports I32/I64/F64 — reading back throws "unsupported ptype for lazy dict: I16". - // Pinning it here makes the gap explicit; fixing it belongs to the reader's dict decode. + void lowCardinality_i16_notDicted_roundTrips(@TempDir Path tmp) throws IOException { + // Regression: the writer used to admit I16 to the global dict, producing a vortex.dict + // column the reader cannot decode ("unsupported ptype for lazy dict: I16"). I8/I16 are now + // excluded from dict candidacy — matching the Rust compressor, which does not dict narrow + // ints (RustWritesJavaReadsIntegrationTest#jniWriter_javaReader_lowCardinalityI16) — so a + // low-card I16 column encodes via the cascade and round-trips cleanly. var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I16, false)), false); short[] data = {1, 2, 3, 1, 2, 3, 1, 2}; Path file = tmp.resolve("i16.vortex"); @@ -156,12 +157,25 @@ void lowCardinality_i16_globalDict_readerRejects(@TempDir Path tmp) throws IOExc sut.writeChunk(Map.of("v", data)); } - // When / Then — the round-trip is not yet supported; assert the current behaviour - try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) { - org.assertj.core.api.Assertions.assertThatThrownBy(() -> VortexReads.readAllInts(vf, "v")) - .isInstanceOf(io.github.dfa1.vortex.core.VortexException.class) - .hasMessageContaining("unsupported ptype for lazy dict"); + // When — I16 now encodes via the cascade (a ShortArray view), not a dict + short[] result; + try (var vf = VortexReader.open(file, ReadRegistry.loadAll()); + var iter = vf.scan(io.github.dfa1.vortex.reader.ScanOptions.columns("v"))) { + var out = new java.util.ArrayList(); + iter.forEachRemaining(c -> { + var arr = (io.github.dfa1.vortex.reader.array.ShortArray) c.columns().get("v"); + for (long i = 0; i < arr.length(); i++) { + out.add(arr.getShort(i)); + } + }); + result = new short[out.size()]; + for (int i = 0; i < result.length; i++) { + result[i] = out.get(i); + } } + + // Then — reads back exactly, no lazy-dict error + assertThat(result).containsExactly(data); } @Test diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java index c96098ff..d32ce08f 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java @@ -22,6 +22,18 @@ void isDictCandidate_excludesF16AndF32() { assertThat(VortexWriter.isDictCandidate(PType.F16, new short[]{1, 1, 2, 2, 2})).isFalse(); } + @Test + void isDictCandidate_excludesNarrowIntegers() { + // I8/U8/I16/U16 are excluded: a U8/U16 code is no smaller than the value, the Rust + // compressor does not dict them, and the reader's lazy dict cannot decode a narrow-int + // dictionary. Low-cardinality data that would otherwise pass the ratio gate must still + // be rejected. + assertThat(VortexWriter.isDictCandidate(PType.I8, new byte[]{1, 2, 1, 2, 1})).isFalse(); + assertThat(VortexWriter.isDictCandidate(PType.U8, new byte[]{1, 2, 1, 2, 1})).isFalse(); + assertThat(VortexWriter.isDictCandidate(PType.I16, new short[]{1, 2, 1, 2, 1})).isFalse(); + assertThat(VortexWriter.isDictCandidate(PType.U16, new short[]{1, 2, 1, 2, 1})).isFalse(); + } + @Test void isDictCandidate_admitsLowCardinalityI64AndF64() { // Given — 2 distinct over 5 rows: 2*2 = 4 < 5, under the 50%-unique gate