From dabcf07e1d83266a52e5aaa518d77969ed9c9a4f Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Sat, 20 Jun 2026 19:09:04 +0200 Subject: [PATCH] test(writer): mutation-cover VortexWriter global-dict + factory paths (74% -> 89%) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VortexWriter to the writer pitest scope and close the bulk of its gaps — chiefly the global-dictionary feature, which was barely tested end to end. - VortexWriterDictDecisionTest unit-tests the pure dict-decision helpers directly (made package-private): isDictCandidate / isUtf8DictCandidate (cardinality + 50%-ratio gates, empty, single-value, F16/F32 exclusion), codePTypeForSize (U8/U16/U32 width boundaries), primitiveArrayLen, readPrimitiveElement. These choices only affect encoding, not the decoded values, so a round-trip cannot pin them. - GlobalDictPrimitiveTest round-trips low-cardinality I32/I64/F64 columns through the global dict build (the integer/float counterpart to the existing GlobalDictUtf8Test), plus high-cardinality and single-value fallbacks and the globalDict-disabled opt-out. - VortexWriterTest covers the create(WriteRegistry) factory overload. Finding: mutation coverage surfaced a real write/read incompatibility — the writer's global dict admits I8/I16 columns, but the reader's lazy dict decode only supports I32/I64/F64 and throws "unsupported ptype for lazy dict: I16" on read-back. Pinned by lowCardinality_i16_globalDict_readerRejects; the fix belongs to the reader's dict decode. VortexWriter 74% -> 89% killed (strength 92%). Remaining survivors are structural (segment alignment math, layout/flatbuffer build), impractical (U32 codes need 65k+ distinct), or unreachable (F32 dict case, the I8 reader gap above). Co-Authored-By: Claude Opus 4.8 --- writer/pom.xml | 1 + .../dfa1/vortex/writer/VortexWriter.java | 12 +- .../writer/GlobalDictPrimitiveTest.java | 210 ++++++++++++++++++ .../writer/VortexWriterDictDecisionTest.java | 117 ++++++++++ .../dfa1/vortex/writer/VortexWriterTest.java | 21 ++ 5 files changed, 355 insertions(+), 6 deletions(-) create mode 100644 writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java create mode 100644 writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java diff --git a/writer/pom.xml b/writer/pom.xml index f80bf6c9..64d78034 100644 --- a/writer/pom.xml +++ b/writer/pom.xml @@ -81,6 +81,7 @@ io.github.dfa1.vortex.writer.ChunkImpl io.github.dfa1.vortex.writer.WriteRegistry io.github.dfa1.vortex.writer.WriteRegistry$Builder + io.github.dfa1.vortex.writer.VortexWriter diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java index f12275a4..e2298cf7 100644 --- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java +++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java @@ -78,7 +78,7 @@ public final class VortexWriter implements Closeable { // Columns with global cardinality below this threshold are dict-encoded across all chunks. // Kept low: global dict hurts high-cardinality F64 columns (ALP codes beat U16 dict codes). - private static final int GLOBAL_DICT_MAX_CARDINALITY = 2_048; + static final int GLOBAL_DICT_MAX_CARDINALITY = 2_048; private static final List DEFAULT_CODECS = List.of( new AlpEncodingEncoder(), new PrimitiveEncodingEncoder(), new BoolEncodingEncoder(), @@ -912,7 +912,7 @@ private static Object buildUtf8CodesArray(String[] strs, Map va }; } - private static boolean isUtf8DictCandidate(String[] data) { + static boolean isUtf8DictCandidate(String[] data) { if (data.length == 0) { return false; } @@ -926,7 +926,7 @@ private static boolean isUtf8DictCandidate(String[] data) { return seen.size() * 2 < data.length; } - private static boolean isDictCandidate(PType ptype, Object data) { + static boolean isDictCandidate(PType ptype, Object data) { // F16/F32 excluded: no measured workload; ALP usually wins. F64 admitted: low-card // F64 columns (taxi mta_tax/Airport_fee/extra) compress better via global dict + // sparse-coded codes (matches Rust FloatDictScheme). Skip rule (cardinality / 2 @@ -953,7 +953,7 @@ private static boolean isDictCandidate(PType ptype, Object data) { return seen.size() * 2 < n; } - private static int primitiveArrayLen(Object data, PType ptype) { + static int primitiveArrayLen(Object data, PType ptype) { return switch (ptype) { case I8, U8 -> ((byte[]) data).length; case I16, U16, F16 -> ((short[]) data).length; @@ -964,7 +964,7 @@ private static int primitiveArrayLen(Object data, PType ptype) { }; } - private static Object readPrimitiveElement(Object data, PType ptype, int i) { + static Object readPrimitiveElement(Object data, PType ptype, int i) { return switch (ptype) { case I8, U8 -> ((byte[]) data)[i]; case I16, U16, F16 -> ((short[]) data)[i]; @@ -975,7 +975,7 @@ private static Object readPrimitiveElement(Object data, PType ptype, int i) { }; } - private static PType codePTypeForSize(int dictSize) { + static PType codePTypeForSize(int dictSize) { if (dictSize <= 256) { return PType.U8; } diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java new file mode 100644 index 00000000..c4b133da --- /dev/null +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java @@ -0,0 +1,210 @@ +package io.github.dfa1.vortex.writer; + +import io.github.dfa1.vortex.core.DType; +import io.github.dfa1.vortex.core.PType; +import io.github.dfa1.vortex.reader.ReadRegistry; +import io.github.dfa1.vortex.reader.VortexReader; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.List; +import java.util.Map; + +import static io.github.dfa1.vortex.writer.VortexReads.readAllDoubles; +import static io.github.dfa1.vortex.writer.VortexReads.readAllLongs; +import static org.assertj.core.api.Assertions.assertThat; + +/// Global dictionary encoding for low-cardinality primitive columns — the integer/float +/// counterpart to [GlobalDictUtf8Test]. Exercises the primitive dict-candidate gate +/// (`isDictCandidate`), the unique-key and codes array construction, and the shared dictionary +/// build across chunks, all asserted by exact value round-trips. +class GlobalDictPrimitiveTest { + + private static DType.Struct i64Schema() { + return new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I64, false)), false); + } + + private static long[] writeI64(Path file, long[][] chunks, WriteOptions options) throws IOException { + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, i64Schema(), options)) { + for (long[] chunk : chunks) { + sut.writeChunk(Map.of("v", chunk)); + } + } + try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) { + return readAllLongs(vf, "v"); + } + } + + @Test + void lowCardinality_i64_acrossChunks_usesGlobalDict(@TempDir Path tmp) throws IOException { + // Given — 4 distinct longs cycled across 5 chunks: cardinality 4 with 50/50 distribution, + // well under the < 50%-unique gate, so the global dict path fires. + long[] dict = {100L, 200L, 300L, 400L}; + int rowsPerChunk = 1_000; + int chunkCount = 5; + long[][] chunks = new long[chunkCount][rowsPerChunk]; + long[] expected = new long[rowsPerChunk * chunkCount]; + for (int c = 0; c < chunkCount; c++) { + for (int i = 0; i < rowsPerChunk; i++) { + long value = dict[(c + i) % dict.length]; + chunks[c][i] = value; + expected[c * rowsPerChunk + i] = value; + } + } + Path file = tmp.resolve("low_i64.vortex"); + + // When + long[] result = writeI64(file, chunks, WriteOptions.cascading(3)); + + // Then — every value round-trips, and one shared U8-coded dict keeps the file tiny + assertThat(result).containsExactly(expected); + assertThat(Files.size(file)).as("global dict for 5k low-card longs").isLessThan(9_000L); + } + + @Test + void highCardinality_i64_fallsBackToCascade(@TempDir Path tmp) throws IOException { + // Given — every value unique, so the cardinality/ratio gate rejects the dict path + int rows = 2_000; + long[] data = new long[rows]; + for (int i = 0; i < rows; i++) { + data[i] = 1_000_000L + i; + } + Path file = tmp.resolve("high_i64.vortex"); + + // When + long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3)); + + // Then — correctness, not size: the fallback round-trips exactly + assertThat(result).containsExactly(data); + } + + @Test + void singleValue_i64_fallsBackToConstant(@TempDir Path tmp) throws IOException { + // Given — one distinct value: dict overhead is pointless, so isDictCandidate returns false + // at the seen.size() == 1 guard and the column routes to vortex.constant instead. + int rows = 1_500; + long[] data = new long[rows]; + java.util.Arrays.fill(data, 42L); + Path file = tmp.resolve("const_i64.vortex"); + + // When + long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3)); + + // Then + assertThat(result).containsExactly(data); + } + + @Test + void cardinalityJustOverU8_i64_roundTrips(@TempDir Path tmp) throws IOException { + // Given — 300 distinct values over 1000 rows: above the 256 U8-code boundary (codes need + // U16) and still under the 50%-unique gate, exercising the wider code-ptype path. + int rows = 1_000; + int cardinality = 300; + long[] data = new long[rows]; + for (int i = 0; i < rows; i++) { + data[i] = i % cardinality; + } + Path file = tmp.resolve("u16_i64.vortex"); + + // When + long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3)); + + // Then + assertThat(result).containsExactly(data); + } + + @Test + void lowCardinality_i32_usesGlobalDict(@TempDir Path tmp) throws IOException { + // Given — a low-cardinality I32 column drives the global dict build through the int[] + // unique-array and codes paths (the narrower I8/I16 carriers are NOT round-tripped here: + // the reader's lazy dict rejects them — see lowCardinality_i16_globalDict_readerRejects). + var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I32, false)), false); + int[] data = {10, 20, 30, 10, 20, 30, 10, 20}; + Path file = tmp.resolve("i32.vortex"); + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) { + sut.writeChunk(Map.of("v", data)); + } + + // When + int[] result; + try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) { + result = io.github.dfa1.vortex.writer.VortexReads.readAllInts(vf, "v"); + } + + // Then + assertThat(result).containsExactly(data); + } + + @Test + void lowCardinality_i16_globalDict_readerRejects(@TempDir Path tmp) throws IOException { + // Documents a real write/read incompatibility surfaced by mutation coverage: the writer's + // global dict admits I8/I16 columns (isDictCandidate), but the reader's lazy dict decode + // only supports I32/I64/F64 — reading back throws "unsupported ptype for lazy dict: I16". + // Pinning it here makes the gap explicit; fixing it belongs to the reader's dict decode. + var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I16, false)), false); + short[] data = {1, 2, 3, 1, 2, 3, 1, 2}; + Path file = tmp.resolve("i16.vortex"); + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) { + sut.writeChunk(Map.of("v", data)); + } + + // When / Then — the round-trip is not yet supported; assert the current behaviour + try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) { + org.assertj.core.api.Assertions.assertThatThrownBy(() -> VortexReads.readAllInts(vf, "v")) + .isInstanceOf(io.github.dfa1.vortex.core.VortexException.class) + .hasMessageContaining("unsupported ptype for lazy dict"); + } + } + + @Test + void lowCardinality_f64_usesGlobalDict(@TempDir Path tmp) throws IOException { + // Given — F64 is admitted to the dict path (unlike F16/F32); a low-card float column must + // round-trip through the float dict build. + var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.F64, false)), false); + double[] dictVals = {1.5, 2.5, 3.5}; + int rows = 900; + double[] data = new double[rows]; + for (int i = 0; i < rows; i++) { + data[i] = dictVals[i % dictVals.length]; + } + Path file = tmp.resolve("low_f64.vortex"); + + // When + double[] result; + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) { + sut.writeChunk(Map.of("v", data)); + } + try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) { + result = readAllDoubles(vf, "v"); + } + + // Then + assertThat(result).containsExactly(data); + } + + @Test + void i64_globalDictDisabled_roundTrips(@TempDir Path tmp) throws IOException { + // Given — low-card column but globalDict() off: must fall back to per-chunk encoding and + // still round-trip, guarding the opt-out branch. + long[] data = new long[600]; + for (int i = 0; i < data.length; i++) { + data[i] = i % 3; + } + Path file = tmp.resolve("nogdict_i64.vortex"); + + // When + long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3).withGlobalDict(false)); + + // Then + assertThat(result).containsExactly(data); + } +} diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java new file mode 100644 index 00000000..c96098ff --- /dev/null +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java @@ -0,0 +1,117 @@ +package io.github.dfa1.vortex.writer; + +import io.github.dfa1.vortex.core.PType; +import org.junit.jupiter.api.Test; + +import static io.github.dfa1.vortex.writer.VortexWriter.GLOBAL_DICT_MAX_CARDINALITY; +import static org.assertj.core.api.Assertions.assertThat; + +/// Direct unit tests for the global-dictionary decision helpers in [VortexWriter]. These choices +/// (dict vs cascade fallback, and the code width) only affect *encoding*, not the values a reader +/// gets back — so a round-trip cannot pin their boundaries. Testing the pure predicates directly +/// fixes each cardinality / ratio edge. +class VortexWriterDictDecisionTest { + + // ── isDictCandidate (primitive) ────────────────────────────────────────────── + + @Test + void isDictCandidate_excludesF16AndF32() { + // Given low-cardinality float data that would otherwise qualify + // When / Then — F16/F32 are excluded outright (ALP wins there) + assertThat(VortexWriter.isDictCandidate(PType.F32, new float[]{1f, 1f, 2f, 2f, 2f})).isFalse(); + assertThat(VortexWriter.isDictCandidate(PType.F16, new short[]{1, 1, 2, 2, 2})).isFalse(); + } + + @Test + void isDictCandidate_admitsLowCardinalityI64AndF64() { + // Given — 2 distinct over 5 rows: 2*2 = 4 < 5, under the 50%-unique gate + assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2, 1})).isTrue(); + assertThat(VortexWriter.isDictCandidate(PType.F64, new double[]{1, 2, 1, 2, 1})).isTrue(); + } + + @Test + void isDictCandidate_emptyArray_isFalse() { + assertThat(VortexWriter.isDictCandidate(PType.I64, new long[0])).isFalse(); + } + + @Test + void isDictCandidate_singleDistinctValue_isFalse() { + // One distinct value fits vortex.constant better than a dict + assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{7, 7, 7, 7})).isFalse(); + } + + @Test + void isDictCandidate_ratioGate_isExclusive() { + // 2 distinct over 4 rows: 2*2 == 4, NOT < 4 → rejected (exactly 50% unique) + assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2})).isFalse(); + // 2 distinct over 5 rows: 4 < 5 → admitted + assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2, 1})).isTrue(); + } + + @Test + void isDictCandidate_cardinalityAtAndOverMax() { + // At MAX distinct values (well under 50% unique) → still a candidate + assertThat(VortexWriter.isDictCandidate(PType.I64, distinctThenRepeat(GLOBAL_DICT_MAX_CARDINALITY))).isTrue(); + // One over MAX → rejected by the cardinality guard + assertThat(VortexWriter.isDictCandidate(PType.I64, distinctThenRepeat(GLOBAL_DICT_MAX_CARDINALITY + 1))).isFalse(); + } + + // ── isUtf8DictCandidate ────────────────────────────────────────────────────── + + @Test + void isUtf8DictCandidate_emptyArray_isFalse() { + assertThat(VortexWriter.isUtf8DictCandidate(new String[0])).isFalse(); + } + + @Test + void isUtf8DictCandidate_ratioGate_isExclusive() { + // 2 distinct over 4: 4 == 4, not < → rejected + assertThat(VortexWriter.isUtf8DictCandidate(new String[]{"a", "b", "a", "b"})).isFalse(); + // 2 distinct over 5: 4 < 5 → admitted + assertThat(VortexWriter.isUtf8DictCandidate(new String[]{"a", "b", "a", "b", "a"})).isTrue(); + } + + @Test + void isUtf8DictCandidate_overMaxCardinality_isFalse() { + String[] data = new String[(GLOBAL_DICT_MAX_CARDINALITY + 1) * 4]; + for (int i = 0; i < data.length; i++) { + data[i] = "s" + (i % (GLOBAL_DICT_MAX_CARDINALITY + 1)); + } + assertThat(VortexWriter.isUtf8DictCandidate(data)).isFalse(); + } + + // ── codePTypeForSize ───────────────────────────────────────────────────────── + + @Test + void codePTypeForSize_picksNarrowestUnsignedCarrier() { + assertThat(VortexWriter.codePTypeForSize(1)).isEqualTo(PType.U8); + assertThat(VortexWriter.codePTypeForSize(256)).isEqualTo(PType.U8); // upper edge of U8 + assertThat(VortexWriter.codePTypeForSize(257)).isEqualTo(PType.U16); // first U16 + assertThat(VortexWriter.codePTypeForSize(65_536)).isEqualTo(PType.U16); // upper edge of U16 + assertThat(VortexWriter.codePTypeForSize(65_537)).isEqualTo(PType.U32); // first U32 + } + + // ── primitiveArrayLen / readPrimitiveElement ───────────────────────────────── + + @Test + void primitiveArrayLen_returnsActualLength() { + assertThat(VortexWriter.primitiveArrayLen(new long[]{1, 2, 3}, PType.I64)).isEqualTo(3); + assertThat(VortexWriter.primitiveArrayLen(new int[]{1, 2}, PType.I32)).isEqualTo(2); + } + + @Test + void readPrimitiveElement_returnsElementAtIndex() { + assertThat(VortexWriter.readPrimitiveElement(new long[]{7, 8, 9}, PType.I64, 1)).isEqualTo(8L); + assertThat(VortexWriter.readPrimitiveElement(new int[]{7, 8, 9}, PType.I32, 2)).isEqualTo(9); + } + + /// Builds a long[] with exactly `distinct` distinct values, each repeated 4× (so the array is + /// comfortably under the 50%-unique ratio gate and only the cardinality guard is in play). + private static long[] distinctThenRepeat(int distinct) { + long[] a = new long[distinct * 4]; + for (int i = 0; i < a.length; i++) { + a[i] = i % distinct; + } + return a; + } +} diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java index 3755788e..58f8d82c 100644 --- a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java +++ b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java @@ -438,4 +438,25 @@ void writeAndRead_columnProjection_returnsOnlyRequestedColumns(@TempDir Path tmp assertThat(snapshots.getFirst().columnNames()).doesNotContain("value"); } } + + @Test + void create_withWriteRegistry_roundTrips(@TempDir Path tmp) throws IOException { + // Given — the WriteRegistry factory overload (global dict forced off, encoders taken from + // the registry). A round-trip exercises this otherwise-untested create path end to end. + var schema = new DType.Struct(List.of("v"), + List.of(new DType.Primitive(PType.I64, false)), false); + Path file = tmp.resolve("registry.vortex"); + long[] data = {1L, 2L, 3L, 4L}; + + // When + try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + var sut = VortexWriter.create(ch, schema, WriteOptions.defaults(), WriteRegistry.loadAll())) { + sut.writeChunk(Map.of("v", data)); + } + + // Then + try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) { + assertThat(VortexReads.readAllLongs(vf, "v")).containsExactly(data); + } + } }