diff --git a/writer/pom.xml b/writer/pom.xml
index f80bf6c9..64d78034 100644
--- a/writer/pom.xml
+++ b/writer/pom.xml
@@ -81,6 +81,7 @@
io.github.dfa1.vortex.writer.ChunkImpl
io.github.dfa1.vortex.writer.WriteRegistry
io.github.dfa1.vortex.writer.WriteRegistry$Builder
+ io.github.dfa1.vortex.writer.VortexWriter
diff --git a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java
index f12275a4..e2298cf7 100644
--- a/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java
+++ b/writer/src/main/java/io/github/dfa1/vortex/writer/VortexWriter.java
@@ -78,7 +78,7 @@ public final class VortexWriter implements Closeable {
// Columns with global cardinality below this threshold are dict-encoded across all chunks.
// Kept low: global dict hurts high-cardinality F64 columns (ALP codes beat U16 dict codes).
- private static final int GLOBAL_DICT_MAX_CARDINALITY = 2_048;
+ static final int GLOBAL_DICT_MAX_CARDINALITY = 2_048;
private static final List DEFAULT_CODECS = List.of(
new AlpEncodingEncoder(), new PrimitiveEncodingEncoder(), new BoolEncodingEncoder(),
@@ -912,7 +912,7 @@ private static Object buildUtf8CodesArray(String[] strs, Map va
};
}
- private static boolean isUtf8DictCandidate(String[] data) {
+ static boolean isUtf8DictCandidate(String[] data) {
if (data.length == 0) {
return false;
}
@@ -926,7 +926,7 @@ private static boolean isUtf8DictCandidate(String[] data) {
return seen.size() * 2 < data.length;
}
- private static boolean isDictCandidate(PType ptype, Object data) {
+ static boolean isDictCandidate(PType ptype, Object data) {
// F16/F32 excluded: no measured workload; ALP usually wins. F64 admitted: low-card
// F64 columns (taxi mta_tax/Airport_fee/extra) compress better via global dict +
// sparse-coded codes (matches Rust FloatDictScheme). Skip rule (cardinality / 2
@@ -953,7 +953,7 @@ private static boolean isDictCandidate(PType ptype, Object data) {
return seen.size() * 2 < n;
}
- private static int primitiveArrayLen(Object data, PType ptype) {
+ static int primitiveArrayLen(Object data, PType ptype) {
return switch (ptype) {
case I8, U8 -> ((byte[]) data).length;
case I16, U16, F16 -> ((short[]) data).length;
@@ -964,7 +964,7 @@ private static int primitiveArrayLen(Object data, PType ptype) {
};
}
- private static Object readPrimitiveElement(Object data, PType ptype, int i) {
+ static Object readPrimitiveElement(Object data, PType ptype, int i) {
return switch (ptype) {
case I8, U8 -> ((byte[]) data)[i];
case I16, U16, F16 -> ((short[]) data)[i];
@@ -975,7 +975,7 @@ private static Object readPrimitiveElement(Object data, PType ptype, int i) {
};
}
- private static PType codePTypeForSize(int dictSize) {
+ static PType codePTypeForSize(int dictSize) {
if (dictSize <= 256) {
return PType.U8;
}
diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java
new file mode 100644
index 00000000..c4b133da
--- /dev/null
+++ b/writer/src/test/java/io/github/dfa1/vortex/writer/GlobalDictPrimitiveTest.java
@@ -0,0 +1,210 @@
+package io.github.dfa1.vortex.writer;
+
+import io.github.dfa1.vortex.core.DType;
+import io.github.dfa1.vortex.core.PType;
+import io.github.dfa1.vortex.reader.ReadRegistry;
+import io.github.dfa1.vortex.reader.VortexReader;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.List;
+import java.util.Map;
+
+import static io.github.dfa1.vortex.writer.VortexReads.readAllDoubles;
+import static io.github.dfa1.vortex.writer.VortexReads.readAllLongs;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/// Global dictionary encoding for low-cardinality primitive columns — the integer/float
+/// counterpart to [GlobalDictUtf8Test]. Exercises the primitive dict-candidate gate
+/// (`isDictCandidate`), the unique-key and codes array construction, and the shared dictionary
+/// build across chunks, all asserted by exact value round-trips.
+class GlobalDictPrimitiveTest {
+
+ private static DType.Struct i64Schema() {
+ return new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I64, false)), false);
+ }
+
+ private static long[] writeI64(Path file, long[][] chunks, WriteOptions options) throws IOException {
+ try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
+ var sut = VortexWriter.create(ch, i64Schema(), options)) {
+ for (long[] chunk : chunks) {
+ sut.writeChunk(Map.of("v", chunk));
+ }
+ }
+ try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
+ return readAllLongs(vf, "v");
+ }
+ }
+
+ @Test
+ void lowCardinality_i64_acrossChunks_usesGlobalDict(@TempDir Path tmp) throws IOException {
+ // Given — 4 distinct longs cycled across 5 chunks: cardinality 4 with 50/50 distribution,
+ // well under the < 50%-unique gate, so the global dict path fires.
+ long[] dict = {100L, 200L, 300L, 400L};
+ int rowsPerChunk = 1_000;
+ int chunkCount = 5;
+ long[][] chunks = new long[chunkCount][rowsPerChunk];
+ long[] expected = new long[rowsPerChunk * chunkCount];
+ for (int c = 0; c < chunkCount; c++) {
+ for (int i = 0; i < rowsPerChunk; i++) {
+ long value = dict[(c + i) % dict.length];
+ chunks[c][i] = value;
+ expected[c * rowsPerChunk + i] = value;
+ }
+ }
+ Path file = tmp.resolve("low_i64.vortex");
+
+ // When
+ long[] result = writeI64(file, chunks, WriteOptions.cascading(3));
+
+ // Then — every value round-trips, and one shared U8-coded dict keeps the file tiny
+ assertThat(result).containsExactly(expected);
+ assertThat(Files.size(file)).as("global dict for 5k low-card longs").isLessThan(9_000L);
+ }
+
+ @Test
+ void highCardinality_i64_fallsBackToCascade(@TempDir Path tmp) throws IOException {
+ // Given — every value unique, so the cardinality/ratio gate rejects the dict path
+ int rows = 2_000;
+ long[] data = new long[rows];
+ for (int i = 0; i < rows; i++) {
+ data[i] = 1_000_000L + i;
+ }
+ Path file = tmp.resolve("high_i64.vortex");
+
+ // When
+ long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3));
+
+ // Then — correctness, not size: the fallback round-trips exactly
+ assertThat(result).containsExactly(data);
+ }
+
+ @Test
+ void singleValue_i64_fallsBackToConstant(@TempDir Path tmp) throws IOException {
+ // Given — one distinct value: dict overhead is pointless, so isDictCandidate returns false
+ // at the seen.size() == 1 guard and the column routes to vortex.constant instead.
+ int rows = 1_500;
+ long[] data = new long[rows];
+ java.util.Arrays.fill(data, 42L);
+ Path file = tmp.resolve("const_i64.vortex");
+
+ // When
+ long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3));
+
+ // Then
+ assertThat(result).containsExactly(data);
+ }
+
+ @Test
+ void cardinalityJustOverU8_i64_roundTrips(@TempDir Path tmp) throws IOException {
+ // Given — 300 distinct values over 1000 rows: above the 256 U8-code boundary (codes need
+ // U16) and still under the 50%-unique gate, exercising the wider code-ptype path.
+ int rows = 1_000;
+ int cardinality = 300;
+ long[] data = new long[rows];
+ for (int i = 0; i < rows; i++) {
+ data[i] = i % cardinality;
+ }
+ Path file = tmp.resolve("u16_i64.vortex");
+
+ // When
+ long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3));
+
+ // Then
+ assertThat(result).containsExactly(data);
+ }
+
+ @Test
+ void lowCardinality_i32_usesGlobalDict(@TempDir Path tmp) throws IOException {
+ // Given — a low-cardinality I32 column drives the global dict build through the int[]
+ // unique-array and codes paths (the narrower I8/I16 carriers are NOT round-tripped here:
+ // the reader's lazy dict rejects them — see lowCardinality_i16_globalDict_readerRejects).
+ var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I32, false)), false);
+ int[] data = {10, 20, 30, 10, 20, 30, 10, 20};
+ Path file = tmp.resolve("i32.vortex");
+ try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
+ var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) {
+ sut.writeChunk(Map.of("v", data));
+ }
+
+ // When
+ int[] result;
+ try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
+ result = io.github.dfa1.vortex.writer.VortexReads.readAllInts(vf, "v");
+ }
+
+ // Then
+ assertThat(result).containsExactly(data);
+ }
+
+ @Test
+ void lowCardinality_i16_globalDict_readerRejects(@TempDir Path tmp) throws IOException {
+ // Documents a real write/read incompatibility surfaced by mutation coverage: the writer's
+ // global dict admits I8/I16 columns (isDictCandidate), but the reader's lazy dict decode
+ // only supports I32/I64/F64 — reading back throws "unsupported ptype for lazy dict: I16".
+ // Pinning it here makes the gap explicit; fixing it belongs to the reader's dict decode.
+ var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I16, false)), false);
+ short[] data = {1, 2, 3, 1, 2, 3, 1, 2};
+ Path file = tmp.resolve("i16.vortex");
+ try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
+ var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) {
+ sut.writeChunk(Map.of("v", data));
+ }
+
+ // When / Then — the round-trip is not yet supported; assert the current behaviour
+ try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
+ org.assertj.core.api.Assertions.assertThatThrownBy(() -> VortexReads.readAllInts(vf, "v"))
+ .isInstanceOf(io.github.dfa1.vortex.core.VortexException.class)
+ .hasMessageContaining("unsupported ptype for lazy dict");
+ }
+ }
+
+ @Test
+ void lowCardinality_f64_usesGlobalDict(@TempDir Path tmp) throws IOException {
+ // Given — F64 is admitted to the dict path (unlike F16/F32); a low-card float column must
+ // round-trip through the float dict build.
+ var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.F64, false)), false);
+ double[] dictVals = {1.5, 2.5, 3.5};
+ int rows = 900;
+ double[] data = new double[rows];
+ for (int i = 0; i < rows; i++) {
+ data[i] = dictVals[i % dictVals.length];
+ }
+ Path file = tmp.resolve("low_f64.vortex");
+
+ // When
+ double[] result;
+ try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
+ var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) {
+ sut.writeChunk(Map.of("v", data));
+ }
+ try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
+ result = readAllDoubles(vf, "v");
+ }
+
+ // Then
+ assertThat(result).containsExactly(data);
+ }
+
+ @Test
+ void i64_globalDictDisabled_roundTrips(@TempDir Path tmp) throws IOException {
+ // Given — low-card column but globalDict() off: must fall back to per-chunk encoding and
+ // still round-trip, guarding the opt-out branch.
+ long[] data = new long[600];
+ for (int i = 0; i < data.length; i++) {
+ data[i] = i % 3;
+ }
+ Path file = tmp.resolve("nogdict_i64.vortex");
+
+ // When
+ long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3).withGlobalDict(false));
+
+ // Then
+ assertThat(result).containsExactly(data);
+ }
+}
diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java
new file mode 100644
index 00000000..c96098ff
--- /dev/null
+++ b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterDictDecisionTest.java
@@ -0,0 +1,117 @@
+package io.github.dfa1.vortex.writer;
+
+import io.github.dfa1.vortex.core.PType;
+import org.junit.jupiter.api.Test;
+
+import static io.github.dfa1.vortex.writer.VortexWriter.GLOBAL_DICT_MAX_CARDINALITY;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/// Direct unit tests for the global-dictionary decision helpers in [VortexWriter]. These choices
+/// (dict vs cascade fallback, and the code width) only affect *encoding*, not the values a reader
+/// gets back — so a round-trip cannot pin their boundaries. Testing the pure predicates directly
+/// fixes each cardinality / ratio edge.
+class VortexWriterDictDecisionTest {
+
+ // ── isDictCandidate (primitive) ──────────────────────────────────────────────
+
+ @Test
+ void isDictCandidate_excludesF16AndF32() {
+ // Given low-cardinality float data that would otherwise qualify
+ // When / Then — F16/F32 are excluded outright (ALP wins there)
+ assertThat(VortexWriter.isDictCandidate(PType.F32, new float[]{1f, 1f, 2f, 2f, 2f})).isFalse();
+ assertThat(VortexWriter.isDictCandidate(PType.F16, new short[]{1, 1, 2, 2, 2})).isFalse();
+ }
+
+ @Test
+ void isDictCandidate_admitsLowCardinalityI64AndF64() {
+ // Given — 2 distinct over 5 rows: 2*2 = 4 < 5, under the 50%-unique gate
+ assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2, 1})).isTrue();
+ assertThat(VortexWriter.isDictCandidate(PType.F64, new double[]{1, 2, 1, 2, 1})).isTrue();
+ }
+
+ @Test
+ void isDictCandidate_emptyArray_isFalse() {
+ assertThat(VortexWriter.isDictCandidate(PType.I64, new long[0])).isFalse();
+ }
+
+ @Test
+ void isDictCandidate_singleDistinctValue_isFalse() {
+ // One distinct value fits vortex.constant better than a dict
+ assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{7, 7, 7, 7})).isFalse();
+ }
+
+ @Test
+ void isDictCandidate_ratioGate_isExclusive() {
+ // 2 distinct over 4 rows: 2*2 == 4, NOT < 4 → rejected (exactly 50% unique)
+ assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2})).isFalse();
+ // 2 distinct over 5 rows: 4 < 5 → admitted
+ assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2, 1})).isTrue();
+ }
+
+ @Test
+ void isDictCandidate_cardinalityAtAndOverMax() {
+ // At MAX distinct values (well under 50% unique) → still a candidate
+ assertThat(VortexWriter.isDictCandidate(PType.I64, distinctThenRepeat(GLOBAL_DICT_MAX_CARDINALITY))).isTrue();
+ // One over MAX → rejected by the cardinality guard
+ assertThat(VortexWriter.isDictCandidate(PType.I64, distinctThenRepeat(GLOBAL_DICT_MAX_CARDINALITY + 1))).isFalse();
+ }
+
+ // ── isUtf8DictCandidate ──────────────────────────────────────────────────────
+
+ @Test
+ void isUtf8DictCandidate_emptyArray_isFalse() {
+ assertThat(VortexWriter.isUtf8DictCandidate(new String[0])).isFalse();
+ }
+
+ @Test
+ void isUtf8DictCandidate_ratioGate_isExclusive() {
+ // 2 distinct over 4: 4 == 4, not < → rejected
+ assertThat(VortexWriter.isUtf8DictCandidate(new String[]{"a", "b", "a", "b"})).isFalse();
+ // 2 distinct over 5: 4 < 5 → admitted
+ assertThat(VortexWriter.isUtf8DictCandidate(new String[]{"a", "b", "a", "b", "a"})).isTrue();
+ }
+
+ @Test
+ void isUtf8DictCandidate_overMaxCardinality_isFalse() {
+ String[] data = new String[(GLOBAL_DICT_MAX_CARDINALITY + 1) * 4];
+ for (int i = 0; i < data.length; i++) {
+ data[i] = "s" + (i % (GLOBAL_DICT_MAX_CARDINALITY + 1));
+ }
+ assertThat(VortexWriter.isUtf8DictCandidate(data)).isFalse();
+ }
+
+ // ── codePTypeForSize ─────────────────────────────────────────────────────────
+
+ @Test
+ void codePTypeForSize_picksNarrowestUnsignedCarrier() {
+ assertThat(VortexWriter.codePTypeForSize(1)).isEqualTo(PType.U8);
+ assertThat(VortexWriter.codePTypeForSize(256)).isEqualTo(PType.U8); // upper edge of U8
+ assertThat(VortexWriter.codePTypeForSize(257)).isEqualTo(PType.U16); // first U16
+ assertThat(VortexWriter.codePTypeForSize(65_536)).isEqualTo(PType.U16); // upper edge of U16
+ assertThat(VortexWriter.codePTypeForSize(65_537)).isEqualTo(PType.U32); // first U32
+ }
+
+ // ── primitiveArrayLen / readPrimitiveElement ─────────────────────────────────
+
+ @Test
+ void primitiveArrayLen_returnsActualLength() {
+ assertThat(VortexWriter.primitiveArrayLen(new long[]{1, 2, 3}, PType.I64)).isEqualTo(3);
+ assertThat(VortexWriter.primitiveArrayLen(new int[]{1, 2}, PType.I32)).isEqualTo(2);
+ }
+
+ @Test
+ void readPrimitiveElement_returnsElementAtIndex() {
+ assertThat(VortexWriter.readPrimitiveElement(new long[]{7, 8, 9}, PType.I64, 1)).isEqualTo(8L);
+ assertThat(VortexWriter.readPrimitiveElement(new int[]{7, 8, 9}, PType.I32, 2)).isEqualTo(9);
+ }
+
+ /// Builds a long[] with exactly `distinct` distinct values, each repeated 4× (so the array is
+ /// comfortably under the 50%-unique ratio gate and only the cardinality guard is in play).
+ private static long[] distinctThenRepeat(int distinct) {
+ long[] a = new long[distinct * 4];
+ for (int i = 0; i < a.length; i++) {
+ a[i] = i % distinct;
+ }
+ return a;
+ }
+}
diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java
index 3755788e..58f8d82c 100644
--- a/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java
+++ b/writer/src/test/java/io/github/dfa1/vortex/writer/VortexWriterTest.java
@@ -438,4 +438,25 @@ void writeAndRead_columnProjection_returnsOnlyRequestedColumns(@TempDir Path tmp
assertThat(snapshots.getFirst().columnNames()).doesNotContain("value");
}
}
+
+ @Test
+ void create_withWriteRegistry_roundTrips(@TempDir Path tmp) throws IOException {
+ // Given — the WriteRegistry factory overload (global dict forced off, encoders taken from
+ // the registry). A round-trip exercises this otherwise-untested create path end to end.
+ var schema = new DType.Struct(List.of("v"),
+ List.of(new DType.Primitive(PType.I64, false)), false);
+ Path file = tmp.resolve("registry.vortex");
+ long[] data = {1L, 2L, 3L, 4L};
+
+ // When
+ try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
+ var sut = VortexWriter.create(ch, schema, WriteOptions.defaults(), WriteRegistry.loadAll())) {
+ sut.writeChunk(Map.of("v", data));
+ }
+
+ // Then
+ try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
+ assertThat(VortexReads.readAllLongs(vf, "v")).containsExactly(data);
+ }
+ }
}