Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions writer/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
<param>io.github.dfa1.vortex.writer.ChunkImpl</param>
<param>io.github.dfa1.vortex.writer.WriteRegistry</param>
<param>io.github.dfa1.vortex.writer.WriteRegistry$Builder</param>
<param>io.github.dfa1.vortex.writer.VortexWriter</param>
</targetClasses>
</configuration>
</plugin>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ public final class VortexWriter implements Closeable {

// Columns with global cardinality below this threshold are dict-encoded across all chunks.
// Kept low: global dict hurts high-cardinality F64 columns (ALP codes beat U16 dict codes).
private static final int GLOBAL_DICT_MAX_CARDINALITY = 2_048;
static final int GLOBAL_DICT_MAX_CARDINALITY = 2_048;

private static final List<EncodingEncoder> DEFAULT_CODECS = List.of(
new AlpEncodingEncoder(), new PrimitiveEncodingEncoder(), new BoolEncodingEncoder(),
Expand Down Expand Up @@ -912,7 +912,7 @@ private static Object buildUtf8CodesArray(String[] strs, Map<String, Integer> va
};
}

private static boolean isUtf8DictCandidate(String[] data) {
static boolean isUtf8DictCandidate(String[] data) {
if (data.length == 0) {
return false;
}
Expand All @@ -926,7 +926,7 @@ private static boolean isUtf8DictCandidate(String[] data) {
return seen.size() * 2 < data.length;
}

private static boolean isDictCandidate(PType ptype, Object data) {
static boolean isDictCandidate(PType ptype, Object data) {
// F16/F32 excluded: no measured workload; ALP usually wins. F64 admitted: low-card
// F64 columns (taxi mta_tax/Airport_fee/extra) compress better via global dict +
// sparse-coded codes (matches Rust FloatDictScheme). Skip rule (cardinality / 2
Expand All @@ -953,7 +953,7 @@ private static boolean isDictCandidate(PType ptype, Object data) {
return seen.size() * 2 < n;
}

private static int primitiveArrayLen(Object data, PType ptype) {
static int primitiveArrayLen(Object data, PType ptype) {
return switch (ptype) {
case I8, U8 -> ((byte[]) data).length;
case I16, U16, F16 -> ((short[]) data).length;
Expand All @@ -964,7 +964,7 @@ private static int primitiveArrayLen(Object data, PType ptype) {
};
}

private static Object readPrimitiveElement(Object data, PType ptype, int i) {
static Object readPrimitiveElement(Object data, PType ptype, int i) {
return switch (ptype) {
case I8, U8 -> ((byte[]) data)[i];
case I16, U16, F16 -> ((short[]) data)[i];
Expand All @@ -975,7 +975,7 @@ private static Object readPrimitiveElement(Object data, PType ptype, int i) {
};
}

private static PType codePTypeForSize(int dictSize) {
static PType codePTypeForSize(int dictSize) {
if (dictSize <= 256) {
return PType.U8;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
package io.github.dfa1.vortex.writer;

import io.github.dfa1.vortex.core.DType;
import io.github.dfa1.vortex.core.PType;
import io.github.dfa1.vortex.reader.ReadRegistry;
import io.github.dfa1.vortex.reader.VortexReader;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.Map;

import static io.github.dfa1.vortex.writer.VortexReads.readAllDoubles;
import static io.github.dfa1.vortex.writer.VortexReads.readAllLongs;
import static org.assertj.core.api.Assertions.assertThat;

/// Global dictionary encoding for low-cardinality primitive columns — the integer/float
/// counterpart to [GlobalDictUtf8Test]. Exercises the primitive dict-candidate gate
/// (`isDictCandidate`), the unique-key and codes array construction, and the shared dictionary
/// build across chunks, all asserted by exact value round-trips.
class GlobalDictPrimitiveTest {

private static DType.Struct i64Schema() {
return new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I64, false)), false);
}

private static long[] writeI64(Path file, long[][] chunks, WriteOptions options) throws IOException {
try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
var sut = VortexWriter.create(ch, i64Schema(), options)) {
for (long[] chunk : chunks) {
sut.writeChunk(Map.of("v", chunk));
}
}
try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
return readAllLongs(vf, "v");
}
}

@Test
void lowCardinality_i64_acrossChunks_usesGlobalDict(@TempDir Path tmp) throws IOException {
// Given — 4 distinct longs cycled across 5 chunks: cardinality 4 with 50/50 distribution,
// well under the < 50%-unique gate, so the global dict path fires.
long[] dict = {100L, 200L, 300L, 400L};
int rowsPerChunk = 1_000;
int chunkCount = 5;
long[][] chunks = new long[chunkCount][rowsPerChunk];
long[] expected = new long[rowsPerChunk * chunkCount];
for (int c = 0; c < chunkCount; c++) {
for (int i = 0; i < rowsPerChunk; i++) {
long value = dict[(c + i) % dict.length];
chunks[c][i] = value;
expected[c * rowsPerChunk + i] = value;
}
}
Path file = tmp.resolve("low_i64.vortex");

// When
long[] result = writeI64(file, chunks, WriteOptions.cascading(3));

// Then — every value round-trips, and one shared U8-coded dict keeps the file tiny
assertThat(result).containsExactly(expected);
assertThat(Files.size(file)).as("global dict for 5k low-card longs").isLessThan(9_000L);
}

@Test
void highCardinality_i64_fallsBackToCascade(@TempDir Path tmp) throws IOException {
// Given — every value unique, so the cardinality/ratio gate rejects the dict path
int rows = 2_000;
long[] data = new long[rows];
for (int i = 0; i < rows; i++) {
data[i] = 1_000_000L + i;
}
Path file = tmp.resolve("high_i64.vortex");

// When
long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3));

// Then — correctness, not size: the fallback round-trips exactly
assertThat(result).containsExactly(data);
}

@Test
void singleValue_i64_fallsBackToConstant(@TempDir Path tmp) throws IOException {
// Given — one distinct value: dict overhead is pointless, so isDictCandidate returns false
// at the seen.size() == 1 guard and the column routes to vortex.constant instead.
int rows = 1_500;
long[] data = new long[rows];
java.util.Arrays.fill(data, 42L);
Path file = tmp.resolve("const_i64.vortex");

// When
long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3));

// Then
assertThat(result).containsExactly(data);
}

@Test
void cardinalityJustOverU8_i64_roundTrips(@TempDir Path tmp) throws IOException {
// Given — 300 distinct values over 1000 rows: above the 256 U8-code boundary (codes need
// U16) and still under the 50%-unique gate, exercising the wider code-ptype path.
int rows = 1_000;
int cardinality = 300;
long[] data = new long[rows];
for (int i = 0; i < rows; i++) {
data[i] = i % cardinality;
}
Path file = tmp.resolve("u16_i64.vortex");

// When
long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3));

// Then
assertThat(result).containsExactly(data);
}

@Test
void lowCardinality_i32_usesGlobalDict(@TempDir Path tmp) throws IOException {
// Given — a low-cardinality I32 column drives the global dict build through the int[]
// unique-array and codes paths (the narrower I8/I16 carriers are NOT round-tripped here:
// the reader's lazy dict rejects them — see lowCardinality_i16_globalDict_readerRejects).
var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I32, false)), false);
int[] data = {10, 20, 30, 10, 20, 30, 10, 20};
Path file = tmp.resolve("i32.vortex");
try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) {
sut.writeChunk(Map.of("v", data));
}

// When
int[] result;
try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
result = io.github.dfa1.vortex.writer.VortexReads.readAllInts(vf, "v");
}

// Then
assertThat(result).containsExactly(data);
}

@Test
void lowCardinality_i16_globalDict_readerRejects(@TempDir Path tmp) throws IOException {
// Documents a real write/read incompatibility surfaced by mutation coverage: the writer's
// global dict admits I8/I16 columns (isDictCandidate), but the reader's lazy dict decode
// only supports I32/I64/F64 — reading back throws "unsupported ptype for lazy dict: I16".
// Pinning it here makes the gap explicit; fixing it belongs to the reader's dict decode.
var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I16, false)), false);
short[] data = {1, 2, 3, 1, 2, 3, 1, 2};
Path file = tmp.resolve("i16.vortex");
try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) {
sut.writeChunk(Map.of("v", data));
}

// When / Then — the round-trip is not yet supported; assert the current behaviour
try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
org.assertj.core.api.Assertions.assertThatThrownBy(() -> VortexReads.readAllInts(vf, "v"))
.isInstanceOf(io.github.dfa1.vortex.core.VortexException.class)
.hasMessageContaining("unsupported ptype for lazy dict");
}
}

@Test
void lowCardinality_f64_usesGlobalDict(@TempDir Path tmp) throws IOException {
// Given — F64 is admitted to the dict path (unlike F16/F32); a low-card float column must
// round-trip through the float dict build.
var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.F64, false)), false);
double[] dictVals = {1.5, 2.5, 3.5};
int rows = 900;
double[] data = new double[rows];
for (int i = 0; i < rows; i++) {
data[i] = dictVals[i % dictVals.length];
}
Path file = tmp.resolve("low_f64.vortex");

// When
double[] result;
try (var ch = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
var sut = VortexWriter.create(ch, schema, WriteOptions.cascading(3))) {
sut.writeChunk(Map.of("v", data));
}
try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
result = readAllDoubles(vf, "v");
}

// Then
assertThat(result).containsExactly(data);
}

@Test
void i64_globalDictDisabled_roundTrips(@TempDir Path tmp) throws IOException {
// Given — low-card column but globalDict() off: must fall back to per-chunk encoding and
// still round-trip, guarding the opt-out branch.
long[] data = new long[600];
for (int i = 0; i < data.length; i++) {
data[i] = i % 3;
}
Path file = tmp.resolve("nogdict_i64.vortex");

// When
long[] result = writeI64(file, new long[][]{data}, WriteOptions.cascading(3).withGlobalDict(false));

// Then
assertThat(result).containsExactly(data);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
package io.github.dfa1.vortex.writer;

import io.github.dfa1.vortex.core.PType;
import org.junit.jupiter.api.Test;

import static io.github.dfa1.vortex.writer.VortexWriter.GLOBAL_DICT_MAX_CARDINALITY;
import static org.assertj.core.api.Assertions.assertThat;

/// Direct unit tests for the global-dictionary decision helpers in [VortexWriter]. These choices
/// (dict vs cascade fallback, and the code width) only affect *encoding*, not the values a reader
/// gets back — so a round-trip cannot pin their boundaries. Testing the pure predicates directly
/// fixes each cardinality / ratio edge.
class VortexWriterDictDecisionTest {

// ── isDictCandidate (primitive) ──────────────────────────────────────────────

@Test
void isDictCandidate_excludesF16AndF32() {
// Given low-cardinality float data that would otherwise qualify
// When / Then — F16/F32 are excluded outright (ALP wins there)
assertThat(VortexWriter.isDictCandidate(PType.F32, new float[]{1f, 1f, 2f, 2f, 2f})).isFalse();
assertThat(VortexWriter.isDictCandidate(PType.F16, new short[]{1, 1, 2, 2, 2})).isFalse();
}

@Test
void isDictCandidate_admitsLowCardinalityI64AndF64() {
// Given — 2 distinct over 5 rows: 2*2 = 4 < 5, under the 50%-unique gate
assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2, 1})).isTrue();
assertThat(VortexWriter.isDictCandidate(PType.F64, new double[]{1, 2, 1, 2, 1})).isTrue();
}

@Test
void isDictCandidate_emptyArray_isFalse() {
assertThat(VortexWriter.isDictCandidate(PType.I64, new long[0])).isFalse();
}

@Test
void isDictCandidate_singleDistinctValue_isFalse() {
// One distinct value fits vortex.constant better than a dict
assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{7, 7, 7, 7})).isFalse();
}

@Test
void isDictCandidate_ratioGate_isExclusive() {
// 2 distinct over 4 rows: 2*2 == 4, NOT < 4 → rejected (exactly 50% unique)
assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2})).isFalse();
// 2 distinct over 5 rows: 4 < 5 → admitted
assertThat(VortexWriter.isDictCandidate(PType.I64, new long[]{1, 2, 1, 2, 1})).isTrue();
}

@Test
void isDictCandidate_cardinalityAtAndOverMax() {
// At MAX distinct values (well under 50% unique) → still a candidate
assertThat(VortexWriter.isDictCandidate(PType.I64, distinctThenRepeat(GLOBAL_DICT_MAX_CARDINALITY))).isTrue();
// One over MAX → rejected by the cardinality guard
assertThat(VortexWriter.isDictCandidate(PType.I64, distinctThenRepeat(GLOBAL_DICT_MAX_CARDINALITY + 1))).isFalse();
}

// ── isUtf8DictCandidate ──────────────────────────────────────────────────────

@Test
void isUtf8DictCandidate_emptyArray_isFalse() {
assertThat(VortexWriter.isUtf8DictCandidate(new String[0])).isFalse();
}

@Test
void isUtf8DictCandidate_ratioGate_isExclusive() {
// 2 distinct over 4: 4 == 4, not < → rejected
assertThat(VortexWriter.isUtf8DictCandidate(new String[]{"a", "b", "a", "b"})).isFalse();
// 2 distinct over 5: 4 < 5 → admitted
assertThat(VortexWriter.isUtf8DictCandidate(new String[]{"a", "b", "a", "b", "a"})).isTrue();
}

@Test
void isUtf8DictCandidate_overMaxCardinality_isFalse() {
String[] data = new String[(GLOBAL_DICT_MAX_CARDINALITY + 1) * 4];
for (int i = 0; i < data.length; i++) {
data[i] = "s" + (i % (GLOBAL_DICT_MAX_CARDINALITY + 1));
}
assertThat(VortexWriter.isUtf8DictCandidate(data)).isFalse();
}

// ── codePTypeForSize ─────────────────────────────────────────────────────────

@Test
void codePTypeForSize_picksNarrowestUnsignedCarrier() {
assertThat(VortexWriter.codePTypeForSize(1)).isEqualTo(PType.U8);
assertThat(VortexWriter.codePTypeForSize(256)).isEqualTo(PType.U8); // upper edge of U8
assertThat(VortexWriter.codePTypeForSize(257)).isEqualTo(PType.U16); // first U16
assertThat(VortexWriter.codePTypeForSize(65_536)).isEqualTo(PType.U16); // upper edge of U16
assertThat(VortexWriter.codePTypeForSize(65_537)).isEqualTo(PType.U32); // first U32
}

// ── primitiveArrayLen / readPrimitiveElement ─────────────────────────────────

@Test
void primitiveArrayLen_returnsActualLength() {
assertThat(VortexWriter.primitiveArrayLen(new long[]{1, 2, 3}, PType.I64)).isEqualTo(3);
assertThat(VortexWriter.primitiveArrayLen(new int[]{1, 2}, PType.I32)).isEqualTo(2);
}

@Test
void readPrimitiveElement_returnsElementAtIndex() {
assertThat(VortexWriter.readPrimitiveElement(new long[]{7, 8, 9}, PType.I64, 1)).isEqualTo(8L);
assertThat(VortexWriter.readPrimitiveElement(new int[]{7, 8, 9}, PType.I32, 2)).isEqualTo(9);
}

/// Builds a long[] with exactly `distinct` distinct values, each repeated 4× (so the array is
/// comfortably under the 50%-unique ratio gate and only the cardinality guard is in play).
private static long[] distinctThenRepeat(int distinct) {
long[] a = new long[distinct * 4];
for (int i = 0; i < a.length; i++) {
a[i] = i % distinct;
}
return a;
}
}
Loading
Loading