Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.Float2Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.SmallIntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowReader;
import org.apache.arrow.vector.types.FloatingPointPrecision;
Expand Down Expand Up @@ -390,6 +391,62 @@ void jniWriter_javaReader_fewUniqueF64Values(@TempDir Path tmp) throws IOExcepti
}
}

private static final Schema I16_SCHEMA = new Schema(List.of(
Field.notNullable("v", new ArrowType.Int(16, true))
));

private static void writeJniI16(Path file, short[] vals) throws IOException {
String uri = file.toAbsolutePath().toUri().toString();
try (VortexWriter writer = VortexWriter.create(SESSION, uri, I16_SCHEMA, new HashMap<>(), ALLOCATOR);
VectorSchemaRoot root = VectorSchemaRoot.create(I16_SCHEMA, ALLOCATOR)) {
SmallIntVector vec = (SmallIntVector) root.getVector("v");
vec.allocateNew(vals.length);
for (int i = 0; i < vals.length; i++) {
vec.setSafe(i, vals[i]);
}
root.setRowCount(vals.length);
try (ArrowArray arr = ArrowArray.allocateNew(ALLOCATOR);
ArrowSchema schema = ArrowSchema.allocateNew(ALLOCATOR)) {
Data.exportVectorSchemaRoot(ALLOCATOR, root, null, arr, schema);
writer.writeBatch(arr.memoryAddress(), schema.memoryAddress());
}
}
}

@Test
void jniWriter_javaReader_lowCardinalityI16(@TempDir Path tmp) throws IOException {
// Given — 10_000 rows cycling 3 unique I16 values. Ground-truth cross-check: does the
// Rust/JNI compressor dict-encode a low-cardinality I16 column, and can the Java reader
// read it back? (The Java writer's global dict admitted I16 but the Java reader rejected
// it with "unsupported ptype for lazy dict: I16".) If Rust dicts I16 and Java cannot read
// it, the bug is reader-side and this test fails with that exact message.
int n = 10_000;
short[] unique = {7, 8, 9};
short[] vals = new short[n];
for (int i = 0; i < n; i++) {
vals[i] = unique[i % unique.length];
}
Path file = tmp.resolve("jni_i16_lowcard.vtx");
writeJniI16(file, vals);

// When / Then — must round-trip exactly
try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
List<JavaChunk> results = scanAll(vf, io.github.dfa1.vortex.reader.ScanOptions.columns("v"));
long total = results.stream().mapToLong(JavaChunk::rowCount).sum();
assertThat(total).isEqualTo(n);
var got = new ArrayList<Short>();
for (JavaChunk r : results) {
short[] decoded = (short[]) r.columns().get("v");
for (short s : decoded) {
got.add(s);
}
}
for (int i = 0; i < n; i++) {
assertThat(got.get(i)).as("row %d", i).isEqualTo(unique[i % unique.length]);
}
}
}

// ── S3 helpers ────────────────────────────────────────────────────────────

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -927,11 +927,18 @@ static boolean isUtf8DictCandidate(String[] data) {
}

static boolean isDictCandidate(PType ptype, Object data) {
// F16/F32 excluded: no measured workload; ALP usually wins. F64 admitted: low-card
// F64 columns (taxi mta_tax/Airport_fee/extra) compress better via global dict +
// sparse-coded codes (matches Rust FloatDictScheme). Skip rule (cardinality / 2
// below) mirrors Rust's >50%-distinct skip.
if (ptype == PType.F16 || ptype == PType.F32) {
// Only the carriers the reader's lazy dict decode supports (I32/I64/F64) are admitted.
// - I8/U8/I16/U16 excluded: dict gives little/no benefit (a U8/U16 code is no smaller
// than the value), the Rust compressor does not dict them either (verified by
// RustWritesJavaReadsIntegrationTest#jniWriter_javaReader_lowCardinalityI16), and the
// reader cannot decode a narrow-int dict — emitting one produced an unreadable file.
// - F16/F32 excluded: no measured workload; ALP usually wins.
// F64 admitted: low-card F64 columns (taxi mta_tax/Airport_fee/extra) compress better via
// global dict + sparse-coded codes (matches Rust FloatDictScheme). The skip rule
// (cardinality / 2 below) mirrors Rust's >50%-distinct skip.
if (ptype == PType.I8 || ptype == PType.U8
|| ptype == PType.I16 || ptype == PType.U16
|| ptype == PType.F16 || ptype == PType.F32) {
return false;
}
int n = primitiveArrayLen(data, ptype);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,12 @@ void lowCardinality_i32_usesGlobalDict(@TempDir Path tmp) throws IOException {
}

@Test
void lowCardinality_i16_globalDict_readerRejects(@TempDir Path tmp) throws IOException {
// Documents a real write/read incompatibility surfaced by mutation coverage: the writer's
// global dict admits I8/I16 columns (isDictCandidate), but the reader's lazy dict decode
// only supports I32/I64/F64 — reading back throws "unsupported ptype for lazy dict: I16".
// Pinning it here makes the gap explicit; fixing it belongs to the reader's dict decode.
void lowCardinality_i16_notDicted_roundTrips(@TempDir Path tmp) throws IOException {
// Regression: the writer used to admit I16 to the global dict, producing a vortex.dict
// column the reader cannot decode ("unsupported ptype for lazy dict: I16"). I8/I16 are now
// excluded from dict candidacy — matching the Rust compressor, which does not dict narrow
// ints (RustWritesJavaReadsIntegrationTest#jniWriter_javaReader_lowCardinalityI16) — so a
// low-card I16 column encodes via the cascade and round-trips cleanly.
var schema = new DType.Struct(List.of("v"), List.of(new DType.Primitive(PType.I16, false)), false);
short[] data = {1, 2, 3, 1, 2, 3, 1, 2};
Path file = tmp.resolve("i16.vortex");
Expand All @@ -156,12 +157,25 @@ void lowCardinality_i16_globalDict_readerRejects(@TempDir Path tmp) throws IOExc
sut.writeChunk(Map.of("v", data));
}

// When / Then — the round-trip is not yet supported; assert the current behaviour
try (var vf = VortexReader.open(file, ReadRegistry.loadAll())) {
org.assertj.core.api.Assertions.assertThatThrownBy(() -> VortexReads.readAllInts(vf, "v"))
.isInstanceOf(io.github.dfa1.vortex.core.VortexException.class)
.hasMessageContaining("unsupported ptype for lazy dict");
// When — I16 now encodes via the cascade (a ShortArray view), not a dict
short[] result;
try (var vf = VortexReader.open(file, ReadRegistry.loadAll());
var iter = vf.scan(io.github.dfa1.vortex.reader.ScanOptions.columns("v"))) {
var out = new java.util.ArrayList<Short>();
iter.forEachRemaining(c -> {
var arr = (io.github.dfa1.vortex.reader.array.ShortArray) c.columns().get("v");
for (long i = 0; i < arr.length(); i++) {
out.add(arr.getShort(i));
}
});
result = new short[out.size()];
for (int i = 0; i < result.length; i++) {
result[i] = out.get(i);
}
}

// Then — reads back exactly, no lazy-dict error
assertThat(result).containsExactly(data);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,18 @@ void isDictCandidate_excludesF16AndF32() {
assertThat(VortexWriter.isDictCandidate(PType.F16, new short[]{1, 1, 2, 2, 2})).isFalse();
}

@Test
void isDictCandidate_excludesNarrowIntegers() {
// I8/U8/I16/U16 are excluded: a U8/U16 code is no smaller than the value, the Rust
// compressor does not dict them, and the reader's lazy dict cannot decode a narrow-int
// dictionary. Low-cardinality data that would otherwise pass the ratio gate must still
// be rejected.
assertThat(VortexWriter.isDictCandidate(PType.I8, new byte[]{1, 2, 1, 2, 1})).isFalse();
assertThat(VortexWriter.isDictCandidate(PType.U8, new byte[]{1, 2, 1, 2, 1})).isFalse();
assertThat(VortexWriter.isDictCandidate(PType.I16, new short[]{1, 2, 1, 2, 1})).isFalse();
assertThat(VortexWriter.isDictCandidate(PType.U16, new short[]{1, 2, 1, 2, 1})).isFalse();
}

@Test
void isDictCandidate_admitsLowCardinalityI64AndF64() {
// Given — 2 distinct over 5 rows: 2*2 = 4 < 5, under the 50%-unique gate
Expand Down