diff --git a/isthmus/src/main/java/io/substrait/isthmus/OuterReferenceResolver.java b/isthmus/src/main/java/io/substrait/isthmus/OuterReferenceResolver.java index 8071468c0..76b20c8e1 100644 --- a/isthmus/src/main/java/io/substrait/isthmus/OuterReferenceResolver.java +++ b/isthmus/src/main/java/io/substrait/isthmus/OuterReferenceResolver.java @@ -66,6 +66,27 @@ public RelNode visit(Filter filter) throws RuntimeException { for (CorrelationId id : filter.getVariablesSet()) { nestedDepth.putIfAbsent(id, 0); } + // Also register any CorrelationIds referenced directly in the condition expression. + // This covers partially-decorrelated plans where a Calcite optimizer (e.g. HepPlanner) + // rewrites correlated IN-subqueries into joins but leaves scalar-aggregate subquery + // correlations as RexFieldAccess($corN.col) inside a Filter condition without a + // surrounding Correlate node. In that case getVariablesSet() is empty but the condition + // still contains RexCorrelVariable references whose IDs must be in nestedDepth before + // rexVisitor.visitFieldAccess() is called. + filter + .getCondition() + .accept( + new RexShuttle() { + @Override + public RexNode visitFieldAccess(RexFieldAccess fieldAccess) { + if (fieldAccess.getReferenceExpr() instanceof RexCorrelVariable) { + CorrelationId id = + ((RexCorrelVariable) fieldAccess.getReferenceExpr()).id; + nestedDepth.putIfAbsent(id, 0); + } + return fieldAccess; + } + }); filter.getCondition().accept(rexVisitor); return super.visit(filter); } diff --git a/isthmus/src/test/java/io/substrait/isthmus/OuterReferenceResolverTest.java b/isthmus/src/test/java/io/substrait/isthmus/OuterReferenceResolverTest.java index a804aafb1..aa151b007 100644 --- a/isthmus/src/test/java/io/substrait/isthmus/OuterReferenceResolverTest.java +++ b/isthmus/src/test/java/io/substrait/isthmus/OuterReferenceResolverTest.java @@ -7,6 +7,7 @@ import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rex.RexCorrelVariable; import org.apache.calcite.rex.RexFieldAccess; +import org.apache.calcite.rex.RexNode; import org.apache.calcite.sql.parser.SqlParseException; import org.apache.calcite.tools.RelBuilder; import org.apache.calcite.util.Holder; @@ -142,4 +143,63 @@ void nestedApplyJoinQuery() throws SqlParseException { validateOuterRef(fieldAccessDepthMap, "$cor0", "SS_ITEM_SK", 2); validateOuterRef(fieldAccessDepthMap, "$cor1", "I_ITEM_SK", 1); } + + /** + * Regression test for the partially-decorrelated Filter case. + * + *
When a Calcite optimizer (e.g. HepPlanner) rewrites a correlated IN-subquery into a join, it + * can produce a {@link org.apache.calcite.rel.core.Filter} whose condition contains a {@link + * RexFieldAccess} referencing a {@link RexCorrelVariable} ({@code $cor0.field}), but whose {@link + * RelNode#getVariablesSet()} is empty — because the enclosing {@link + * org.apache.calcite.rel.core.Correlate} node has already been replaced by a regular join. + * + *
Before the fix, {@code OuterReferenceResolver.visit(Filter)} only iterated {@code + * filter.getVariablesSet()}, so the {@link org.apache.calcite.rel.core.CorrelationId} was never + * registered in {@code nestedDepth}. The subsequent {@code rexVisitor.visitFieldAccess()} call + * found {@code !nestedDepth.containsKey(id)} and silently skipped the access, producing an empty + * {@code fieldAccessDepthMap} instead of the expected entry. + * + *
The fix pre-scans the Filter condition for {@link RexCorrelVariable} references and registers
+ * their IDs before delegating to the rex visitor.
+ */
+ @Test
+ void filterWithCorrelVariableButEmptyVariablesSet() throws SqlParseException {
+ // Build the partially-decorrelated pattern:
+ // JOIN (inner side has a Filter whose condition references $cor0 but variablesSet is empty)
+ //
+ // SQL equivalent of what a post-optimisation plan looks like:
+ // SELECT ss_sold_date_sk FROM store_sales
+ // JOIN item ON item.i_item_sk = store_sales.ss_item_sk ← decorrelated join
+ // WHERE item.i_item_sk = $cor0.SS_ITEM_SK ← Filter still has the correl ref
+ //
+ // We deliberately call .filter(condition) without passing the CorrelationId so that
+ // getVariablesSet() returns an empty set, reproducing the post-decorrelation shape.
+
+ final Holder