apache · konstantinb · May 21, 2026 · May 22, 2026 · May 25, 2026
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
@@ -187,7 +187,8 @@ public static Object processReduceSinkToHashJoin(ReduceSinkOperator parentRS, Ma
           ExprNodeDesc realCol = parentRS.getColumnExprMap().get(prefix + "." + keyCol);
           ColStatistics cs =
               StatsUtils.getColStatisticsFromExpression(context.conf, stats, realCol);
-          if (cs == null || cs.getCountDistint() <= 0) {
+          if (cs == null || cs.getCountDistint() < 0) {
+            // unknown: same fallback as old "no stats / overloaded NDV=0" path
             maxKeyCount = Long.MAX_VALUE;
             break;
           }

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetHashGroupByMinReduction.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetHashGroupByMinReduction.java
@@ -69,8 +69,8 @@ public Object process(Node nd, Stack<Node> stack,
     Statistics parentStats = groupByOperator.getParentOperators().get(0).getStatistics();
     long ndvProduct = StatsUtils.computeNDVGroupingColumns(
         colStats, parentStats, true);
-    // if ndvProduct is 0 then column stats state must be partial and we are missing
-    if (ndvProduct == 0) {
+    if (ndvProduct < 0) {
+      // unknown product - same fallback as old "overloaded NDV=0" path
       return null;
     }
 

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java
@@ -90,6 +90,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
@@ -924,7 +925,8 @@ private long computeMaxWriters() {
      * Computes the partition cardinality based on column NDV statistics.
      * @return positive value = estimated cardinality, 0 = no partition columns, -1 = stats unavailable
      */
-    private long computePartCardinality(List<Integer> partitionPos,
+    @VisibleForTesting
+    long computePartCardinality(List<Integer> partitionPos,
         List<Function<List<ExprNodeDesc>, ExprNodeDesc>> customPartitionExprs,
         Statistics tStats, Operator<? extends OperatorDesc> fsParent,
         ArrayList<ExprNodeDesc> allRSCols) {
@@ -935,7 +937,8 @@ private long computePartCardinality(List<Integer> partitionPos,
         for (Integer idx : partitionPos) {
           ColumnInfo ci = fsParent.getSchema().getSignature().get(idx);
           ColStatistics partStats = tStats.getColumnStatisticsFromColName(ci.getInternalName());
-          if (partStats == null) {
+          // countDistinct < 0 means "unknown" - same path as missing stats
+          if (partStats == null || partStats.getCountDistint() < 0) {
             return -1;
           }
           partCardinality *= partStats.getCountDistint();
@@ -950,7 +953,8 @@ private long computePartCardinality(List<Integer> partitionPos,
           // implementations on UDFs (e.g. iceberg_bucket reports min(inputNDV, numBuckets))
           ColStatistics exprStats = StatsUtils.getColStatisticsFromExpression(
               this.parseCtx.getConf(), tStats, resolved);
-          if (exprStats == null) {
+          // countDistinct < 0 means "unknown" - same path as missing stats
+          if (exprStats == null || exprStats.getCountDistint() < 0) {
             return -1;
           }
           partCardinality *= exprStats.getCountDistint();

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java
@@ -22,6 +22,7 @@
 import org.apache.calcite.rel.convert.Converter;
 import org.apache.calcite.rel.core.JoinRelType;
 import org.apache.calcite.rel.core.Spool;
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider;
 import org.apache.calcite.rel.metadata.RelMdDistinctRowCount;
 import org.apache.calcite.rel.metadata.RelMdUtil;
@@ -50,7 +51,8 @@ public class HiveRelMdDistinctRowCount extends RelMdDistinctRowCount {
       ReflectiveRelMetadataProvider.reflectiveSource(
           BuiltInMethod.DISTINCT_ROW_COUNT.method, new HiveRelMdDistinctRowCount());
 
-  private HiveRelMdDistinctRowCount() {
+  @VisibleForTesting
+  HiveRelMdDistinctRowCount() {
   }
 
   public Double getDistinctRowCount(HiveTableScan htRel, RelMetadataQuery mq, ImmutableBitSet groupKey,
@@ -60,6 +62,9 @@ public Double getDistinctRowCount(HiveTableScan htRel, RelMetadataQuery mq, Immu
     List<ColStatistics> colStats = htRel.getColStat(projIndxLst);
     Double noDistinctRows = 1.0;
     for (ColStatistics cStat : colStats) {
+      if (cStat.getCountDistint() <= 0) {
+        return 0.0;
+      }
       noDistinctRows *= cStat.getCountDistint();
     }
 

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -557,11 +557,18 @@
         factor *= children.size() - 1;
       }
       for (int i = 0; i < columnStats.size(); i++) {
-        long dvs = columnStats.get(i) == null ? 0 : columnStats.get(i).getCountDistint();
-        if (dvs == 0) {
+        ColStatistics cs = columnStats.get(i);
+        long dvs = cs == null ? -1L : cs.getCountDistint();
+        if (dvs < 0) {
+          // missing stats or unknown NDV
           factor *= 0.5;
           continue;
         }
+        if (dvs == 0) {
+          // verified zero distinct values: IN cannot match any row
+          factor = 0;
+          break;
+        }
         // (num of distinct vals for col in IN clause  / num of distinct vals for col )
         double columnFactor = 1.0 / dvs;
         if (!multiColumn) {
@@ -1317,9 +1324,7 @@
 
               ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
               if (cs != null) {
-                long dvs = cs.getCountDistint();
-                numRows = dvs == 0 ? numRows / 2 : Math.round((double) numRows / dvs);
-                return numRows;
+                return rowsAfterEqualityFilter(numRows, cs.getCountDistint());
               }
             } else if (leaf instanceof ExprNodeColumnDesc) {
               ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) leaf;
@@ -1338,9 +1343,7 @@
 
                 ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
                 if (cs != null) {
-                  long dvs = cs.getCountDistint();
-                  numRows = dvs == 0 ? numRows / 2 : Math.round((double) numRows / dvs);
-                  return numRows;
+                  return rowsAfterEqualityFilter(numRows, cs.getCountDistint());
                 }
               }
             }
@@ -1380,6 +1383,16 @@
       return numRows / 2;
     }
 
+    private static long rowsAfterEqualityFilter(long numRows, long dvs) {
+      if (dvs < 0) {
+        return numRows / 2;
+      }
+      if (dvs == 0) {
+        return 0;
+      }
+      return Math.round((double) numRows / dvs);
+    }
+
   }
 
   /**
@@ -1518,14 +1531,13 @@
         // compute product of distinct values of grouping columns
         long ndvProduct =
             StatsUtils.computeNDVGroupingColumns(colStats, parentStats, false);
-        // if ndvProduct is 0 then column stats state must be partial and we are missing
-        // column stats for a group by column
-        if (ndvProduct == 0) {
+        if (ndvProduct < 0) {
+          // unknown - missing column stats or unknown NDV on a grouping column
           ndvProduct = parentNumRows / 2;
 
           if (LOG.isDebugEnabled()) {
-            LOG.debug("STATS-" + gop.toString() + ": ndvProduct became 0 as some column does not" +
-                " have stats. ndvProduct changed to: " + ndvProduct);
+            LOG.debug("STATS-" + gop.toString() + ": ndvProduct unknown; falling back to "
+                + ndvProduct);
           }
         }
         final long maxColumnNDV = colStats.stream()
@@ -1720,6 +1732,10 @@
           long valuesCount = agg.getDistinct() ?
               parentCS.getCountDistint() :
               parentStats.getNumRows() - numNulls;
+          // countDistinct < 0 would produce a Range with a negative maxValue
+          if (agg.getDistinct() && valuesCount < 0) {
+            return;
+          }
           Range range = parentCS.getRange();
           // Get the aggregate function matching the name in the query.
           GenericUDAFResolver udaf =
@@ -1819,9 +1835,24 @@
 
         // estimate size of key from column statistics
         long avgKeySize = 0;
+        // lazily computed on first unknown NDV (null = not yet looked up)
+        Long parentNumRows = null;
         for (ColStatistics cs : colStats) {
           if (cs != null) {
-            numEstimatedRows = StatsUtils.safeMult(numEstimatedRows, cs.getCountDistint());
+            long ndv = cs.getCountDistint();
+            if (ndv < 0) {
+              if (parentNumRows == null) {
+                // unknown NDV: fall back to parentNumRows / 2, matching the heuristic
+                // used elsewhere in this file when GROUP BY cardinality cannot be computed
+                Statistics parentStats = gop.getParentOperators().get(0).getStatistics();
+                parentNumRows = (parentStats != null) ? parentStats.getNumRows() : -1L;
+              }
+              if (parentNumRows <= 0) {
+                return false;
+              }
+              ndv = parentNumRows / 2;
+            }
+            numEstimatedRows = StatsUtils.safeMult(numEstimatedRows, ndv);
             avgKeySize += Math.ceil(cs.getAvgColLen());
           }
         }
@@ -2227,7 +2258,8 @@
       return null;
     }
 
-    private long calculateUnmatchedRowsForOuter(HiveConf conf, long inputRowCount,
+    @VisibleForTesting
+    long calculateUnmatchedRowsForOuter(HiveConf conf, long inputRowCount,
         List<String> joinKeys, Statistics statistics, long distinctUnmatched) {
       // Extract the ndv from each of the columns involved in the join
       List<Long> distinctVals = new ArrayList<>();
@@ -2248,14 +2280,15 @@
           distinctVal = StatsUtils.addWithExpDecay(distinctVals);
         }
       }
-      // If we have a greater number of unmatched values than number of distinct values,
-      // we just return the number of rows in the input as we can assume there are no
-      // matches
-      if (distinctUnmatched >= distinctVal) {
+      // distinctVal <= 0 covers unknown (<0) and verified-zero (==0) cases; the latter means
+      // no key value matches anything, so every input row is unmatched in an outer join.
+      // distinctUnmatched < 0 (unknown) is treated conservatively the same way.
+      // If unmatched >= distinctVal, all rows can be assumed unmatched.
+      if (distinctVal <= 0 || distinctUnmatched < 0 || distinctUnmatched >= distinctVal) {
         return inputRowCount;
       }
       // Otherwise, divide the number of input rows by the number of distinct values
-      // and divide by the number of distinct values unmatched
+      // and multiply by the number of distinct values unmatched
       return StatsUtils.safeMult(inputRowCount / distinctVal, distinctUnmatched);
     }
 
@@ -2604,7 +2637,8 @@
       colStats.setNumNulls(newNumNulls);
     }
 
-    private void updateColStats(HiveConf conf, Statistics stats, long leftUnmatchedRows, long rightUnmatchedRows,
+    @VisibleForTesting
+    void updateColStats(HiveConf conf, Statistics stats, long leftUnmatchedRows, long rightUnmatchedRows,
         long newNumRows, CommonJoinOperator<? extends JoinDesc> jop, Map<Integer, Long> rowCountParents) {
 
       if (newNumRows < 0) {
@@ -2632,26 +2666,29 @@
         int pos = jop.getConf().getReversedExprs().get(cs.getColumnName());
         long oldDV = cs.getCountDistint();
 
-        boolean useCalciteForNdvReadjustment
-            = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_JOIN_NDV_READJUSTMENT);
-        long newDV = oldDV;
-        if (useCalciteForNdvReadjustment) {
-          Double approxNdv = RelMdUtil.numDistinctVals(oldDV * 1.0, newNumRows * 1.0);
-          Preconditions.checkNotNull(approxNdv, "approximate NDV is null");
-          newDV = approxNdv.longValue();
-        } else {
-          long oldRowCount = rowCountParents.get(pos);
-          double ratio = (double) newNumRows / (double) oldRowCount;
-
-          // if ratio is greater than 1, then number of rows increases. This can happen
-          // when some operators like GROUPBY duplicates the input rows in which case
-          // number of distincts should not change. Update the distinct count only when
-          // the output number of rows is less than input number of rows.
-          if (ratio <= 1.0) {
-            newDV = (long) Math.ceil(ratio * oldDV);
+        // countDistinct < 0 means "unknown"
+        if (oldDV >= 0) {
+          boolean useCalciteForNdvReadjustment
+              = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_JOIN_NDV_READJUSTMENT);
+          long newDV = oldDV;
+          if (useCalciteForNdvReadjustment) {
+            Double approxNdv = RelMdUtil.numDistinctVals(oldDV * 1.0, newNumRows * 1.0);
+            Preconditions.checkNotNull(approxNdv, "approximate NDV is null");
+            newDV = approxNdv.longValue();
+          } else {
+            long oldRowCount = rowCountParents.get(pos);
+            double ratio = (double) newNumRows / (double) oldRowCount;
+
+            // if ratio is greater than 1, then number of rows increases. This can happen
+            // when some operators like GROUPBY duplicates the input rows in which case
+            // number of distincts should not change. Update the distinct count only when
+            // the output number of rows is less than input number of rows.
+            if (ratio < 1.0) {
+              newDV = (long) Math.ceil(ratio * oldDV);
+            }
           }
+          cs.setCountDistint(newDV);
         }
-        cs.setCountDistint(newDV);
         updateNumNulls(cs, leftUnmatchedRows, rightUnmatchedRows, newNumRows, pos, jop);
       }
       stats.setColumnStats(colStats);
@@ -2718,7 +2755,8 @@
       return result;
     }
 
-    private long computeRowCountAssumingInnerJoin(List<Long> rowCountParents, long denom,
+    @VisibleForTesting
+    long computeRowCountAssumingInnerJoin(List<Long> rowCountParents, long denom,
         CommonJoinOperator<? extends JoinDesc> join) {
       double factor = 0.0d;
       long result = 1;
@@ -2734,7 +2772,12 @@
         }
       }
 
-      denom = denom == 0 ? 1 : denom;
+      // denom < 0 (unknown) and denom == 0 (verified-zero join key, cardinality formula
+      // degenerate) both fall back to "no constraint" rather than producing a negative
+      // factor or div-by-zero
+      if (denom <= 0) {
+        denom = 1;
+      }
       factor = (double) max / (double) denom;
 
       for (int i = 0; i < rowCountParents.size(); i++) {
@@ -2786,6 +2829,9 @@
       if (distinctVals.isEmpty()) {
         return 2;
       }
+      if (StatsUtils.containsUnknownNDV(distinctVals)) {
+        return -1L;
+      }
 
       // simple join from 2 relations: denom = min(v1, v2)
       if (distinctVals.size() <= 2) {
@@ -2826,6 +2872,9 @@
         // denominator is 2.
         return 2;
       }
+      if (StatsUtils.containsUnknownNDV(distinctVals)) {
+        return -1L;
+      }
 
       // simple join from 2 relations: denom = max(v1, v2)
       if (distinctVals.size() <= 2) {

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
@@ -239,7 +239,7 @@
    addToColumnStats(colStats);
  }

  public void addToColumnStats(List<ColStatistics> colStats) {
    if (colStats == null) {
      return;
    }
@@ -260,7 +260,11 @@
         } else {
           existing.setNumNulls(StatsUtils.safeAdd(existing.getNumNulls(), cs.getNumNulls()));
         }
-        existing.setCountDistint(Math.max(existing.getCountDistint(), cs.getCountDistint()));
+        if (cs.getCountDistint() < 0 || existing.getCountDistint() < 0) {
+          existing.setCountDistint(-1);
+        } else {
+          existing.setCountDistint(Math.max(existing.getCountDistint(), cs.getCountDistint()));
+        }
       }
     }
   }