diff --git a/src/iceberg/expression/strict_metrics_evaluator.cc b/src/iceberg/expression/strict_metrics_evaluator.cc index e2fe34f14..a9b5ded46 100644 --- a/src/iceberg/expression/strict_metrics_evaluator.cc +++ b/src/iceberg/expression/strict_metrics_evaluator.cc @@ -142,7 +142,9 @@ class StrictMetricsVisitor : public BoundVisitor { return kRowsMightNotMatch; } - if (CanContainNulls(id) || CanContainNaNs(id)) { + ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id)); + ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id)); + if (has_nulls || has_nans) { return kRowsMightNotMatch; } @@ -168,7 +170,9 @@ class StrictMetricsVisitor : public BoundVisitor { return kRowsMightNotMatch; } - if (CanContainNulls(id) || CanContainNaNs(id)) { + ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id)); + ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id)); + if (has_nulls || has_nans) { return kRowsMightNotMatch; } @@ -194,7 +198,9 @@ class StrictMetricsVisitor : public BoundVisitor { return kRowsMightNotMatch; } - if (CanContainNulls(id) || CanContainNaNs(id)) { + ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id)); + ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id)); + if (has_nulls || has_nans) { return kRowsMightNotMatch; } @@ -226,7 +232,9 @@ class StrictMetricsVisitor : public BoundVisitor { return kRowsMightNotMatch; } - if (CanContainNulls(id) || CanContainNaNs(id)) { + ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id)); + ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id)); + if (has_nulls || has_nans) { return kRowsMightNotMatch; } @@ -258,7 +266,9 @@ class StrictMetricsVisitor : public BoundVisitor { return kRowsMightNotMatch; } - if (CanContainNulls(id) || CanContainNaNs(id)) { + ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id)); + ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id)); + if (has_nulls || has_nans) { return kRowsMightNotMatch; } auto lower_it = data_file_.lower_bounds.find(id); @@ -330,7 +340,9 @@ class StrictMetricsVisitor : public BoundVisitor { return kRowsMightNotMatch; } - if (CanContainNulls(id) || CanContainNaNs(id)) { + ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id)); + ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id)); + if (has_nulls || has_nans) { return kRowsMightNotMatch; } auto lower_it = data_file_.lower_bounds.find(id); @@ -435,19 +447,43 @@ class StrictMetricsVisitor : public BoundVisitor { return Literal::Deserialize(stats, primitive_type); } - bool CanContainNulls(int32_t id) { + Result CanContainNulls(int32_t id) { + ICEBERG_ASSIGN_OR_RAISE(auto field_opt, schema_.FindFieldById(id)); + if (field_opt.has_value() && !field_opt->get().optional()) { + return false; + } + + // When null_value_counts is not empty, the evaluator expects all fields to be + // present. A missing entry indicates that the field's null count is unknown. if (data_file_.null_value_counts.empty()) { return true; } auto it = data_file_.null_value_counts.find(id); - return it != data_file_.null_value_counts.cend() && it->second > 0; + if (it == data_file_.null_value_counts.cend()) { + return true; + } + return it->second > 0; } - bool CanContainNaNs(int32_t id) { + Result CanContainNaNs(int32_t id) { + ICEBERG_ASSIGN_OR_RAISE(auto field_opt, schema_.FindFieldById(id)); + if (field_opt.has_value()) { + auto type_id = field_opt->get().type()->type_id(); + if (type_id != TypeId::kFloat && type_id != TypeId::kDouble) { + return false; + } + } + // nan counts might be null for early version writers when nan counters are not // populated. + if (data_file_.nan_value_counts.empty()) { + return true; + } auto it = data_file_.nan_value_counts.find(id); - return it != data_file_.nan_value_counts.cend() && it->second > 0; + if (it == data_file_.nan_value_counts.cend()) { + return true; + } + return it->second > 0; } bool ContainsNullsOnly(int32_t id) { diff --git a/src/iceberg/test/strict_metrics_evaluator_test.cc b/src/iceberg/test/strict_metrics_evaluator_test.cc index fa6185c3b..fcb997929 100644 --- a/src/iceberg/test/strict_metrics_evaluator_test.cc +++ b/src/iceberg/test/strict_metrics_evaluator_test.cc @@ -846,4 +846,67 @@ TEST_F(StrictMetricsEvaluatorMigratedTest, EvaluateOnNestedColumnWithStats) { ExpectShouldRead(Expressions::NotNull("struct.nested_col_with_stats"), false); } +TEST(StrictMetricsEvaluatorRegressionTest, MissingNullCountForField) { + // Field 14 (no_nan_stats, float64, optional) has bounds and value_counts but is + // missing from null_value_counts. The evaluator must conservatively assume nulls + // may exist and return kRowsMightNotMatch for comparison operators. + auto schema = std::make_shared( + std::vector{ + SchemaField::MakeOptional(14, "no_nan_stats", float64()), + }, + /*schema_id=*/0); + + auto data_file = std::make_shared(); + data_file->file_path = "null_test.parquet"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = {{14, 50L}}; + data_file->null_value_counts = {{4, 0L}, {5, 0L}}; + data_file->nan_value_counts = {{14, 0L}}; + data_file->lower_bounds = {{14, Literal::Double(1.0).Serialize().value()}}; + data_file->upper_bounds = {{14, Literal::Double(100.0).Serialize().value()}}; + + auto evaluate = [&](const std::shared_ptr& expr) { + ICEBERG_UNWRAP_OR_FAIL(auto eval, StrictMetricsEvaluator::Make(expr, schema, true)); + ICEBERG_UNWRAP_OR_FAIL(auto result, eval->Evaluate(*data_file)); + EXPECT_EQ(result, kRowsMightNotMatch) << expr->ToString(); + }; + + evaluate(Expressions::LessThan("no_nan_stats", Literal::Double(200.0))); + evaluate(Expressions::LessThanOrEqual("no_nan_stats", Literal::Double(200.0))); + evaluate(Expressions::GreaterThan("no_nan_stats", Literal::Double(-1.0))); + evaluate(Expressions::GreaterThanOrEqual("no_nan_stats", Literal::Double(-1.0))); + evaluate(Expressions::Equal("no_nan_stats", Literal::Double(50.0))); +} + +TEST(StrictMetricsEvaluatorRegressionTest, MissingNanCountForField) { + // Field 14 (no_nan_stats, float64, optional) is missing from nan_value_counts. + // For a floating-point field, the evaluator must conservatively assume NaNs may + // exist and return kRowsMightNotMatch for comparison operators. + auto schema = std::make_shared( + std::vector{ + SchemaField::MakeOptional(14, "no_nan_stats", float64()), + }, + /*schema_id=*/0); + + auto data_file = std::make_shared(); + data_file->file_path = "nan_test.parquet"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = {{14, 50L}}; + data_file->null_value_counts = {{14, 0L}}; + data_file->nan_value_counts = {{8, 0L}}; + data_file->lower_bounds = {{14, Literal::Double(1.0).Serialize().value()}}; + data_file->upper_bounds = {{14, Literal::Double(100.0).Serialize().value()}}; + + auto evaluate = [&](const std::shared_ptr& expr) { + ICEBERG_UNWRAP_OR_FAIL(auto eval, StrictMetricsEvaluator::Make(expr, schema, true)); + ICEBERG_UNWRAP_OR_FAIL(auto result, eval->Evaluate(*data_file)); + EXPECT_EQ(result, kRowsMightNotMatch) << expr->ToString(); + }; + + evaluate(Expressions::LessThan("no_nan_stats", Literal::Double(200.0))); + evaluate(Expressions::GreaterThan("no_nan_stats", Literal::Double(-1.0))); +} + } // namespace iceberg