Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 46 additions & 10 deletions src/iceberg/expression/strict_metrics_evaluator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
return kRowsMightNotMatch;
}

if (CanContainNulls(id) || CanContainNaNs(id)) {
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
if (has_nulls || has_nans) {
return kRowsMightNotMatch;
}

Expand All @@ -168,7 +170,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
return kRowsMightNotMatch;
}

if (CanContainNulls(id) || CanContainNaNs(id)) {
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
if (has_nulls || has_nans) {
return kRowsMightNotMatch;
}

Expand All @@ -194,7 +198,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
return kRowsMightNotMatch;
}

if (CanContainNulls(id) || CanContainNaNs(id)) {
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
if (has_nulls || has_nans) {
return kRowsMightNotMatch;
}

Expand Down Expand Up @@ -226,7 +232,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
return kRowsMightNotMatch;
}

if (CanContainNulls(id) || CanContainNaNs(id)) {
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
if (has_nulls || has_nans) {
return kRowsMightNotMatch;
}

Expand Down Expand Up @@ -258,7 +266,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
return kRowsMightNotMatch;
}

if (CanContainNulls(id) || CanContainNaNs(id)) {
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
if (has_nulls || has_nans) {
return kRowsMightNotMatch;
}
auto lower_it = data_file_.lower_bounds.find(id);
Expand Down Expand Up @@ -330,7 +340,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
return kRowsMightNotMatch;
}

if (CanContainNulls(id) || CanContainNaNs(id)) {
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
if (has_nulls || has_nans) {
return kRowsMightNotMatch;
}
auto lower_it = data_file_.lower_bounds.find(id);
Expand Down Expand Up @@ -435,19 +447,43 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
return Literal::Deserialize(stats, primitive_type);
}

bool CanContainNulls(int32_t id) {
Result<bool> CanContainNulls(int32_t id) {
ICEBERG_ASSIGN_OR_RAISE(auto field_opt, schema_.FindFieldById(id));
if (field_opt.has_value() && !field_opt->get().optional()) {
return false;
}

// When null_value_counts is not empty, the evaluator expects all fields to be
// present. A missing entry indicates that the field's null count is unknown.
if (data_file_.null_value_counts.empty()) {
return true;
}
auto it = data_file_.null_value_counts.find(id);
return it != data_file_.null_value_counts.cend() && it->second > 0;
if (it == data_file_.null_value_counts.cend()) {
return true;
}
return it->second > 0;
}

bool CanContainNaNs(int32_t id) {
Result<bool> CanContainNaNs(int32_t id) {
ICEBERG_ASSIGN_OR_RAISE(auto field_opt, schema_.FindFieldById(id));
if (field_opt.has_value()) {
auto type_id = field_opt->get().type()->type_id();
if (type_id != TypeId::kFloat && type_id != TypeId::kDouble) {
return false;
}
}

// nan counts might be null for early version writers when nan counters are not
Comment thread
sentomk marked this conversation as resolved.
// populated.
if (data_file_.nan_value_counts.empty()) {
return true;
}
auto it = data_file_.nan_value_counts.find(id);
return it != data_file_.nan_value_counts.cend() && it->second > 0;
if (it == data_file_.nan_value_counts.cend()) {
return true;
}
return it->second > 0;
}

bool ContainsNullsOnly(int32_t id) {
Expand Down
63 changes: 63 additions & 0 deletions src/iceberg/test/strict_metrics_evaluator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -846,4 +846,67 @@ TEST_F(StrictMetricsEvaluatorMigratedTest, EvaluateOnNestedColumnWithStats) {
ExpectShouldRead(Expressions::NotNull("struct.nested_col_with_stats"), false);
}

TEST(StrictMetricsEvaluatorRegressionTest, MissingNullCountForField) {
// Field 14 (no_nan_stats, float64, optional) has bounds and value_counts but is
// missing from null_value_counts. The evaluator must conservatively assume nulls
// may exist and return kRowsMightNotMatch for comparison operators.
auto schema = std::make_shared<Schema>(
std::vector<SchemaField>{
SchemaField::MakeOptional(14, "no_nan_stats", float64()),
},
/*schema_id=*/0);

auto data_file = std::make_shared<DataFile>();
data_file->file_path = "null_test.parquet";
data_file->file_format = FileFormatType::kParquet;
data_file->record_count = 50;
data_file->value_counts = {{14, 50L}};
data_file->null_value_counts = {{4, 0L}, {5, 0L}};
data_file->nan_value_counts = {{14, 0L}};
data_file->lower_bounds = {{14, Literal::Double(1.0).Serialize().value()}};
data_file->upper_bounds = {{14, Literal::Double(100.0).Serialize().value()}};

auto evaluate = [&](const std::shared_ptr<Expression>& expr) {
ICEBERG_UNWRAP_OR_FAIL(auto eval, StrictMetricsEvaluator::Make(expr, schema, true));
ICEBERG_UNWRAP_OR_FAIL(auto result, eval->Evaluate(*data_file));
EXPECT_EQ(result, kRowsMightNotMatch) << expr->ToString();
};

evaluate(Expressions::LessThan("no_nan_stats", Literal::Double(200.0)));
evaluate(Expressions::LessThanOrEqual("no_nan_stats", Literal::Double(200.0)));
evaluate(Expressions::GreaterThan("no_nan_stats", Literal::Double(-1.0)));
evaluate(Expressions::GreaterThanOrEqual("no_nan_stats", Literal::Double(-1.0)));
evaluate(Expressions::Equal("no_nan_stats", Literal::Double(50.0)));
}

TEST(StrictMetricsEvaluatorRegressionTest, MissingNanCountForField) {
// Field 14 (no_nan_stats, float64, optional) is missing from nan_value_counts.
// For a floating-point field, the evaluator must conservatively assume NaNs may
// exist and return kRowsMightNotMatch for comparison operators.
auto schema = std::make_shared<Schema>(
std::vector<SchemaField>{
SchemaField::MakeOptional(14, "no_nan_stats", float64()),
},
/*schema_id=*/0);

auto data_file = std::make_shared<DataFile>();
data_file->file_path = "nan_test.parquet";
data_file->file_format = FileFormatType::kParquet;
data_file->record_count = 50;
data_file->value_counts = {{14, 50L}};
data_file->null_value_counts = {{14, 0L}};
data_file->nan_value_counts = {{8, 0L}};
data_file->lower_bounds = {{14, Literal::Double(1.0).Serialize().value()}};
data_file->upper_bounds = {{14, Literal::Double(100.0).Serialize().value()}};

auto evaluate = [&](const std::shared_ptr<Expression>& expr) {
ICEBERG_UNWRAP_OR_FAIL(auto eval, StrictMetricsEvaluator::Make(expr, schema, true));
ICEBERG_UNWRAP_OR_FAIL(auto result, eval->Evaluate(*data_file));
EXPECT_EQ(result, kRowsMightNotMatch) << expr->ToString();
};

evaluate(Expressions::LessThan("no_nan_stats", Literal::Double(200.0)));
evaluate(Expressions::GreaterThan("no_nan_stats", Literal::Double(-1.0)));
}

} // namespace iceberg
Loading