Skip to content

Commit 6e26e49

Browse files
committed
refactor(parquet-statistics): use optional instead of has_* flags
1 parent dbbf7cf commit 6e26e49

17 files changed

Lines changed: 440 additions & 289 deletions

cpp/src/arrow/dataset/file_parquet.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,7 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
370370
const parquet::Statistics& statistics) {
371371
auto field_expr = compute::field_ref(field_ref);
372372

373-
bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0;
373+
bool may_have_null = !statistics.HasNullCount() || statistics.NullCount().value() > 0;
374374
// Optimize for corner case where all values are nulls
375375
if (statistics.num_values() == 0) {
376376
// If there are no non-null values, column `field_ref` in the fragment

cpp/src/arrow/dataset/file_parquet_test.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -941,9 +941,8 @@ TEST(TestParquetStatistics, NoNullCount) {
941941
::parquet::EncodedStatistics encoded_stats;
942942
encoded_stats.set_min(int32_to_parquet_stats(1));
943943
encoded_stats.set_max(int32_to_parquet_stats(100));
944-
encoded_stats.has_null_count = false;
945944
encoded_stats.all_null_value = false;
946-
encoded_stats.null_count = 0;
945+
encoded_stats.null_count = std::nullopt;
947946
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10);
948947

949948
auto stat_expression =
@@ -956,7 +955,6 @@ TEST(TestParquetStatistics, NoNullCount) {
956955
// Special case: when num_value is 0, it would return
957956
// "is_null".
958957
::parquet::EncodedStatistics encoded_stats;
959-
encoded_stats.has_null_count = true;
960958
encoded_stats.null_count = 1;
961959
encoded_stats.all_null_value = true;
962960
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
@@ -965,7 +963,7 @@ TEST(TestParquetStatistics, NoNullCount) {
965963
ASSERT_TRUE(stat_expression.has_value());
966964
EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})");
967965

968-
encoded_stats.has_null_count = false;
966+
encoded_stats.null_count = std::nullopt;
969967
encoded_stats.all_null_value = false;
970968
stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
971969
stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4735,8 +4735,8 @@ TEST_P(TestArrowWriteDictionary, Statistics) {
47354735

47364736
auto expect_has_min_max =
47374737
expected_has_min_max_by_page[case_index][row_group_index][page_index];
4738-
EXPECT_EQ(stats.has_min, expect_has_min_max);
4739-
EXPECT_EQ(stats.has_max, expect_has_min_max);
4738+
EXPECT_EQ(stats.HasMin(), expect_has_min_max);
4739+
EXPECT_EQ(stats.HasMax(), expect_has_min_max);
47404740
if (expect_has_min_max) {
47414741
EXPECT_EQ(stats.min(),
47424742
expected_min_by_page[case_index][row_group_index][page_index]);

cpp/src/parquet/arrow/arrow_statistics_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) {
8989
auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
9090
std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
9191
std::shared_ptr<Statistics> stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
92-
EXPECT_EQ(stats->null_count(), GetParam().expected_null_count);
92+
EXPECT_EQ(stats->NullCount(), GetParam().expected_null_count);
9393
EXPECT_EQ(stats->num_values(), GetParam().expected_value_count);
9494
ASSERT_TRUE(stats->HasMinMax());
9595
EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min);

cpp/src/parquet/arrow/reader_internal.cc

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,8 @@ template <typename CType, typename StatisticsType>
169169
Status MakeMinMaxScalar(const StatisticsType& statistics,
170170
std::shared_ptr<::arrow::Scalar>* min,
171171
std::shared_ptr<::arrow::Scalar>* max) {
172-
*min = ::arrow::MakeScalar(static_cast<CType>(statistics.min()));
173-
*max = ::arrow::MakeScalar(static_cast<CType>(statistics.max()));
172+
*min = ::arrow::MakeScalar(static_cast<CType>(statistics.Min().value()));
173+
*max = ::arrow::MakeScalar(static_cast<CType>(statistics.Max().value()));
174174
return Status::OK();
175175
}
176176

@@ -179,8 +179,8 @@ Status MakeMinMaxTypedScalar(const StatisticsType& statistics,
179179
std::shared_ptr<DataType> type,
180180
std::shared_ptr<::arrow::Scalar>* min,
181181
std::shared_ptr<::arrow::Scalar>* max) {
182-
ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min()));
183-
ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max()));
182+
ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.Min().value()));
183+
ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.Max().value()));
184184
return Status::OK();
185185
}
186186

@@ -227,8 +227,8 @@ static Status FromInt32Statistics(const Int32Statistics& statistics,
227227
case LogicalType::Type::NONE:
228228
return MakeMinMaxTypedScalar<int32_t>(statistics, type, min, max);
229229
case LogicalType::Type::DECIMAL:
230-
return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(),
231-
logical_type, min, max);
230+
return ExtractDecimalMinMaxFromInteger(
231+
statistics.Min().value(), statistics.Max().value(), logical_type, min, max);
232232
default:
233233
break;
234234
}
@@ -252,8 +252,8 @@ static Status FromInt64Statistics(const Int64Statistics& statistics,
252252
case LogicalType::Type::NONE:
253253
return MakeMinMaxTypedScalar<int64_t>(statistics, type, min, max);
254254
case LogicalType::Type::DECIMAL:
255-
return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(),
256-
logical_type, min, max);
255+
return ExtractDecimalMinMaxFromInteger(
256+
statistics.Min().value(), statistics.Max().value(), logical_type, min, max);
257257
default:
258258
break;
259259
}
@@ -384,13 +384,13 @@ void AttachStatistics(::arrow::ArrayData* data,
384384
}
385385
if (statistics) {
386386
if (statistics->HasDistinctCount()) {
387-
array_statistics->distinct_count = statistics->distinct_count();
387+
array_statistics->distinct_count = statistics->DistinctCount().value();
388388
}
389389
if (statistics->HasMinMax()) {
390390
const auto* typed_statistics =
391391
checked_cast<const ::parquet::TypedStatistics<ParquetType>*>(statistics.get());
392-
const ArrowCType min = typed_statistics->min();
393-
const ArrowCType max = typed_statistics->max();
392+
const ArrowCType min = typed_statistics->Min().value();
393+
const ArrowCType max = typed_statistics->Max().value();
394394
if constexpr (std::is_same_v<ArrowCType, bool>) {
395395
array_statistics->min = static_cast<bool>(min);
396396
array_statistics->max = static_cast<bool>(max);

cpp/src/parquet/column_writer.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1084,7 +1084,7 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size,
10841084

10851085
// page_stats.null_count is not set when page_statistics_ is nullptr. It is only used
10861086
// here for safety check.
1087-
DCHECK(!page_stats.has_null_count || page_stats.null_count == null_count);
1087+
DCHECK(!page_stats.HasNullCount() || page_stats.null_count == null_count);
10881088

10891089
// Write the page to OutputStream eagerly if there is no dictionary or
10901090
// if dictionary encoding has fallen back to PLAIN

cpp/src/parquet/column_writer_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
375375
auto metadata_accessor = ColumnChunkMetaData::Make(
376376
metadata_->contents(), this->descr_, default_reader_properties(), &app_version);
377377
auto encoded_stats = metadata_accessor->statistics()->Encode();
378-
return {encoded_stats.has_min, encoded_stats.has_max};
378+
return {encoded_stats.HasMin(), encoded_stats.HasMax()};
379379
}
380380

381381
std::vector<Encoding::type> metadata_encodings() {

cpp/src/parquet/file_deserialize_test.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,10 @@ static inline void AddDummyStats(int stat_size, H& header, bool fill_all_stats =
6969
template <typename H>
7070
static inline void CheckStatistics(const H& expected, const EncodedStatistics& actual) {
7171
if (expected.statistics.__isset.max) {
72-
ASSERT_EQ(expected.statistics.max, actual.max());
72+
ASSERT_EQ(expected.statistics.max, actual.Max());
7373
}
7474
if (expected.statistics.__isset.min) {
75-
ASSERT_EQ(expected.statistics.min, actual.min());
75+
ASSERT_EQ(expected.statistics.min, actual.Min());
7676
}
7777
if (expected.statistics.__isset.null_count) {
7878
ASSERT_EQ(expected.statistics.null_count, actual.null_count);
@@ -513,8 +513,8 @@ TYPED_TEST(PageFilterTest, TestPageFilterCallback) {
513513
CheckDataPageHeader(this->data_page_headers_[i], current_page.get()));
514514
auto data_page = static_cast<const DataPage*>(current_page.get());
515515
const EncodedStatistics encoded_statistics = data_page->statistics();
516-
ASSERT_EQ(read_stats[i].max(), encoded_statistics.max());
517-
ASSERT_EQ(read_stats[i].min(), encoded_statistics.min());
516+
ASSERT_EQ(read_stats[i].Max(), encoded_statistics.Max());
517+
ASSERT_EQ(read_stats[i].Min(), encoded_statistics.Min());
518518
ASSERT_EQ(read_stats[i].null_count, encoded_statistics.null_count);
519519
ASSERT_EQ(read_stats[i].distinct_count, encoded_statistics.distinct_count);
520520
ASSERT_EQ(read_num_values[i], this->data_page_headers_[i].num_values);

cpp/src/parquet/metadata.cc

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -101,24 +101,40 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
101101
metadata.statistics.__isset.is_max_value_exact
102102
? std::optional<bool>(metadata.statistics.is_max_value_exact)
103103
: std::nullopt;
104+
std::optional<int64_t> null_count =
105+
metadata.statistics.__isset.null_count
106+
? std::optional<int64_t>(metadata.statistics.null_count)
107+
: std::nullopt;
108+
std::optional<int64_t> distinct_count =
109+
metadata.statistics.__isset.distinct_count
110+
? std::optional<int64_t>(metadata.statistics.distinct_count)
111+
: std::nullopt;
112+
std::optional<std::string_view> min_val =
113+
metadata.statistics.__isset.min
114+
? std::optional<std::string_view>(metadata.statistics.min)
115+
: std::nullopt;
116+
std::optional<std::string_view> max_val =
117+
metadata.statistics.__isset.max
118+
? std::optional<std::string_view>(metadata.statistics.max)
119+
: std::nullopt;
104120
// If ColumnOrder is defined, return max_value and min_value
105121
if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
106-
return MakeStatistics<DType>(
107-
descr, metadata.statistics.min_value, metadata.statistics.max_value,
108-
metadata.num_values - metadata.statistics.null_count,
109-
metadata.statistics.null_count, metadata.statistics.distinct_count,
110-
metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value,
111-
metadata.statistics.__isset.null_count,
112-
metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool);
122+
std::optional<std::string_view> min_value =
123+
metadata.statistics.__isset.min_value
124+
? std::optional<std::string_view>(metadata.statistics.min_value)
125+
: std::nullopt;
126+
std::optional<std::string_view> max_value =
127+
metadata.statistics.__isset.max_value
128+
? std::optional<std::string_view>(metadata.statistics.max_value)
129+
: std::nullopt;
130+
return MakeStatistics<DType>(descr, min_value, max_value,
131+
metadata.num_values - null_count.value_or(0), null_count,
132+
distinct_count, min_exact, max_exact, pool);
113133
}
114134
// Default behavior
115-
return MakeStatistics<DType>(
116-
descr, metadata.statistics.min, metadata.statistics.max,
117-
metadata.num_values - metadata.statistics.null_count,
118-
metadata.statistics.null_count, metadata.statistics.distinct_count,
119-
metadata.statistics.__isset.max && metadata.statistics.__isset.min,
120-
metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count,
121-
min_exact, max_exact, pool);
135+
return MakeStatistics<DType>(descr, min_val, max_val,
136+
metadata.num_values - null_count.value_or(0), null_count,
137+
distinct_count, min_exact, max_exact, pool);
122138
}
123139

124140
namespace {
@@ -1610,7 +1626,7 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
16101626
(application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
16111627
// Only SIGNED are valid unless max and min are the same
16121628
// (in which case the sort order does not matter)
1613-
bool max_equals_min = statistics.has_min && statistics.has_max
1629+
bool max_equals_min = statistics.HasMin() && statistics.HasMax()
16141630
? statistics.min() == statistics.max()
16151631
: false;
16161632
if (SortOrder::SIGNED != sort_order && !max_equals_min) {

cpp/src/parquet/metadata_test.cc

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -154,18 +154,18 @@ TEST(Metadata, TestBuildAccess) {
154154
auto rg1_column2 = rg1_accessor->ColumnChunk(1);
155155
ASSERT_EQ(true, rg1_column1->is_stats_set());
156156
ASSERT_EQ(true, rg1_column2->is_stats_set());
157-
ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin());
158-
ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax());
159-
ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
160-
ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
161-
ASSERT_EQ(stats_float.min(), rg1_column2->encoded_statistics()->min());
162-
ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max());
163-
ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min());
164-
ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max());
165-
ASSERT_EQ(0, rg1_column1->statistics()->null_count());
166-
ASSERT_EQ(0, rg1_column2->statistics()->null_count());
167-
ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count());
168-
ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count());
157+
ASSERT_EQ(stats_float.Min(), rg1_column2->statistics()->EncodeMin());
158+
ASSERT_EQ(stats_float.Max(), rg1_column2->statistics()->EncodeMax());
159+
ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin());
160+
ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax());
161+
ASSERT_EQ(stats_float.Min(), rg1_column2->encoded_statistics()->Min());
162+
ASSERT_EQ(stats_float.Max(), rg1_column2->encoded_statistics()->Max());
163+
ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min());
164+
ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max());
165+
ASSERT_EQ(0, rg1_column1->statistics()->NullCount());
166+
ASSERT_EQ(0, rg1_column2->statistics()->NullCount());
167+
ASSERT_EQ(nrows, rg1_column1->statistics()->DistinctCount());
168+
ASSERT_EQ(nrows, rg1_column2->statistics()->DistinctCount());
169169
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression());
170170
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression());
171171
ASSERT_EQ(nrows / 2, rg1_column1->num_values());
@@ -205,18 +205,18 @@ TEST(Metadata, TestBuildAccess) {
205205
auto rg2_column2 = rg2_accessor->ColumnChunk(1);
206206
ASSERT_EQ(true, rg2_column1->is_stats_set());
207207
ASSERT_EQ(true, rg2_column2->is_stats_set());
208-
ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin());
209-
ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax());
210-
ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
211-
ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
212-
ASSERT_EQ(stats_float.min(), rg2_column2->encoded_statistics()->min());
213-
ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max());
214-
ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min());
215-
ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max());
216-
ASSERT_EQ(0, rg2_column1->statistics()->null_count());
217-
ASSERT_EQ(0, rg2_column2->statistics()->null_count());
218-
ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count());
219-
ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count());
208+
ASSERT_EQ(stats_float.Min(), rg2_column2->statistics()->EncodeMin());
209+
ASSERT_EQ(stats_float.Max(), rg2_column2->statistics()->EncodeMax());
210+
ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin());
211+
ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax());
212+
ASSERT_EQ(stats_float.Min(), rg2_column2->encoded_statistics()->Min());
213+
ASSERT_EQ(stats_float.Max(), rg2_column2->encoded_statistics()->Max());
214+
ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min());
215+
ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max());
216+
ASSERT_EQ(0, rg2_column1->statistics()->NullCount());
217+
ASSERT_EQ(0, rg2_column2->statistics()->NullCount());
218+
ASSERT_EQ(nrows, rg2_column1->statistics()->DistinctCount());
219+
ASSERT_EQ(nrows, rg2_column2->statistics()->DistinctCount());
220220
ASSERT_EQ(nrows / 2, rg2_column1->num_values());
221221
ASSERT_EQ(nrows / 2, rg2_column2->num_values());
222222
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());

0 commit comments

Comments
 (0)