Skip to content

Commit 322516f

Browse files
authored
GH-47105: [Statistics][C++] Implement Statistics specification attribute ARROW:row_count:approximate (#48266)
### Rationale for this change

Implement the statistics specification attribute `ARROW:row_count:approximate`.

### What changes are included in this PR?

Implement the statistics specification attribute `ARROW:row_count:approximate` with relevant tests.

### Are these changes tested?

Yes, I ran the related tests.

### Are there any user-facing changes?

Yes, this adds the `arrow::ArrayStatistics::row_count` attribute.

* GitHub Issue: #47105

Authored-by: arash andishgar <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 0879f5b commit 322516f

File tree

4 files changed

+55
-1
lines changed

4 files changed

+55
-1
lines changed

cpp/src/arrow/array/array_test.cc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3909,6 +3909,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39093909
public:
39103910
void SetUp() {
39113911
valids_ = {1, 0, 1, 1};
3912+
row_count_ = static_cast<int64_t>(valids_.size());
39123913
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
39133914
distinct_count_ = 3.0;
39143915
max_byte_width_ = 4.0;
@@ -3921,6 +3922,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39213922
data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_},
39223923
null_count_);
39233924
data_->statistics = std::make_shared<ArrayStatistics>();
3925+
data_->statistics->row_count = row_count_;
39243926
data_->statistics->null_count = null_count_;
39253927
data_->statistics->distinct_count = distinct_count_;
39263928
data_->statistics->max_byte_width = max_byte_width_;
@@ -3934,6 +3936,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39343936

39353937
protected:
39363938
std::vector<uint8_t> valids_;
3939+
int64_t row_count_;
39373940
int64_t null_count_;
39383941
double distinct_count_;
39393942
double max_byte_width_;
@@ -3950,6 +3953,9 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
39503953
ArrayData copied_data(*data_);
39513954
ArrayData moved_data(std::move(copied_data));
39523955

3956+
ASSERT_TRUE(moved_data.statistics->row_count.has_value());
3957+
ASSERT_EQ(row_count_, std::get<int64_t>(moved_data.statistics->row_count.value()));
3958+
39533959
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
39543960
ASSERT_EQ(null_count_, std::get<int64_t>(moved_data.statistics->null_count.value()));
39553961

@@ -3980,6 +3986,9 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
39803986
TEST_F(TestArrayDataStatistics, CopyConstructor) {
39813987
ArrayData copied_data(*data_);
39823988

3989+
ASSERT_TRUE(copied_data.statistics->row_count.has_value());
3990+
ASSERT_EQ(row_count_, std::get<int64_t>(copied_data.statistics->row_count.value()));
3991+
39833992
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
39843993
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data.statistics->null_count.value()));
39853994

@@ -4012,6 +4021,9 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
40124021
ArrayData moved_data;
40134022
moved_data = std::move(copied_data);
40144023

4024+
ASSERT_TRUE(moved_data.statistics->row_count.has_value());
4025+
ASSERT_EQ(row_count_, std::get<int64_t>(moved_data.statistics->row_count.value()));
4026+
40154027
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
40164028
ASSERT_EQ(null_count_, std::get<int64_t>(moved_data.statistics->null_count.value()));
40174029

@@ -4043,6 +4055,9 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
40434055
ArrayData copied_data;
40444056
copied_data = *data_;
40454057

4058+
ASSERT_TRUE(copied_data.statistics->row_count.has_value());
4059+
ASSERT_EQ(row_count_, std::get<int64_t>(copied_data.statistics->row_count.value()));
4060+
40464061
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
40474062
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data.statistics->null_count.value()));
40484063

@@ -4074,6 +4089,9 @@ TEST_F(TestArrayDataStatistics, CopyTo) {
40744089
ASSERT_OK_AND_ASSIGN(auto copied_data,
40754090
data_->CopyTo(arrow::default_cpu_memory_manager()));
40764091

4092+
ASSERT_TRUE(copied_data->statistics->row_count.has_value());
4093+
ASSERT_EQ(row_count_, std::get<int64_t>(copied_data->statistics->row_count.value()));
4094+
40774095
ASSERT_TRUE(copied_data->statistics->null_count.has_value());
40784096
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data->statistics->null_count.value()));
40794097

cpp/src/arrow/array/statistics.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,12 @@ struct ARROW_EXPORT ArrayStatistics {
7575
return std::visit(visitor, value.value());
7676
}
7777

78+
/// \brief The number of rows, may not be set
79+
/// Note: when set to `int64_t`, it represents `exact_row_count`,
80+
/// and when set to `double`, it represents `approximate_row_count`.
81+
/// Note: this value is not used by \ref arrow::RecordBatch::MakeStatisticsArray.
82+
std::optional<CountType> row_count = std::nullopt;
83+
7884
/// \brief The number of null values, may not be set
7985
/// Note: when set to `int64_t`, it represents `exact_null_count`,
8086
/// and when set to `double`, it represents `approximate_null_count`.

cpp/src/arrow/array/statistics_test.cc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,22 @@
2525

2626
namespace arrow {
2727

28+
// Checks that a default-constructed ArrayStatistics has no row_count, and that
// assigning the integer literal 45 stores it as the int64_t alternative.
TEST(TestArrayStatistics, RowCountExact) {
29+
ArrayStatistics statistics;
30+
ASSERT_FALSE(statistics.row_count.has_value());
31+
statistics.row_count = 45;
32+
ASSERT_TRUE(statistics.row_count.has_value());
33+
ASSERT_EQ(45, std::get<int64_t>(statistics.row_count.value()));
34+
}
35+
36+
// Checks that a default-constructed ArrayStatistics has no row_count, and that
// assigning the floating-point literal 45.0 stores it as the double alternative.
TEST(TestArrayStatistics, RowCountApproximate) {
37+
ArrayStatistics statistics;
38+
ASSERT_FALSE(statistics.row_count.has_value());
39+
statistics.row_count = 45.0;
40+
ASSERT_TRUE(statistics.row_count.has_value());
41+
ASSERT_DOUBLE_EQ(45.0, std::get<double>(statistics.row_count.value()));
42+
}
43+
2844
TEST(TestArrayStatistics, NullCountExact) {
2945
ArrayStatistics statistics;
3046
ASSERT_FALSE(statistics.null_count.has_value());
@@ -114,6 +130,18 @@ TEST(TestArrayStatistics, Equals) {
114130

115131
ASSERT_EQ(statistics1, statistics2);
116132

133+
// Test ROW_COUNT_EXACT
134+
statistics1.row_count = 45;
135+
ASSERT_NE(statistics1, statistics2);
136+
statistics2.row_count = 45;
137+
ASSERT_EQ(statistics1, statistics2);
138+
139+
// Test ROW_COUNT_APPROXIMATE
140+
statistics1.row_count = 45.0;
141+
ASSERT_NE(statistics1, statistics2);
142+
statistics2.row_count = 45.0;
143+
ASSERT_EQ(statistics1, statistics2);
144+
117145
// Test NULL_COUNT_EXACT
118146
statistics1.null_count = 29;
119147
ASSERT_NE(statistics1, statistics2);

cpp/src/arrow/compare.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1563,7 +1563,9 @@ bool ArrayStatisticsOptionalValueEquals(const std::optional<Type>& left,
15631563

15641564
bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right,
15651565
const EqualOptions& equal_options) {
1566-
return ArrayStatisticsOptionalValueEquals(left.null_count, right.null_count,
1566+
return ArrayStatisticsOptionalValueEquals(left.row_count, right.row_count,
1567+
equal_options) &&
1568+
ArrayStatisticsOptionalValueEquals(left.null_count, right.null_count,
15671569
equal_options) &&
15681570
ArrayStatisticsOptionalValueEquals(left.distinct_count, right.distinct_count,
15691571
equal_options) &&

0 commit comments

Comments
 (0)