/// @brief Convert an arithmetic value to its little-endian byte representation.
///
/// The object representation of @p value is copied into a vector of
/// `sizeof(T)` bytes. On hosts that report big-endian byte order through
/// `__BYTE_ORDER__`, the bytes are reversed so the result is little-endian.
///
/// @tparam T Arithmetic type (enforced by `static_assert`).
/// @param value Value to serialize.
/// @return `sizeof(T)` bytes, least-significant byte first.
template <typename T>
[[nodiscard]] inline std::vector<uint8_t> to_le_bytes(T value) {
    static_assert(std::is_arithmetic_v<T>,
                  "to_le_bytes requires an arithmetic type");

    std::vector<uint8_t> bytes(sizeof(T));
    // memcpy is the aliasing-safe way to read the object representation.
    std::memcpy(bytes.data(), &value, sizeof(T));
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    // Native order is most-significant byte first; flip it.
    std::reverse(bytes.begin(), bytes.end());
#endif
    return bytes;
}
/// @brief String overload: copy the characters of @p value into a byte vector.
///
/// No byte-order handling is applied; the bytes appear in the same order as
/// the characters of the string.
///
/// @param value String whose characters are copied.
/// @return One byte per character of @p value (empty for an empty string).
[[nodiscard]] inline std::vector<uint8_t> to_le_bytes(const std::string& value) {
    return {value.begin(), value.end()};
}
/// @brief Reconstruct an arithmetic value from its little-endian byte representation.
///
/// Only the first `sizeof(T)` bytes of @p bytes are consumed. On hosts that
/// report big-endian byte order through `__BYTE_ORDER__`, those bytes are
/// reversed before being copied into the result.
///
/// @tparam T Arithmetic type (enforced by `static_assert`).
/// @param bytes Little-endian bytes; extra trailing bytes are ignored.
/// @return The decoded value. When @p bytes holds fewer than `sizeof(T)`
///         bytes nothing is copied and a value-initialized `T` is returned.
///         NOTE(review): the initializer of the result was not visible in
///         the extracted listing; zero-initialization is assumed — confirm.
template <typename T>
[[nodiscard]] inline T from_le_bytes(const std::vector<uint8_t>& bytes) {
    static_assert(std::is_arithmetic_v<T>,
                  "from_le_bytes requires an arithmetic type");

    T value{};
    if (bytes.size() >= sizeof(T)) {
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        // Flip a private copy so the caller's buffer is left untouched.
        std::vector<uint8_t> tmp(bytes.begin(), bytes.begin() + sizeof(T));
        std::reverse(tmp.begin(), tmp.end());
        std::memcpy(&value, tmp.data(), sizeof(T));
#else
        std::memcpy(&value, bytes.data(), sizeof(T));
#endif
    }
    return value;
}
// Fragment of the templated ColumnStatistics::update dispatcher (name taken
// from the static_assert message below).
// NOTE(review): the leading numbers ("109", "111", ...) look like listing
// line numbers fused in by extraction, and listing lines 110, 114, 116 and
// 121 are absent from this view — confirm against the real header.
109 template <
typename T>
// bool is forwarded to update_numeric as a uint8_t holding 1 or 0.
111 if constexpr (std::is_same_v<T, bool>) {
112 update_numeric(
static_cast<uint8_t
>(value ? 1 : 0));
113 }
else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
// NaN inputs return here without any further processing in this function.
// NOTE(review): what happens to non-NaN floats is on a line not shown —
// presumably a call to update_float; confirm.
115 if (std::isnan(value))
return;
117 }
else if constexpr (std::is_same_v<T, std::string>) {
118 update_string(value);
119 }
else if constexpr (std::is_arithmetic_v<T>) {
// Remaining arithmetic types go through update_numeric unchanged.
120 update_numeric(value);
// !std::is_same_v<T, T> is always false but depends on T, so the assertion
// can only fire when this branch is actually instantiated for some T.
122 static_assert(!std::is_same_v<T, T>,
123 "ColumnStatistics::update: unsupported type");
// NOTE(review): two statements from a longer routine — presumably reset();
// the surrounding listing lines are absent from this view — confirm.
// Drops any stored distinct-value count.
143 distinct_count_ = std::nullopt;
// Marks min/max as not available.
146 has_min_max_ =
false;
152 [[nodiscard]] int64_t
null_count()
const {
return null_count_; }
154 [[nodiscard]] int64_t
num_values()
const {
return num_values_; }
156 [[nodiscard]] std::optional<int64_t>
distinct_count()
const {
return distinct_count_; }
161 [[nodiscard]]
const std::vector<uint8_t>&
min_bytes()
const {
return min_value_; }
163 [[nodiscard]]
const std::vector<uint8_t>&
max_bytes()
const {
return max_value_; }
173 template <
typename T>
175 if constexpr (std::is_same_v<T, bool>) {
176 return min_value_.empty() ? false : (min_value_[0] != 0);
177 }
else if constexpr (std::is_same_v<T, std::string>) {
178 return std::string(min_value_.begin(), min_value_.end());
180 return from_le_bytes<T>(min_value_);
189 template <
typename T>
191 if constexpr (std::is_same_v<T, bool>) {
192 return max_value_.empty() ? false : (max_value_[0] != 0);
193 }
else if constexpr (std::is_same_v<T, std::string>) {
194 return std::string(max_value_.begin(), max_value_.end());
196 return from_le_bytes<T>(max_value_);
// Fragment of a routine that folds `other` into this object — presumably
// merge(const ColumnStatistics&); the signature line is not in this view.
// NOTE(review): several listing lines (e.g. 226-227, 232, 235-237) are
// absent, so the bodies and conditions of some branches cannot be seen.
// Detects both sides carrying min/max while disagreeing on type_; the
// action taken in that case is not visible here.
225 if (has_min_max_ && other.has_min_max_ && type_ != other.type_) {
// Counters are summed across the two objects.
228 null_count_ += other.null_count_;
229 num_values_ += other.num_values_;
// Min/max is only touched when the other side has bounds recorded.
231 if (other.has_min_max_) {
// Takes over the other side's bounds verbatim — presumably the case where
// this object has no bounds yet (the guarding condition is not shown).
233 min_value_ = other.min_value_;
234 max_value_ = other.max_value_;
// Combines the other side's bounds with the stored ones via merge_min_max.
238 merge_min_max(other.min_value_, other.max_value_);
// If either side had a distinct count, it is cleared after merging.
243 if (distinct_count_.has_value() || other.distinct_count_.has_value()) {
244 distinct_count_ = std::nullopt;
// Data members of the statistics tracker.
// NOTE(review): the leading numbers look like fused listing line numbers.
// Count returned by null_count(); starts at 0.
249 int64_t null_count_ = 0;
// Count returned by num_values(); starts at 0.
250 int64_t num_values_ = 0;
// Optional distinct-value count; empty by default.
251 std::optional<int64_t> distinct_count_;
// Stored bytes of the current minimum (exposed by min_bytes()).
252 std::vector<uint8_t> min_value_;
// Stored bytes of the current maximum (exposed by max_bytes()).
253 std::vector<uint8_t> max_value_;
// Whether min_value_/max_value_ are considered valid; starts false.
254 bool has_min_max_ =
false;
// Combines another object's min/max bytes with the stored ones.
// NOTE(review): the lines selecting between the calls below (presumably a
// switch over the physical type) are absent from this view — confirm.
260 void merge_min_max(
const std::vector<uint8_t>& other_min,
261 const std::vector<uint8_t>& other_max) {
// Typed paths: decode as the named type and compare numerically.
264 typed_merge<int32_t>(other_min, other_max);
break;
266 typed_merge<int64_t>(other_min, other_max);
break;
268 typed_merge<float>(other_min, other_max);
break;
270 typed_merge<double>(other_min, other_max);
break;
// Remaining path: compares the raw byte vectors directly with operator<
// and operator> and keeps the smaller minimum / larger maximum.
273 if (other_min < min_value_) min_value_ = other_min;
274 if (other_max > max_value_) max_value_ = other_max;
// Merges min/max by decoding both sides as T and comparing the typed values.
// The stored bytes are overwritten with the other side's bytes whenever the
// other side wins the comparison.
// NOTE(review): listing lines 288-289 and 294 are absent from this view.
280 template <
typename T>
281 void typed_merge(
const std::vector<uint8_t>& other_min_bytes,
282 const std::vector<uint8_t>& other_max_bytes) {
// Decode the current and the incoming bounds.
283 T cur_min = from_le_bytes<T>(min_value_);
284 T cur_max = from_le_bytes<T>(max_value_);
285 T o_min = from_le_bytes<T>(other_min_bytes);
286 T o_max = from_le_bytes<T>(other_max_bytes);
287 if constexpr (std::is_floating_point_v<T>) {
// Floating-point path: std::fmin/std::fmax pick the bound; the stored bytes
// change only when the result differs from the current value.
// NOTE(review): presumably chosen for fmin/fmax NaN handling — confirm.
290 T new_min = std::fmin(cur_min, o_min);
291 T new_max = std::fmax(cur_max, o_max);
292 if (new_min != cur_min) min_value_ = other_min_bytes;
293 if (new_max != cur_max) max_value_ = other_max_bytes;
// Non-floating path: plain < and > comparisons.
295 if (o_min < cur_min) min_value_ = other_min_bytes;
296 if (o_max > cur_max) max_value_ = other_max_bytes;
// Fragment of the numeric update helper.
// NOTE(review): listing lines 305-314 and the branch bodies are absent from
// this view, so what happens before and after the comparisons is not shown.
303 template <
typename T>
304 void update_numeric(T value) {
// Decode the currently stored bounds as T for comparison with value.
315 T current_min = from_le_bytes<T>(min_value_);
316 T current_max = from_le_bytes<T>(max_value_);
// value is tested against each bound independently (bodies not visible).
318 if (value < current_min) {
321 if (value > current_max) {
// Fragment of the floating-point update helper.
// NOTE(review): most listing lines between 330 and 343, and the branch
// bodies, are absent from this view.
327 template <
typename T>
328 void update_float(T value) {
// NaN gets its own branch; what that branch does is not visible here.
330 if (std::isnan(value)) {
// Decode the currently stored bounds as T for comparison with value.
343 T current_min = from_le_bytes<T>(min_value_);
344 T current_max = from_le_bytes<T>(max_value_);
// value is tested against each bound independently (bodies not visible).
346 if (value < current_min) {
349 if (value > current_max) {
// Fragment of the string update helper.
// NOTE(review): the definition of `bytes` (presumably the byte form of
// value) and the branch bodies are absent from this view — confirm.
355 void update_string(
const std::string& value) {
// `bytes` is compared against the stored bounds with the byte vectors'
// operator< / operator>.
366 if (bytes < min_value_) {
369 if (bytes > max_value_) {
Per-column-chunk statistics tracker.
void set_type(PhysicalType t)
Set the physical type for type-aware min/max comparison during merge.
int64_t null_count() const
Number of null values recorded.
void reset()
Reset all statistics to initial state.
void update(const T &value)
Update statistics with a non-null typed value.
void merge(const ColumnStatistics &other)
Merge another ColumnStatistics into this one.
const std::vector< uint8_t > & max_bytes() const
Raw little-endian bytes of the maximum value.
PhysicalType type() const
Get the physical type associated with these statistics.
bool has_min_max() const
Whether at least one non-null value has been recorded (min/max valid).
T min_as() const
Reconstruct the typed minimum value from stored bytes.
void update_null()
Record a null value (increments null count only, no min/max update).
ColumnStatistics()
Default constructor – initializes all counters to zero.
std::optional< int64_t > distinct_count() const
Optional distinct-value count (invalidated on merge).
T max_as() const
Reconstruct the typed maximum value from stored bytes.
int64_t num_values() const
Number of non-null values recorded.
const std::vector< uint8_t > & min_bytes() const
Raw little-endian bytes of the minimum value.
void set_distinct_count(int64_t count)
Set the distinct-value count.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
std::vector< uint8_t > to_le_bytes(T value)
Convert an arithmetic value to its little-endian byte representation.
T from_le_bytes(const std::vector< uint8_t > &bytes)
Reconstruct an arithmetic value from its little-endian byte representation.
Parquet format enumerations, type traits, and statistics structs.