Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
statistics.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
11
12#include "signet/types.hpp"
13
14#include <algorithm>
15#include <cmath>
16#include <cstring>
17#include <limits>
18#include <optional>
19#include <string>
20#include <type_traits>
21#include <vector>
22
23namespace signet::forge {
24
/// Convert an arithmetic value to its little-endian byte representation.
///
/// @tparam T     Arithmetic type (integer, floating point, or bool).
/// @param value  Value to serialize.
/// @return A vector of exactly `sizeof(T)` bytes, least-significant byte first.
template <typename T>
[[nodiscard]] inline std::vector<uint8_t> to_le_bytes(T value) {
    static_assert(std::is_arithmetic_v<T>, "to_le_bytes requires an arithmetic type");

    // Take the object representation in native byte order first.
    uint8_t raw[sizeof(T)];
    std::memcpy(raw, &value, sizeof(T));

    // Only big-endian targets need the swap; elsewhere this is compiled out.
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    std::reverse(raw, raw + sizeof(T));
#endif

    return std::vector<uint8_t>(raw, raw + sizeof(T));
}
48
/// Copy the characters of a string into a byte vector, preserving order.
///
/// @param value  String whose characters are copied.
/// @return One byte per character of @p value; empty for an empty string.
[[nodiscard]] inline std::vector<uint8_t> to_le_bytes(const std::string& value) {
    std::vector<uint8_t> bytes;
    bytes.reserve(value.size());
    for (const char ch : value) {
        bytes.push_back(static_cast<uint8_t>(ch));
    }
    return bytes;
}
56
/// Reconstruct an arithmetic value from its little-endian byte representation.
///
/// @tparam T     Arithmetic type to decode.
/// @param bytes  Source buffer; only the first `sizeof(T)` bytes are read.
/// @return The decoded value, or a value-initialized `T` (zero) when
///         @p bytes holds fewer than `sizeof(T)` bytes.
template <typename T>
[[nodiscard]] inline T from_le_bytes(const std::vector<uint8_t>& bytes) {
    static_assert(std::is_arithmetic_v<T>, "from_le_bytes requires an arithmetic type");

    T result{};
    if (bytes.size() < sizeof(T)) {
        return result;  // Buffer too short: report zero rather than read past the end.
    }

    uint8_t raw[sizeof(T)];
    std::copy_n(bytes.begin(), sizeof(T), raw);

    // Only big-endian targets need the swap back to native order.
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    std::reverse(raw, raw + sizeof(T));
#endif

    std::memcpy(&result, raw, sizeof(T));
    return result;
}
81
95public:
98
99 // -- Core update methods ---------------------------------------------------
100
109 template <typename T>
110 void update(const T& value) {
111 if constexpr (std::is_same_v<T, bool>) {
112 update_numeric(static_cast<uint8_t>(value ? 1 : 0));
113 } else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
114 // NaN values do not count toward num_values_ (Parquet spec S2.4)
115 if (std::isnan(value)) return;
116 update_float(value);
117 } else if constexpr (std::is_same_v<T, std::string>) {
118 update_string(value);
119 } else if constexpr (std::is_arithmetic_v<T>) {
120 update_numeric(value);
121 } else {
122 static_assert(!std::is_same_v<T, T>,
123 "ColumnStatistics::update: unsupported type");
124 }
125 // Note: NaN early-returns above, so this only increments for non-NaN values
126 ++num_values_;
127 }
128
// The NaN guard in update() (`if (std::isnan(value)) return;`) runs before
// `++num_values_`, so NaN values are never counted. update_float() repeats
// the NaN check and also returns early, but for calls routed through update()
// that second check is never reached with a NaN input.
133
135 void update_null() {
136 ++null_count_;
137 }
138
140 void reset() {
141 null_count_ = 0;
142 num_values_ = 0;
143 distinct_count_ = std::nullopt;
144 min_value_.clear();
145 max_value_.clear();
146 has_min_max_ = false;
147 }
148
149 // -- Accessors -------------------------------------------------------------
150
152 [[nodiscard]] int64_t null_count() const { return null_count_; }
154 [[nodiscard]] int64_t num_values() const { return num_values_; }
156 [[nodiscard]] std::optional<int64_t> distinct_count() const { return distinct_count_; }
158 [[nodiscard]] bool has_min_max() const { return has_min_max_; }
159
161 [[nodiscard]] const std::vector<uint8_t>& min_bytes() const { return min_value_; }
163 [[nodiscard]] const std::vector<uint8_t>& max_bytes() const { return max_value_; }
164
173 template <typename T>
174 [[nodiscard]] T min_as() const {
175 if constexpr (std::is_same_v<T, bool>) {
176 return min_value_.empty() ? false : (min_value_[0] != 0);
177 } else if constexpr (std::is_same_v<T, std::string>) {
178 return std::string(min_value_.begin(), min_value_.end());
179 } else {
180 return from_le_bytes<T>(min_value_);
181 }
182 }
183
189 template <typename T>
190 [[nodiscard]] T max_as() const {
191 if constexpr (std::is_same_v<T, bool>) {
192 return max_value_.empty() ? false : (max_value_[0] != 0);
193 } else if constexpr (std::is_same_v<T, std::string>) {
194 return std::string(max_value_.begin(), max_value_.end());
195 } else {
196 return from_le_bytes<T>(max_value_);
197 }
198 }
199
200 // -- Mutators for optional fields ------------------------------------------
201
204 void set_distinct_count(int64_t count) { distinct_count_ = count; }
205
208 void set_type(PhysicalType t) { type_ = t; }
209
211 [[nodiscard]] PhysicalType type() const { return type_; }
212
213 // -- Merge two statistics (useful for combining page stats into chunk stats) -
214
223 void merge(const ColumnStatistics& other) {
224 // Guard: merging statistics of different physical types is a logic error
225 if (has_min_max_ && other.has_min_max_ && type_ != other.type_) {
226 return; // silently skip — caller must ensure same type
227 }
228 null_count_ += other.null_count_;
229 num_values_ += other.num_values_;
230
231 if (other.has_min_max_) {
232 if (!has_min_max_) {
233 min_value_ = other.min_value_;
234 max_value_ = other.max_value_;
235 has_min_max_ = true;
236 } else {
237 // Use typed comparison for numeric types (CWE-697: incorrect comparison)
238 merge_min_max(other.min_value_, other.max_value_);
239 }
240 }
241
242 // distinct_count cannot be merged without a full distinct set
243 if (distinct_count_.has_value() || other.distinct_count_.has_value()) {
244 distinct_count_ = std::nullopt; // invalidate on merge
245 }
246 }
247
248private:
249 int64_t null_count_ = 0;
250 int64_t num_values_ = 0;
251 std::optional<int64_t> distinct_count_;
252 std::vector<uint8_t> min_value_;
253 std::vector<uint8_t> max_value_;
254 bool has_min_max_ = false;
256
257 // -- Typed merge helpers (used by merge()) -----------------------------------
258
260 void merge_min_max(const std::vector<uint8_t>& other_min,
261 const std::vector<uint8_t>& other_max) {
262 switch (type_) {
264 typed_merge<int32_t>(other_min, other_max); break;
266 typed_merge<int64_t>(other_min, other_max); break;
268 typed_merge<float>(other_min, other_max); break;
270 typed_merge<double>(other_min, other_max); break;
271 default:
272 // BOOLEAN, BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY: lexicographic is correct
273 if (other_min < min_value_) min_value_ = other_min;
274 if (other_max > max_value_) max_value_ = other_max;
275 break;
276 }
277 }
278
280 template <typename T>
281 void typed_merge(const std::vector<uint8_t>& other_min_bytes,
282 const std::vector<uint8_t>& other_max_bytes) {
283 T cur_min = from_le_bytes<T>(min_value_);
284 T cur_max = from_le_bytes<T>(max_value_);
285 T o_min = from_le_bytes<T>(other_min_bytes);
286 T o_max = from_le_bytes<T>(other_max_bytes);
287 if constexpr (std::is_floating_point_v<T>) {
288 // Use fmin/fmax to handle NaN correctly: NaN is treated as missing,
289 // so non-NaN always wins. std::fmin(NaN, x) == x.
290 T new_min = std::fmin(cur_min, o_min);
291 T new_max = std::fmax(cur_max, o_max);
292 if (new_min != cur_min) min_value_ = other_min_bytes;
293 if (new_max != cur_max) max_value_ = other_max_bytes;
294 } else {
295 if (o_min < cur_min) min_value_ = other_min_bytes;
296 if (o_max > cur_max) max_value_ = other_max_bytes;
297 }
298 }
299
300 // -- Internal update helpers -----------------------------------------------
301
303 template <typename T>
304 void update_numeric(T value) {
305 auto bytes = to_le_bytes(value);
306
307 if (!has_min_max_) {
308 min_value_ = bytes;
309 max_value_ = bytes;
310 has_min_max_ = true;
311 return;
312 }
313
314 // Compare as native typed values for correctness (signed vs unsigned)
315 T current_min = from_le_bytes<T>(min_value_);
316 T current_max = from_le_bytes<T>(max_value_);
317
318 if (value < current_min) {
319 min_value_ = bytes;
320 }
321 if (value > current_max) {
322 max_value_ = bytes;
323 }
324 }
325
327 template <typename T>
328 void update_float(T value) {
329 // Skip NaN values entirely — they do not participate in min/max
330 if (std::isnan(value)) {
331 return;
332 }
333
334 auto bytes = to_le_bytes(value);
335
336 if (!has_min_max_) {
337 min_value_ = bytes;
338 max_value_ = bytes;
339 has_min_max_ = true;
340 return;
341 }
342
343 T current_min = from_le_bytes<T>(min_value_);
344 T current_max = from_le_bytes<T>(max_value_);
345
346 if (value < current_min) {
347 min_value_ = bytes;
348 }
349 if (value > current_max) {
350 max_value_ = bytes;
351 }
352 }
353
355 void update_string(const std::string& value) {
356 auto bytes = to_le_bytes(value);
357
358 if (!has_min_max_) {
359 min_value_ = bytes;
360 max_value_ = bytes;
361 has_min_max_ = true;
362 return;
363 }
364
365 // Lexicographic comparison on raw bytes (equivalent to std::string comparison)
366 if (bytes < min_value_) {
367 min_value_ = bytes;
368 }
369 if (bytes > max_value_) {
370 max_value_ = bytes;
371 }
372 }
373};
374
375} // namespace signet::forge
Per-column-chunk statistics tracker.
void set_type(PhysicalType t)
Set the physical type for type-aware min/max comparison during merge.
int64_t null_count() const
Number of null values recorded.
void reset()
Reset all statistics to initial state.
void update(const T &value)
Update statistics with a non-null typed value.
void merge(const ColumnStatistics &other)
Merge another ColumnStatistics into this one.
const std::vector< uint8_t > & max_bytes() const
Raw little-endian bytes of the maximum value.
PhysicalType type() const
Get the physical type associated with these statistics.
bool has_min_max() const
Whether at least one non-null value has been recorded (min/max valid).
T min_as() const
Reconstruct the typed minimum value from stored bytes.
void update_null()
Record a null value (increments null count only, no min/max update).
ColumnStatistics()
Default constructor – initializes all counters to zero.
std::optional< int64_t > distinct_count() const
Optional distinct-value count (invalidated on merge).
T max_as() const
Reconstruct the typed maximum value from stored bytes.
int64_t num_values() const
Number of non-null values recorded.
const std::vector< uint8_t > & min_bytes() const
Raw little-endian bytes of the minimum value.
void set_distinct_count(int64_t count)
Set the distinct-value count.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
std::vector< uint8_t > to_le_bytes(T value)
Convert an arithmetic value to its little-endian byte representation.
T from_le_bytes(const std::vector< uint8_t > &bytes)
Reconstruct an arithmetic value from its little-endian byte representation.
Parquet format enumerations, type traits, and statistics structs.