SIGNET_FORGE/statistics_8hpp_source.html

// SPDX-License-Identifier: AGPL-3.0-or-later

// Copyright 2026 Johnson Ogundeji

#pragma once


#include "signet/types.hpp"


#include <algorithm>

#include <cmath>

#include <cstring>

#include <limits>

#include <optional>

#include <string>

#include <type_traits>

#include <vector>


namespace signet::forge {


/// Convert an arithmetic value to its little-endian byte representation.
///
/// @tparam T  Arithmetic type (enforced at compile time).
/// @param value  The value to serialize.
/// @return A vector of exactly sizeof(T) bytes, least-significant byte first.
template <typename T>
[[nodiscard]] inline std::vector<uint8_t> to_le_bytes(T value) {
    static_assert(std::is_arithmetic_v<T>, "to_le_bytes requires an arithmetic type");

    // Snapshot the object representation into a fixed-size scratch buffer.
    uint8_t raw[sizeof(T)];
    std::memcpy(raw, &value, sizeof(T));

    // Native order already matches on little-endian targets; only a
    // big-endian build has to flip the bytes.
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    std::reverse(raw, raw + sizeof(T));
#endif

    return std::vector<uint8_t>(raw, raw + sizeof(T));
}


/// Convert a string to a byte vector.
///
/// Strings have no endianness: each character is copied in order and
/// converted to uint8_t.
///
/// @param value  The string whose characters are copied.
/// @return A vector with one byte per character of `value`.
[[nodiscard]] inline std::vector<uint8_t> to_le_bytes(const std::string& value) {
    std::vector<uint8_t> raw;
    raw.assign(value.begin(), value.end());
    return raw;
}


/// Reconstruct an arithmetic value from its little-endian byte representation.
///
/// Only the first sizeof(T) bytes are read; any trailing bytes are ignored.
///
/// @tparam T  Arithmetic type (enforced at compile time).
/// @param bytes  Little-endian encoded bytes.
/// @return The decoded value, or a value-initialized T (zero) when `bytes`
///         holds fewer than sizeof(T) bytes.
template <typename T>
[[nodiscard]] inline T from_le_bytes(const std::vector<uint8_t>& bytes) {
    static_assert(std::is_arithmetic_v<T>, "from_le_bytes requires an arithmetic type");

    T decoded{};

    // Too short to hold a T: report zero rather than read out of bounds.
    if (bytes.size() < sizeof(T)) {
        return decoded;
    }

#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    // Big-endian host: flip the leading sizeof(T) bytes into native order first.
    uint8_t swapped[sizeof(T)];
    std::memcpy(swapped, bytes.data(), sizeof(T));
    std::reverse(swapped, swapped + sizeof(T));
    std::memcpy(&decoded, swapped, sizeof(T));
#else
    std::memcpy(&decoded, bytes.data(), sizeof(T));
#endif

    return decoded;
}


/// Per-column-chunk statistics tracker.
///
/// Tracks the null count, the count of recorded non-null values, an optional
/// distinct count, and the running minimum / maximum. Min and max are stored
/// as byte vectors: arithmetic values as produced by to_le_bytes(), strings as
/// their raw bytes, and bool as a single 0/1 byte.
///
/// The physical type (see set_type()) is consulted only by merge(); update()
/// never changes it, reset() leaves it untouched, and it defaults to
/// PhysicalType::INT32.
class ColumnStatistics {
public:
    /// Default constructor – initializes all counters to zero.
    ColumnStatistics() { reset(); }

    // -- Core update methods ---------------------------------------------------

    /// Update statistics with a non-null typed value.
    ///
    /// Supported T: bool, any arithmetic type, and std::string; any other type
    /// is rejected at compile time. A floating-point NaN is ignored entirely:
    /// the early return below runs before `++num_values_`, so a NaN neither
    /// participates in min/max nor is counted.
    ///
    /// @param value  The value to fold into the running min/max.
    template <typename T>
    void update(const T& value) {
        if constexpr (std::is_same_v<T, bool>) {
            update_numeric(static_cast<uint8_t>(value ? 1 : 0));
        } else if constexpr (std::is_floating_point_v<T>) {
            // NaN values do not count toward num_values_ (Parquet spec S2.4).
            // is_floating_point_v (rather than float/double only) keeps a
            // long double NaN from being recorded as a min/max.
            if (std::isnan(value)) return;
            update_numeric(value);
        } else if constexpr (std::is_same_v<T, std::string>) {
            update_string(value);
        } else if constexpr (std::is_arithmetic_v<T>) {
            update_numeric(value);
        } else {
            // Always-false, but dependent on T so it only fires when this
            // branch is actually instantiated.
            static_assert(!std::is_same_v<T, T>,
                          "ColumnStatistics::update: unsupported type");
        }
        // Only reached for values that were actually recorded (NaN returned above).
        ++num_values_;
    }

    /// Record a null value (increments null count only, no min/max update).
    void update_null() {
        ++null_count_;
    }

    /// Reset all statistics to initial state.
    ///
    /// Clears counters, distinct count and min/max. The physical type set via
    /// set_type() is not changed.
    void reset() {
        null_count_     = 0;
        num_values_     = 0;
        distinct_count_ = std::nullopt;
        min_value_.clear();
        max_value_.clear();
        has_min_max_    = false;
    }

    // -- Accessors -------------------------------------------------------------

    /// Number of null values recorded.
    [[nodiscard]] int64_t null_count()  const { return null_count_; }
    /// Number of non-null values recorded.
    [[nodiscard]] int64_t num_values()  const { return num_values_; }
    /// Optional distinct-value count (invalidated on merge).
    [[nodiscard]] std::optional<int64_t> distinct_count() const { return distinct_count_; }
    /// Whether at least one non-null value has been recorded (min/max valid).
    [[nodiscard]] bool    has_min_max() const { return has_min_max_; }

    /// Raw stored bytes of the minimum value (empty until a value is recorded).
    [[nodiscard]] const std::vector<uint8_t>& min_bytes() const { return min_value_; }
    /// Raw stored bytes of the maximum value (empty until a value is recorded).
    [[nodiscard]] const std::vector<uint8_t>& max_bytes() const { return max_value_; }

    /// Reconstruct the typed minimum value from stored bytes.
    ///
    /// @tparam T  bool, std::string, or an arithmetic type.
    /// @return The decoded minimum; false / empty string / zero when the stored
    ///         bytes are empty (or, for arithmetic T, shorter than sizeof(T)).
    template <typename T>
    [[nodiscard]] T min_as() const {
        return decode_as<T>(min_value_);
    }

    /// Reconstruct the typed maximum value from stored bytes.
    ///
    /// @tparam T  bool, std::string, or an arithmetic type.
    /// @return The decoded maximum; false / empty string / zero when the stored
    ///         bytes are empty (or, for arithmetic T, shorter than sizeof(T)).
    template <typename T>
    [[nodiscard]] T max_as() const {
        return decode_as<T>(max_value_);
    }

    // -- Mutators for optional fields ------------------------------------------

    /// Set the distinct-value count.
    void set_distinct_count(int64_t count) { distinct_count_ = count; }

    /// Set the physical type for type-aware min/max comparison during merge.
    void set_type(PhysicalType t) { type_ = t; }

    /// Get the physical type associated with these statistics.
    [[nodiscard]] PhysicalType type() const { return type_; }

    // -- Merge two statistics (useful for combining page stats into chunk stats) -

    /// Merge another ColumnStatistics into this one.
    ///
    /// Counts are summed and min/max are widened to cover both inputs. Any
    /// distinct count on either side is dropped, because it cannot be combined
    /// without the full distinct set.
    ///
    /// If both sides already hold min/max but their physical types differ, the
    /// call is a silent no-op: nothing, including the counts, is merged.
    ///
    /// @param other  Statistics to fold in; expected to share this object's
    ///               physical type.
    void merge(const ColumnStatistics& other) {
        // Guard: merging statistics of different physical types is a logic error
        if (has_min_max_ && other.has_min_max_ && type_ != other.type_) {
            return; // silently skip — caller must ensure same type
        }
        null_count_ += other.null_count_;
        num_values_ += other.num_values_;

        if (other.has_min_max_) {
            if (!has_min_max_) {
                // First min/max seen on this side: adopt the other's bounds.
                // NOTE(review): type_ is not adopted here, so the caller must
                // have set the same type on both objects — confirm intended.
                min_value_  = other.min_value_;
                max_value_  = other.max_value_;
                has_min_max_ = true;
            } else {
                // Use typed comparison for numeric types (CWE-697: incorrect comparison)
                merge_min_max(other.min_value_, other.max_value_);
            }
        }

        // distinct_count cannot be merged without a full distinct set
        if (distinct_count_.has_value() || other.distinct_count_.has_value()) {
            distinct_count_ = std::nullopt; // invalidate on merge
        }
    }

private:
    int64_t                null_count_     = 0;      // nulls seen via update_null()
    int64_t                num_values_     = 0;      // values recorded via update()
    std::optional<int64_t> distinct_count_;          // unset unless set_distinct_count() was called
    std::vector<uint8_t>   min_value_;               // encoded running minimum
    std::vector<uint8_t>   max_value_;               // encoded running maximum
    bool                   has_min_max_    = false;  // true once min/max hold a value
    PhysicalType           type_           = PhysicalType::INT32; // used by merge() only

    // -- Typed decode helper (shared by min_as() / max_as()) ---------------------

    /// Decode stored bytes as T: bool from the first byte, std::string from the
    /// raw bytes, anything else through from_le_bytes<T>().
    template <typename T>
    [[nodiscard]] static T decode_as(const std::vector<uint8_t>& bytes) {
        if constexpr (std::is_same_v<T, bool>) {
            return bytes.empty() ? false : (bytes[0] != 0);
        } else if constexpr (std::is_same_v<T, std::string>) {
            return std::string(bytes.begin(), bytes.end());
        } else {
            return from_le_bytes<T>(bytes);
        }
    }

    // -- Typed merge helpers (used by merge()) -----------------------------------

    /// Widen min/max with the other side's bounds, comparing according to type_.
    ///
    /// NOTE(review): type_ defaults to INT32 and update() never sets it, so a
    /// column of another width that never had set_type() called would be
    /// compared as int32 here — verify callers always set the type.
    void merge_min_max(const std::vector<uint8_t>& other_min,
                       const std::vector<uint8_t>& other_max) {
        switch (type_) {
            case PhysicalType::INT32:
                typed_merge<int32_t>(other_min, other_max); break;
            case PhysicalType::INT64:
                typed_merge<int64_t>(other_min, other_max); break;
            case PhysicalType::FLOAT:
                typed_merge<float>(other_min, other_max); break;
            case PhysicalType::DOUBLE:
                typed_merge<double>(other_min, other_max); break;
            default:
                // BOOLEAN, BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY: lexicographic is correct
                if (other_min < min_value_) min_value_ = other_min;
                if (other_max > max_value_) max_value_ = other_max;
                break;
        }
    }

    /// Decode both sides' bounds as T and keep the smaller min / larger max.
    template <typename T>
    void typed_merge(const std::vector<uint8_t>& other_min_bytes,
                     const std::vector<uint8_t>& other_max_bytes) {
        T cur_min = from_le_bytes<T>(min_value_);
        T cur_max = from_le_bytes<T>(max_value_);
        T o_min   = from_le_bytes<T>(other_min_bytes);
        T o_max   = from_le_bytes<T>(other_max_bytes);
        if constexpr (std::is_floating_point_v<T>) {
            // Use fmin/fmax to handle NaN correctly: NaN is treated as missing,
            // so non-NaN always wins. std::fmin(NaN, x) == x.
            T new_min = std::fmin(cur_min, o_min);
            T new_max = std::fmax(cur_max, o_max);
            // A changed result means the other side's bound won.
            if (new_min != cur_min) min_value_ = other_min_bytes;
            if (new_max != cur_max) max_value_ = other_max_bytes;
        } else {
            if (o_min < cur_min) min_value_ = other_min_bytes;
            if (o_max > cur_max) max_value_ = other_max_bytes;
        }
    }

    // -- Internal update helpers -----------------------------------------------

    /// Fold one arithmetic value into min/max.
    ///
    /// Serves integers, the bool-as-uint8_t case and (NaN-filtered) floats.
    /// The value is serialized only when it becomes a new bound, so values
    /// inside the current range do not build a byte vector.
    template <typename T>
    void update_numeric(T value) {
        if (!has_min_max_) {
            min_value_   = to_le_bytes(value);
            max_value_   = min_value_;
            has_min_max_ = true;
            return;
        }

        // Compare as native typed values for correctness (signed vs unsigned)
        const T current_min = from_le_bytes<T>(min_value_);
        const T current_max = from_le_bytes<T>(max_value_);

        if (value < current_min) {
            min_value_ = to_le_bytes(value);
        }
        if (value > current_max) {
            max_value_ = to_le_bytes(value);
        }
    }

    /// Fold one string into min/max using byte-wise lexicographic order.
    void update_string(const std::string& value) {
        auto bytes = to_le_bytes(value);

        if (!has_min_max_) {
            min_value_   = bytes;
            max_value_   = bytes;
            has_min_max_ = true;
            return;
        }

        // Lexicographic comparison on raw bytes (equivalent to std::string comparison)
        if (bytes < min_value_) {
            min_value_ = bytes;
        }
        if (bytes > max_value_) {
            max_value_ = bytes;
        }
    }
};


} // namespace signet::forge

signet::forge::ColumnStatistics
Per-column-chunk statistics tracker.
Definition statistics.hpp:94

signet::forge::ColumnStatistics::set_type
void set_type(PhysicalType t)
Set the physical type for type-aware min/max comparison during merge.
Definition statistics.hpp:208

signet::forge::ColumnStatistics::null_count
int64_t null_count() const
Number of null values recorded.
Definition statistics.hpp:152

signet::forge::ColumnStatistics::reset
void reset()
Reset all statistics to initial state.
Definition statistics.hpp:140

signet::forge::ColumnStatistics::update
void update(const T &value)
Update statistics with a non-null typed value.
Definition statistics.hpp:110

signet::forge::ColumnStatistics::merge
void merge(const ColumnStatistics &other)
Merge another ColumnStatistics into this one.
Definition statistics.hpp:223

signet::forge::ColumnStatistics::max_bytes
const std::vector< uint8_t > & max_bytes() const
Raw little-endian bytes of the maximum value.
Definition statistics.hpp:163

signet::forge::ColumnStatistics::type
PhysicalType type() const
Get the physical type associated with these statistics.
Definition statistics.hpp:211

signet::forge::ColumnStatistics::has_min_max
bool has_min_max() const
Whether at least one non-null value has been recorded (min/max valid).
Definition statistics.hpp:158

signet::forge::ColumnStatistics::min_as
T min_as() const
Reconstruct the typed minimum value from stored bytes.
Definition statistics.hpp:174

signet::forge::ColumnStatistics::update_null
void update_null()
Record a null value (increments null count only, no min/max update).
Definition statistics.hpp:135

signet::forge::ColumnStatistics::ColumnStatistics
ColumnStatistics()
Default constructor – initializes all counters to zero.
Definition statistics.hpp:97

signet::forge::ColumnStatistics::distinct_count
std::optional< int64_t > distinct_count() const
Optional distinct-value count (invalidated on merge).
Definition statistics.hpp:156

signet::forge::ColumnStatistics::max_as
T max_as() const
Reconstruct the typed maximum value from stored bytes.
Definition statistics.hpp:190

signet::forge::ColumnStatistics::num_values
int64_t num_values() const
Number of non-null values recorded.
Definition statistics.hpp:154

signet::forge::ColumnStatistics::min_bytes
const std::vector< uint8_t > & min_bytes() const
Raw little-endian bytes of the minimum value.
Definition statistics.hpp:161

signet::forge::ColumnStatistics::set_distinct_count
void set_distinct_count(int64_t count)
Set the distinct-value count.
Definition statistics.hpp:204

signet::forge
Definition audit_chain.hpp:74

signet::forge::PhysicalType
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20

signet::forge::PhysicalType::INT64
@ INT64
64-bit signed integer (little-endian).

signet::forge::PhysicalType::INT32
@ INT32
32-bit signed integer (little-endian).

signet::forge::PhysicalType::FLOAT
@ FLOAT
IEEE 754 single-precision float.

signet::forge::PhysicalType::DOUBLE
@ DOUBLE
IEEE 754 double-precision float.

signet::forge::to_le_bytes
std::vector< uint8_t > to_le_bytes(T value)
Convert an arithmetic value to its little-endian byte representation.
Definition statistics.hpp:34

signet::forge::from_le_bytes
T from_le_bytes(const std::vector< uint8_t > &bytes)
Reconstruct an arithmetic value from its little-endian byte representation.
Definition statistics.hpp:66

types.hpp
Parquet format enumerations, type traits, and statistics structs.