SIGNET_FORGE/delta_8hpp_source.html

// SPDX-License-Identifier: AGPL-3.0-or-later

// Copyright 2026 Johnson Ogundeji


#pragma once


// ---------------------------------------------------------------------------

// delta.hpp -- DELTA_BINARY_PACKED encoding (Parquet encoding=5)

//

// Delta-encodes int32/int64 values for high compression on sorted or

// monotonic sequences (timestamps, sequence IDs, etc.). Can achieve 90%+

// compression on sorted time-series data.

//

// Wire format:

//

// 1. Header (all unsigned varints, first_value is zigzag-encoded):

//    - block_size:        values per block (must be multiple of 128)

//    - miniblock_count:   miniblocks per block

//    - total_value_count: total number of values encoded

//    - first_value:       zigzag-encoded first value

//

// 2. Per block:

//    - min_delta:   zigzag-encoded minimum delta in this block

//    - bit_widths:  one byte per miniblock (bit width for that miniblock)

//    - miniblocks:  each miniblock contains (block_size / miniblock_count)

//                   values, bit-packed at the specified width. Stored values

//                   are (delta - min_delta), always non-negative.

//

// Zigzag encoding maps signed integers to unsigned:

//    encode: (n << 1) ^ (n >> 63)  for int64

//    decode: (v >> 1) ^ -(v & 1)

// ---------------------------------------------------------------------------


#include <algorithm>

#include <cassert>

#include <cstdint>

#include <cstring>

#include <limits>

#include <vector>


namespace signet::forge {


namespace delta {


// ---------------------------------------------------------------------------

// Constants -- tuned for the Parquet default configuration

// ---------------------------------------------------------------------------


/// Default number of delta values per block (must be a multiple of 128).
inline constexpr size_t DEFAULT_BLOCK_SIZE{128};

/// Default number of miniblocks within each block.
inline constexpr size_t DEFAULT_MINIBLOCK_COUNT{4};

/// Number of delta values per miniblock (DEFAULT_BLOCK_SIZE / DEFAULT_MINIBLOCK_COUNT).
inline constexpr size_t VALUES_PER_MINIBLOCK{DEFAULT_BLOCK_SIZE / DEFAULT_MINIBLOCK_COUNT};


// ---------------------------------------------------------------------------

// Zigzag encoding/decoding

// ---------------------------------------------------------------------------


/// @brief Zigzag-encode a signed 64-bit integer to an unsigned representation.
///
/// Small magnitudes map to small codes: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
///
/// @param n Signed value to encode.
/// @return The zigzag code of @p n.
[[nodiscard]] inline uint64_t zigzag_encode(int64_t n) {
    // The doubling is done on the unsigned representation so that the left
    // shift can never be a signed overflow.
    const uint64_t doubled = static_cast<uint64_t>(n) << 1;
    // All ones for negative inputs, all zeros otherwise.
    const uint64_t sign_mask = (n < 0) ? ~uint64_t{0} : uint64_t{0};
    return doubled ^ sign_mask;
}


/// @brief Zigzag-encode a signed 32-bit integer to an unsigned representation.
///
/// @param n Signed value to encode.
/// @return The zigzag code of @p n (0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...).
[[nodiscard]] inline uint32_t zigzag_encode32(int32_t n) {
    // Shift the unsigned representation to stay clear of signed overflow.
    const uint32_t doubled = static_cast<uint32_t>(n) << 1;
    // All ones for negative inputs, all zeros otherwise.
    const uint32_t sign_mask = (n < 0) ? ~uint32_t{0} : uint32_t{0};
    return doubled ^ sign_mask;
}


/// @brief Zigzag-decode an unsigned 64-bit integer back to its signed representation.
///
/// @param v Zigzag code.
/// @return The signed value whose zigzag code is @p v.
[[nodiscard]] inline int64_t zigzag_decode(uint64_t v) {
    const uint64_t magnitude = v >> 1;
    // Unsigned negation of the low bit: all ones when the code is odd
    // (negative value), zero when it is even.
    const uint64_t sign_mask = uint64_t{0} - (v & uint64_t{1});
    return static_cast<int64_t>(magnitude ^ sign_mask);
}


/// @brief Zigzag-decode an unsigned 32-bit integer back to its signed representation.
///
/// @param v Zigzag code.
/// @return The signed value whose zigzag code is @p v.
[[nodiscard]] inline int32_t zigzag_decode32(uint32_t v) {
    const uint32_t magnitude = v >> 1;
    // Unsigned negation of the low bit: all ones for odd codes, zero for even.
    const uint32_t sign_mask = uint32_t{0} - (v & uint32_t{1});
    return static_cast<int32_t>(magnitude ^ sign_mask);
}


// ---------------------------------------------------------------------------

// Unsigned varint encoding/decoding (LEB128, same as Thrift compact protocol)

// ---------------------------------------------------------------------------


/// @brief Encode an unsigned varint (LEB128) into a byte buffer.
///
/// Bytes are appended to @p buf, least-significant 7-bit group first; every
/// byte except the last has its high bit set.
///
/// @param buf   Destination buffer; existing contents are preserved.
/// @param value Value to encode.
/// @return Number of bytes appended to @p buf.
inline size_t encode_uvarint(std::vector<uint8_t>& buf, uint64_t value) {
    const size_t size_before = buf.size();
    // Emit continuation bytes while more than 7 significant bits remain.
    for (; value > 0x7F; value >>= 7) {
        buf.push_back(static_cast<uint8_t>((value & 0x7F) | 0x80));
    }
    // Final byte: high bit clear marks the end of the varint.
    buf.push_back(static_cast<uint8_t>(value));
    return buf.size() - size_before;
}


/// @brief Decode an unsigned varint (LEB128) from a byte buffer.
///
/// Reads from @p data starting at index @p pos. On success @p pos is advanced
/// past the varint. On failure @p pos is left unchanged and 0 is returned;
/// because a well-formed varint always consumes at least one byte, callers can
/// detect failure by comparing @p pos before and after the call.
///
/// Failure cases:
///  - the buffer ends before a byte with a clear continuation bit is seen;
///  - the varint is longer than 10 bytes;
///  - the 10th byte carries payload bits that do not fit in 64 bits.
///
/// @param data Input buffer.
/// @param pos  In/out read position (index into @p data).
/// @param size Total number of readable bytes in @p data.
/// @return The decoded value, or 0 on failure.
[[nodiscard]] inline uint64_t decode_uvarint(const uint8_t* data, size_t& pos, size_t size) {
    // Nine bytes supply 63 bits, so the 10th byte lands at shift 63 and may
    // contribute at most one more bit.
    constexpr int kLastShift = 63;

    uint64_t result = 0;
    size_t cursor = pos; // committed to pos only when decoding succeeds

    for (int shift = 0; shift <= kLastShift && cursor < size; shift += 7) {
        const uint8_t byte = data[cursor++];
        const uint64_t payload = static_cast<uint64_t>(byte & 0x7F);

        if (shift == kLastShift && payload > 1) {
            break; // value does not fit in 64 bits
        }

        result |= payload << shift;
        if ((byte & 0x80) == 0) {
            pos = cursor;
            return result;
        }
    }

    // Truncated input, too many continuation bytes, or 64-bit overflow.
    return 0;
}


// ---------------------------------------------------------------------------

// Bit width computation

// ---------------------------------------------------------------------------


/// @brief Compute the minimum number of bits required to represent an unsigned value.
///
/// @param value Value to measure.
/// @return 0 for @p value == 0, otherwise the 1-based index of the highest set
///         bit (so 1 -> 1, 255 -> 8, 256 -> 9, UINT64_MAX -> 64).
[[nodiscard]] inline int bit_width_for(uint64_t value) {
    int bits = 0;
    // Shift right until nothing is left, counting one bit per step.
    for (uint64_t rest = value; rest != 0; rest >>= 1) {
        ++bits;
    }
    return bits;
}


// ---------------------------------------------------------------------------

// Bit-packing helpers for miniblocks

// ---------------------------------------------------------------------------


/// @brief Bit-pack an arbitrary number of values at a given bit width.
///
/// Appends ceil(count * bit_width / 8) bytes to @p out. Values are laid out
/// back to back, least-significant bit first; only the low @p bit_width bits
/// of each value are stored. A bit width of 0 appends nothing.
///
/// @param out       Destination buffer; existing contents are preserved.
/// @param values    Array of @p count values to pack.
/// @param count     Number of values.
/// @param bit_width Bits stored per value.
inline void bit_pack_values(std::vector<uint8_t>& out,
                            const uint64_t* values, size_t count,
                            int bit_width) {
    if (bit_width == 0) {
        return; // every value is zero: nothing to store
    }

    const size_t width = static_cast<size_t>(bit_width);
    const size_t base = out.size();
    // Zero-filled so that bits can simply be OR-ed into place.
    out.resize(base + (count * width + 7) / 8, 0);
    uint8_t* const packed = out.data() + base;

    size_t bit_pos = 0; // absolute bit position within the packed region
    for (size_t idx = 0; idx < count; ++idx) {
        uint64_t pending = values[idx];
        for (int left = bit_width; left > 0;) {
            const int in_byte = static_cast<int>(bit_pos & 7);
            // Never write past the end of the current byte.
            const int take = (std::min)(left, 8 - in_byte);
            const uint64_t low_bits = pending & ((uint64_t{1} << take) - 1);
            packed[bit_pos >> 3] |= static_cast<uint8_t>(low_bits << in_byte);

            pending >>= take;
            bit_pos += static_cast<size_t>(take);
            left -= take;
        }
    }
}


/// @brief Unpack an arbitrary number of values at a given bit width from packed data.
///
/// Inverse of the LSB-first layout produced by bit_pack_values. The caller
/// must ensure @p src holds at least ceil(count * bit_width / 8) readable
/// bytes; this function performs no bounds checking. A bit width of 0 reads
/// nothing and fills @p values with zeros.
///
/// @param src       Packed input bytes.
/// @param values    Output array with room for @p count values.
/// @param count     Number of values to unpack.
/// @param bit_width Bits stored per value.
inline void bit_unpack_values(const uint8_t* src,
                              uint64_t* values, size_t count,
                              int bit_width) {
    if (bit_width == 0) {
        std::fill(values, values + count, uint64_t{0});
        return;
    }

    // Shifting a 64-bit value by 64 is undefined, so a full-width mask is
    // special-cased.
    const uint64_t keep = (bit_width < 64) ? ((uint64_t{1} << bit_width) - 1)
                                           : ~uint64_t{0};

    size_t bit_pos = 0; // absolute bit position within src
    for (size_t idx = 0; idx < count; ++idx) {
        uint64_t assembled = 0;
        int filled = 0; // bits of the current value gathered so far
        while (filled < bit_width) {
            const int in_byte = static_cast<int>(bit_pos & 7);
            // Never read past the end of the current byte.
            const int take = (std::min)(bit_width - filled, 8 - in_byte);
            const uint64_t piece = (src[bit_pos >> 3] >> in_byte)
                                   & ((uint64_t{1} << take) - 1);
            assembled |= piece << filled;

            bit_pos += static_cast<size_t>(take);
            filled += take;
        }
        values[idx] = assembled & keep;
    }
}


// ===========================================================================

// Encoder — DELTA_BINARY_PACKED for int64

// ===========================================================================


[[nodiscard]] inline std::vector<uint8_t> encode_int64(const int64_t* values,

                                                        size_t count) {

    std::vector<uint8_t> out;


    if (count == 0) {

        // Header: block_size, miniblock_count, total_count=0, first_value=0

        encode_uvarint(out, DEFAULT_BLOCK_SIZE);

        encode_uvarint(out, DEFAULT_MINIBLOCK_COUNT);

        encode_uvarint(out, 0);

        encode_uvarint(out, zigzag_encode(0));

        return out;

    }


    // Reserve a reasonable estimate (header + ~1 byte per delta on average)

    out.reserve(32 + count);


    // Write header

    encode_uvarint(out, DEFAULT_BLOCK_SIZE);

    encode_uvarint(out, DEFAULT_MINIBLOCK_COUNT);

    encode_uvarint(out, count);

    encode_uvarint(out, zigzag_encode(values[0]));


    if (count == 1) {

        return out;

    }


    // Compute all deltas

    size_t num_deltas = count - 1;

    std::vector<int64_t> deltas(num_deltas);

    for (size_t i = 0; i < num_deltas; ++i) {

        // CWE-190: Integer Overflow — unsigned subtraction avoids signed overflow UB; C++ [expr.shift] §7.6.7

        deltas[i] = static_cast<int64_t>(static_cast<uint64_t>(values[i + 1]) - static_cast<uint64_t>(values[i]));

    }


    // Process deltas in blocks of DEFAULT_BLOCK_SIZE

    size_t delta_idx = 0;

    while (delta_idx < num_deltas) {

        // Determine how many deltas are in this block

        size_t block_remaining = (std::min)(DEFAULT_BLOCK_SIZE, num_deltas - delta_idx);


        // Pad the block to DEFAULT_BLOCK_SIZE with zeros for the last block

        std::vector<int64_t> block_deltas(DEFAULT_BLOCK_SIZE, 0);

        std::copy(deltas.begin() + static_cast<ptrdiff_t>(delta_idx),

                  deltas.begin() + static_cast<ptrdiff_t>(delta_idx + block_remaining),

                  block_deltas.begin());


        // Find min_delta across the actual (non-padded) values in this block

        int64_t min_delta = block_deltas[0];

        for (size_t i = 1; i < block_remaining; ++i) {

            if (block_deltas[i] < min_delta) {

                min_delta = block_deltas[i];

            }

        }


        // Write min_delta (zigzag-encoded)

        encode_uvarint(out, zigzag_encode(min_delta));


        // Compute (delta - min_delta) for all values in the block.

        // For padding positions beyond block_remaining, use (0 - min_delta)

        // if min_delta <= 0 (non-negative result), else just 0.

        std::vector<uint64_t> adjusted(DEFAULT_BLOCK_SIZE);

        for (size_t i = 0; i < block_remaining; ++i) {

            // Use unsigned arithmetic to avoid signed overflow UB

            adjusted[i] = static_cast<uint64_t>(block_deltas[i]) - static_cast<uint64_t>(min_delta);

        }

        // Pad positions: store 0 so padding doesn't inflate bit_width

        for (size_t i = block_remaining; i < DEFAULT_BLOCK_SIZE; ++i) {

            adjusted[i] = 0;

        }


        // Compute bit widths per miniblock

        uint8_t bit_widths[DEFAULT_MINIBLOCK_COUNT];

        for (size_t mb = 0; mb < DEFAULT_MINIBLOCK_COUNT; ++mb) {

            size_t mb_start = mb * VALUES_PER_MINIBLOCK;

            uint64_t max_val = 0;

            for (size_t j = 0; j < VALUES_PER_MINIBLOCK; ++j) {

                if (adjusted[mb_start + j] > max_val) {

                    max_val = adjusted[mb_start + j];

                }

            }

            bit_widths[mb] = static_cast<uint8_t>(bit_width_for(max_val));

        }


        // Write bit widths (one byte per miniblock)

        for (size_t mb = 0; mb < DEFAULT_MINIBLOCK_COUNT; ++mb) {

            out.push_back(bit_widths[mb]);

        }


        // Write miniblock data (bit-packed)

        for (size_t mb = 0; mb < DEFAULT_MINIBLOCK_COUNT; ++mb) {

            size_t mb_start = mb * VALUES_PER_MINIBLOCK;

            bit_pack_values(out, adjusted.data() + mb_start,

                            VALUES_PER_MINIBLOCK, bit_widths[mb]);

        }


        delta_idx += block_remaining;

    }


    return out;

}


/// @brief Encode int32 values using the DELTA_BINARY_PACKED algorithm.
///
/// The values are widened to int64 and handed to encode_int64, so the byte
/// stream is the same as encoding the equivalent int64 sequence.
///
/// @param values Array of @p count values.
/// @param count  Number of values to encode.
/// @return The encoded byte stream.
[[nodiscard]] inline std::vector<uint8_t> encode_int32(const int32_t* values,
                                                        size_t count) {
    // The iterator-range constructor performs the int32 -> int64 widening.
    const std::vector<int64_t> widened(values, values + count);
    return encode_int64(widened.data(), widened.size());
}


// ===========================================================================

// Decoder — DELTA_BINARY_PACKED for int64

// ===========================================================================


[[nodiscard]] inline std::vector<int64_t> decode_int64(const uint8_t* data,

                                                        size_t size,

                                                        size_t num_values) {

    std::vector<int64_t> result;

    if (num_values == 0 || size == 0) return result;

    if (num_values > 256 * 1024 * 1024) return result; // 256M value cap

    result.reserve(num_values);


    size_t pos = 0;


    // Read header

    uint64_t block_size      = decode_uvarint(data, pos, size);

    uint64_t miniblock_count = decode_uvarint(data, pos, size);

    uint64_t total_count     = decode_uvarint(data, pos, size);

    uint64_t first_value_zz  = decode_uvarint(data, pos, size);


    (void)total_count; // We use num_values from the caller


    // Validate block structure

    if (miniblock_count == 0 || block_size == 0) return result;

    // Parquet spec: block_size must be a multiple of 128; cap to prevent

    // absurd allocations from corrupted data (65536 values per block max).

    static constexpr uint64_t MAX_DELTA_BLOCK_SIZE = 65536;

    if (block_size > MAX_DELTA_BLOCK_SIZE) return result;

    static constexpr uint64_t MAX_MINIBLOCK_COUNT = 256;

    if (miniblock_count > MAX_MINIBLOCK_COUNT) return result;

    if (block_size % miniblock_count != 0) return result;

    size_t values_per_miniblock = static_cast<size_t>(block_size / miniblock_count);

    if (values_per_miniblock == 0) return result;


    // First value

    int64_t prev = zigzag_decode(first_value_zz);

    result.push_back(prev);


    if (num_values == 1) return result;


    // Decode blocks until we have enough values

    size_t values_remaining = num_values - 1; // first value already emitted


    while (values_remaining > 0 && pos < size) {

        // Read min_delta (zigzag-encoded)

        uint64_t min_delta_zz = decode_uvarint(data, pos, size);

        int64_t min_delta = zigzag_decode(min_delta_zz);


        // Read bit widths (one per miniblock)

        std::vector<uint8_t> bit_widths(static_cast<size_t>(miniblock_count));

        for (size_t mb = 0; mb < static_cast<size_t>(miniblock_count); ++mb) {

            if (pos < size) {

                bit_widths[mb] = data[pos++];

            } else {

                bit_widths[mb] = 0;

            }

        }


        // Decode each miniblock (buffer allocated once, reused across miniblocks)

        std::vector<uint64_t> unpacked(values_per_miniblock);

        for (size_t mb = 0; mb < static_cast<size_t>(miniblock_count); ++mb) {

            if (values_remaining == 0) break;


            int bw = bit_widths[mb];

            if (bw > 64) return result; // corrupt bit width — return partial


            if (bw == 0) {

                // All adjusted values are 0 => all deltas are min_delta

                for (size_t j = 0; j < values_per_miniblock; ++j) {

                    unpacked[j] = 0;

                }

            } else {

                // Calculate bytes needed for this miniblock

                size_t miniblock_bytes = (values_per_miniblock * static_cast<size_t>(bw) + 7) / 8;


                if (pos + miniblock_bytes > size) {

                    // Truncated data: decode what we can, pad rest with zeros

                    size_t avail = size - pos;

                    // Create a zero-padded copy

                    std::vector<uint8_t> padded(miniblock_bytes, 0);

                    std::memcpy(padded.data(), data + pos, avail);

                    bit_unpack_values(padded.data(), unpacked.data(),

                                      values_per_miniblock, bw);

                    pos = size;

                } else {

                    bit_unpack_values(data + pos, unpacked.data(),

                                      values_per_miniblock, bw);

                    pos += miniblock_bytes;

                }

            }


            // Convert adjusted values back to actual values

            size_t to_emit = (std::min)(values_per_miniblock, values_remaining);

            for (size_t j = 0; j < to_emit; ++j) {

                // Reconstruct delta using unsigned arithmetic to avoid overflow UB

                int64_t delta = static_cast<int64_t>(

                    unpacked[j] + static_cast<uint64_t>(min_delta));

#if defined(__GNUC__) || defined(__clang__)

                int64_t new_val;

                if (__builtin_add_overflow(prev, delta, &new_val)) {

                    return result; // overflow — return partial result

                }

                prev = new_val;

#else

                // Manual overflow check for MSVC: detect sign-change

                if ((delta > 0 && prev > (std::numeric_limits<int64_t>::max)() - delta) ||

                    (delta < 0 && prev < (std::numeric_limits<int64_t>::min)() - delta)) {

                    return result; // overflow — return partial result

                }

                prev += delta;

#endif

                result.push_back(prev);

            }

            values_remaining -= to_emit;

        }

    }


    return result;

}


/// @brief Decode DELTA_BINARY_PACKED data back to int32 values.
///
/// Decodes through decode_int64 and narrows each value. If any decoded value
/// lies outside the int32 range the whole result is discarded and an empty
/// vector is returned (CWE-681: no silent truncating conversion).
///
/// @param data       Encoded input bytes.
/// @param size       Number of readable bytes in @p data.
/// @param num_values Number of values the caller expects.
/// @return The decoded values, or an empty vector on a range violation.
[[nodiscard]] inline std::vector<int32_t> decode_int32(const uint8_t* data,
                                                        size_t size,
                                                        size_t num_values) {
    const std::vector<int64_t> decoded = decode_int64(data, size, num_values);

    constexpr int64_t lowest = (std::numeric_limits<int32_t>::min)();
    constexpr int64_t highest = (std::numeric_limits<int32_t>::max)();

    std::vector<int32_t> narrowed;
    narrowed.reserve(decoded.size());
    for (const int64_t value : decoded) {
        if (value < lowest || value > highest) {
            return {}; // out of int32 range: reject the whole page
        }
        narrowed.push_back(static_cast<int32_t>(value));
    }
    return narrowed;
}


} // namespace delta


} // namespace signet::forge

signet::forge::delta::encode_int32
std::vector< uint8_t > encode_int32(const int32_t *values, size_t count)
Encode int32 values using the DELTA_BINARY_PACKED algorithm.
Definition delta.hpp:408

signet::forge::delta::DEFAULT_MINIBLOCK_COUNT
constexpr size_t DEFAULT_MINIBLOCK_COUNT
Default number of miniblocks within each block.
Definition delta.hpp:61

signet::forge::delta::bit_width_for
int bit_width_for(uint64_t value)
Compute the minimum number of bits required to represent an unsigned value.
Definition delta.hpp:179

signet::forge::delta::bit_unpack_values
void bit_unpack_values(const uint8_t *src, uint64_t *values, size_t count, int bit_width)
Unpack an arbitrary number of values at a given bit width from packed data.
Definition delta.hpp:248

signet::forge::delta::encode_uvarint
size_t encode_uvarint(std::vector< uint8_t > &buf, uint64_t value)
Encode an unsigned varint (LEB128) into a byte buffer.
Definition delta.hpp:131

signet::forge::delta::decode_uvarint
uint64_t decode_uvarint(const uint8_t *data, size_t &pos, size_t size)
Decode an unsigned varint (LEB128) from a byte buffer.
Definition delta.hpp:152

signet::forge::delta::zigzag_encode32
uint32_t zigzag_encode32(int32_t n)
Zigzag-encode a signed 32-bit integer to an unsigned representation.
Definition delta.hpp:91

signet::forge::delta::zigzag_decode
int64_t zigzag_decode(uint64_t v)
Zigzag-decode an unsigned 64-bit integer back to its signed representation.
Definition delta.hpp:102

signet::forge::delta::encode_int64
std::vector< uint8_t > encode_int64(const int64_t *values, size_t count)
Encode int64 values using the DELTA_BINARY_PACKED algorithm.
Definition delta.hpp:298

signet::forge::delta::decode_int32
std::vector< int32_t > decode_int32(const uint8_t *data, size_t size, size_t num_values)
Decode DELTA_BINARY_PACKED data back to int32 values.
Definition delta.hpp:564

signet::forge::delta::DEFAULT_BLOCK_SIZE
constexpr size_t DEFAULT_BLOCK_SIZE
Default number of delta values per block (must be a multiple of 128).
Definition delta.hpp:58

signet::forge::delta::zigzag_decode32
int32_t zigzag_decode32(uint32_t v)
Zigzag-decode an unsigned 32-bit integer back to its signed representation.
Definition delta.hpp:113

signet::forge::delta::decode_int64
std::vector< int64_t > decode_int64(const uint8_t *data, size_t size, size_t num_values)
Decode DELTA_BINARY_PACKED data back to int64 values.
Definition delta.hpp:438

signet::forge::delta::VALUES_PER_MINIBLOCK
constexpr size_t VALUES_PER_MINIBLOCK
Number of delta values per miniblock (DEFAULT_BLOCK_SIZE / DEFAULT_MINIBLOCK_COUNT).
Definition delta.hpp:64

signet::forge::delta::bit_pack_values
void bit_pack_values(std::vector< uint8_t > &out, const uint64_t *values, size_t count, int bit_width)
Bit-pack an arbitrary number of values at a given bit width.
Definition delta.hpp:204

signet::forge::delta::zigzag_encode
uint64_t zigzag_encode(int64_t n)
Zigzag-encode a signed 64-bit integer to an unsigned representation.
Definition delta.hpp:79

signet::forge
Definition audit_chain.hpp:74