SIGNET_FORGE/byte__stream__split_8hpp_source.html

// SPDX-License-Identifier: AGPL-3.0-or-later

// Copyright 2026 Johnson Ogundeji


#pragma once


// ---------------------------------------------------------------------------

// byte_stream_split.hpp -- BYTE_STREAM_SPLIT encoding (Parquet encoding=9)

//

// Splits IEEE 754 float/double values by byte position to group similar

// exponent and mantissa bits together. This dramatically improves

// compression ratios with ZSTD/Snappy for financial data (prices, rates,

// quantities) where successive values share exponent bytes.

//

// Layout for N float values (4 bytes each):

//   [byte0 of val0][byte0 of val1]...[byte0 of valN-1]   (N bytes)

//   [byte1 of val0][byte1 of val1]...[byte1 of valN-1]   (N bytes)

//   [byte2 of val0][byte2 of val1]...[byte2 of valN-1]   (N bytes)

//   [byte3 of val0][byte3 of val1]...[byte3 of valN-1]   (N bytes)

//   Total: 4*N bytes (same as input, just rearranged)

//

// Layout for N double values (8 bytes each):

//   Same pattern but 8 byte streams instead of 4.

//   Total: 8*N bytes.

//

// Decoding reverses the process: the byte streams are interleaved back into
// whole values in native byte order.

// ---------------------------------------------------------------------------


#include <bit>

#include <cstdint>

#include <cstring>

#include <vector>


namespace signet::forge {


static_assert(std::endian::native == std::endian::little,

              "Byte Stream Split encoding requires little-endian platform");


namespace byte_stream_split {


// ===========================================================================

// Encode

// ===========================================================================


/// @brief Encode float values using the BYTE_STREAM_SPLIT algorithm.
///
/// Byte `b` of every input value is gathered into stream `b`, and the streams
/// are stored back to back: `out[b * count + i]` is byte `b` (in memory order)
/// of `values[i]`.
///
/// @param values Pointer to at least `count` contiguous floats.
/// @param count  Number of values to encode.
/// @return A buffer of `count * sizeof(float)` bytes. An empty vector is
///         returned when `count` is 0, when `values` is null, or when
///         `count * sizeof(float)` would overflow `size_t`.
[[nodiscard]] inline std::vector<uint8_t> encode_float(const float* values,
                                                        size_t count) {
    constexpr size_t WIDTH = sizeof(float); // 4

    // A null input cannot be read; reject it the same way as an oversized count
    // instead of dereferencing it below.
    if (values == nullptr) return {};

    if (count > SIZE_MAX / WIDTH) return {}; // CWE-190: Integer Overflow — prevent count * WIDTH wraparound

    std::vector<uint8_t> out(count * WIDTH);
    if (count == 0) return out;

    // View the float array as raw bytes.
    const auto* src = reinterpret_cast<const uint8_t*>(values);

    // For each byte position b in [0, WIDTH), copy byte b of every value
    // into the output stream that starts at offset b * count.
    for (size_t b = 0; b < WIDTH; ++b) {
        uint8_t* dst = out.data() + b * count;
        for (size_t i = 0; i < count; ++i) {
            dst[i] = src[i * WIDTH + b];
        }
    }

    return out;
}


/// @brief Encode double values using the BYTE_STREAM_SPLIT algorithm.
///
/// Byte `b` of every input value is gathered into stream `b`, and the streams
/// are stored back to back: `out[b * count + i]` is byte `b` (in memory order)
/// of `values[i]`.
///
/// @param values Pointer to at least `count` contiguous doubles.
/// @param count  Number of values to encode.
/// @return A buffer of `count * sizeof(double)` bytes. An empty vector is
///         returned when `count` is 0, when `values` is null, or when
///         `count * sizeof(double)` would overflow `size_t`.
[[nodiscard]] inline std::vector<uint8_t> encode_double(const double* values,
                                                         size_t count) {
    constexpr size_t WIDTH = sizeof(double); // 8

    // A null input cannot be read; reject it the same way as an oversized count
    // instead of dereferencing it below.
    if (values == nullptr) return {};

    if (count > SIZE_MAX / WIDTH) return {}; // CWE-190: Integer Overflow — prevent count * WIDTH wraparound

    std::vector<uint8_t> out(count * WIDTH);
    if (count == 0) return out;

    // View the double array as raw bytes.
    const auto* src = reinterpret_cast<const uint8_t*>(values);

    // For each byte position b in [0, WIDTH), copy byte b of every value
    // into the output stream that starts at offset b * count.
    for (size_t b = 0; b < WIDTH; ++b) {
        uint8_t* dst = out.data() + b * count;
        for (size_t i = 0; i < count; ++i) {
            dst[i] = src[i * WIDTH + b];
        }
    }

    return out;
}


// ===========================================================================

// Decode

// ===========================================================================


/// @brief Decode float values from BYTE_STREAM_SPLIT encoding.
///
/// Stream `b` is read starting at `data + b * count`; byte `i` of that stream
/// becomes byte `b` (in memory order) of output value `i`.
///
/// @param data  Pointer to the encoded bytes.
/// @param size  Number of readable bytes at `data`.
/// @param count Number of float values to reconstruct.
/// @return `count` decoded floats. An empty vector is returned when `count`
///         is 0, when `data` is null, when `count * sizeof(float)` would
///         overflow `size_t`, or when `size` is smaller than
///         `count * sizeof(float)`.
[[nodiscard]] inline std::vector<float> decode_float(const uint8_t* data,
                                                      size_t size,
                                                      size_t count) {
    constexpr size_t WIDTH = sizeof(float); // 4

    // A null buffer cannot be read regardless of the size the caller claims.
    if (data == nullptr) return {};

    // Reject counts whose byte length overflows or exceeds the supplied buffer.
    if (count > SIZE_MAX / WIDTH || count * WIDTH > size) return {};

    std::vector<float> out(count);
    if (count == 0) return out;

    // Write the result through a byte view of the output floats.
    auto* dst = reinterpret_cast<uint8_t*>(out.data());

    // Reverse the split: for each byte position b, read from data[b*count+i]
    // and write to dst[i*WIDTH+b].
    for (size_t b = 0; b < WIDTH; ++b) {
        const uint8_t* src = data + b * count;
        for (size_t i = 0; i < count; ++i) {
            dst[i * WIDTH + b] = src[i];
        }
    }

    return out;
}


/// @brief Decode double values from BYTE_STREAM_SPLIT encoding.
///
/// Stream `b` is read starting at `data + b * count`; byte `i` of that stream
/// becomes byte `b` (in memory order) of output value `i`.
///
/// @param data  Pointer to the encoded bytes.
/// @param size  Number of readable bytes at `data`.
/// @param count Number of double values to reconstruct.
/// @return `count` decoded doubles. An empty vector is returned when `count`
///         is 0, when `data` is null, when `count * sizeof(double)` would
///         overflow `size_t`, or when `size` is smaller than
///         `count * sizeof(double)`.
[[nodiscard]] inline std::vector<double> decode_double(const uint8_t* data,
                                                        size_t size,
                                                        size_t count) {
    constexpr size_t WIDTH = sizeof(double); // 8

    // A null buffer cannot be read regardless of the size the caller claims.
    if (data == nullptr) return {};

    // Reject counts whose byte length overflows or exceeds the supplied buffer.
    if (count > SIZE_MAX / WIDTH || count * WIDTH > size) return {};

    std::vector<double> out(count);
    if (count == 0) return out;

    // Write the result through a byte view of the output doubles.
    auto* dst = reinterpret_cast<uint8_t*>(out.data());

    // Reverse the split: for each byte position b, read from data[b*count+i]
    // and write to dst[i*WIDTH+b].
    for (size_t b = 0; b < WIDTH; ++b) {
        const uint8_t* src = data + b * count;
        for (size_t i = 0; i < count; ++i) {
            dst[i * WIDTH + b] = src[i];
        }
    }

    return out;
}


} // namespace byte_stream_split


} // namespace signet::forge

signet::forge::byte_stream_split::encode_float
std::vector< uint8_t > encode_float(const float *values, size_t count)
Encode float values using the BYTE_STREAM_SPLIT algorithm.
Definition byte_stream_split.hpp:67

signet::forge::byte_stream_split::encode_double
std::vector< uint8_t > encode_double(const double *values, size_t count)
Encode double values using the BYTE_STREAM_SPLIT algorithm.
Definition byte_stream_split.hpp:101

signet::forge::byte_stream_split::decode_float
std::vector< float > decode_float(const uint8_t *data, size_t size, size_t count)
Decode float values from BYTE_STREAM_SPLIT encoding.
Definition byte_stream_split.hpp:136

signet::forge::byte_stream_split::decode_double
std::vector< double > decode_double(const uint8_t *data, size_t size, size_t count)
Decode double values from BYTE_STREAM_SPLIT encoding.
Definition byte_stream_split.hpp:171

signet::forge
Definition audit_chain.hpp:74