Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
byte_stream_split.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3
14
15#pragma once
16
17// ---------------------------------------------------------------------------
18// byte_stream_split.hpp -- BYTE_STREAM_SPLIT encoding (Parquet encoding=9)
19//
20// Splits IEEE 754 float/double values by byte position to group similar
21// exponent and mantissa bits together. This dramatically improves
22// compression ratios with ZSTD/Snappy for financial data (prices, rates,
23// quantities) where successive values share exponent bytes.
24//
25// Layout for N float values (4 bytes each):
26// [byte0 of val0][byte0 of val1]...[byte0 of valN-1] (N bytes)
27// [byte1 of val0][byte1 of val1]...[byte1 of valN-1] (N bytes)
28// [byte2 of val0][byte2 of val1]...[byte2 of valN-1] (N bytes)
29// [byte3 of val0][byte3 of val1]...[byte3 of valN-1] (N bytes)
30// Total: 4*N bytes (same as input, just rearranged)
31//
32// Layout for N double values (8 bytes each):
33// Same pattern but 8 byte streams instead of 4.
34// Total: 8*N bytes.
35//
36// Decoding reverses the process: re-interleave the byte streams back into native byte order.
37// ---------------------------------------------------------------------------
38
39#include <bit>
40#include <cstdint>
41#include <cstring>
42#include <vector>
43
44namespace signet::forge {
45
// The encode/decode routines below copy each value's bytes in native memory
// order, so compilation is rejected on any target that is not little-endian.
46static_assert(std::endian::native == std::endian::little,
47 "Byte Stream Split encoding requires little-endian platform");
48
50namespace byte_stream_split {
51
52// ===========================================================================
53// Encode
54// ===========================================================================
55
/// @brief Encode float values using the BYTE_STREAM_SPLIT algorithm.
///
/// Byte b of value i (in native memory order) is written to
/// output[b * count + i], producing four consecutive streams of `count`
/// bytes each.
///
/// @param values Pointer to `count` contiguous floats. May be null only when
///               `count` is 0.
/// @param count  Number of values to encode.
/// @return A buffer of exactly `count * 4` bytes. An empty vector is returned
///         when `count` is 0, when `values` is null, or when `count * 4`
///         would overflow `size_t`.
[[nodiscard]] inline std::vector<uint8_t> encode_float(const float* values,
                                                       size_t count) {
    constexpr size_t WIDTH = sizeof(float);  // 4

    // Nothing to encode. The null check also stops a null input with a
    // non-zero count from being dereferenced in the copy loop below.
    if (count == 0 || values == nullptr) return {};

    // CWE-190: Integer Overflow — prevent count * WIDTH wraparound
    if (count > SIZE_MAX / WIDTH) return {};

    std::vector<uint8_t> out(count * WIDTH);

    // View the float array as raw bytes (byte-typed access may alias any object).
    const auto* src = reinterpret_cast<const uint8_t*>(values);

    // For each byte position b in [0, WIDTH), gather byte b of every value
    // into the stream that starts at offset b * count.
    for (size_t b = 0; b < WIDTH; ++b) {
        uint8_t* dst = out.data() + b * count;
        for (size_t i = 0; i < count; ++i) {
            dst[i] = src[i * WIDTH + b];
        }
    }

    return out;
}
89
/// @brief Encode double values using the BYTE_STREAM_SPLIT algorithm.
///
/// Byte b of value i (in native memory order) is written to
/// output[b * count + i], producing eight consecutive streams of `count`
/// bytes each.
///
/// @param values Pointer to `count` contiguous doubles. May be null only when
///               `count` is 0.
/// @param count  Number of values to encode.
/// @return A buffer of exactly `count * 8` bytes. An empty vector is returned
///         when `count` is 0, when `values` is null, or when `count * 8`
///         would overflow `size_t`.
[[nodiscard]] inline std::vector<uint8_t> encode_double(const double* values,
                                                        size_t count) {
    constexpr size_t WIDTH = sizeof(double);  // 8

    // Nothing to encode. The null check also stops a null input with a
    // non-zero count from being dereferenced in the copy loop below.
    if (count == 0 || values == nullptr) return {};

    // CWE-190: Integer Overflow — prevent count * WIDTH wraparound
    if (count > SIZE_MAX / WIDTH) return {};

    std::vector<uint8_t> out(count * WIDTH);

    // View the double array as raw bytes (byte-typed access may alias any object).
    const auto* src = reinterpret_cast<const uint8_t*>(values);

    // For each byte position b in [0, WIDTH), gather byte b of every value
    // into the stream that starts at offset b * count.
    for (size_t b = 0; b < WIDTH; ++b) {
        uint8_t* dst = out.data() + b * count;
        for (size_t i = 0; i < count; ++i) {
            dst[i] = src[i * WIDTH + b];
        }
    }

    return out;
}
120
121// ===========================================================================
122// Decode
123// ===========================================================================
124
/// @brief Decode float values from BYTE_STREAM_SPLIT encoding.
///
/// Reverses the split: byte b of value i is read from data[b * count + i].
/// The stream stride is `count`, so only the first `count * 4` bytes of
/// `data` are consumed even when `size` is larger.
///
/// @param data  Pointer to the encoded bytes. May be null only when `count`
///              is 0.
/// @param size  Number of readable bytes at `data`.
/// @param count Number of float values to reconstruct.
/// @return `count` decoded floats. An empty vector is returned when `count`
///         is 0, when `data` is null, when `count * 4` would overflow
///         `size_t`, or when `size` is smaller than `count * 4`.
[[nodiscard]] inline std::vector<float> decode_float(const uint8_t* data,
                                                     size_t size,
                                                     size_t count) {
    constexpr size_t WIDTH = sizeof(float);  // 4

    // Reject overflow of count * WIDTH first, then a buffer that is too short.
    if (count > SIZE_MAX / WIDTH || count * WIDTH > size) return {};

    // Nothing to decode. The null check also stops a null buffer with a
    // non-zero count from being read in the copy loop below.
    if (count == 0 || data == nullptr) return {};

    std::vector<float> out(count);
    auto* dst = reinterpret_cast<uint8_t*>(out.data());

    // For each byte position b, scatter stream b back into byte b of
    // every output value.
    for (size_t b = 0; b < WIDTH; ++b) {
        const uint8_t* src = data + b * count;
        for (size_t i = 0; i < count; ++i) {
            dst[i * WIDTH + b] = src[i];
        }
    }

    return out;
}
159
/// @brief Decode double values from BYTE_STREAM_SPLIT encoding.
///
/// Reverses the split: byte b of value i is read from data[b * count + i].
/// The stream stride is `count`, so only the first `count * 8` bytes of
/// `data` are consumed even when `size` is larger.
///
/// @param data  Pointer to the encoded bytes. May be null only when `count`
///              is 0.
/// @param size  Number of readable bytes at `data`.
/// @param count Number of double values to reconstruct.
/// @return `count` decoded doubles. An empty vector is returned when `count`
///         is 0, when `data` is null, when `count * 8` would overflow
///         `size_t`, or when `size` is smaller than `count * 8`.
[[nodiscard]] inline std::vector<double> decode_double(const uint8_t* data,
                                                       size_t size,
                                                       size_t count) {
    constexpr size_t WIDTH = sizeof(double);  // 8

    // Reject overflow of count * WIDTH first, then a buffer that is too short.
    if (count > SIZE_MAX / WIDTH || count * WIDTH > size) return {};

    // Nothing to decode. The null check also stops a null buffer with a
    // non-zero count from being read in the copy loop below.
    if (count == 0 || data == nullptr) return {};

    std::vector<double> out(count);
    auto* dst = reinterpret_cast<uint8_t*>(out.data());

    // For each byte position b, scatter stream b back into byte b of
    // every output value.
    for (size_t b = 0; b < WIDTH; ++b) {
        const uint8_t* src = data + b * count;
        for (size_t i = 0; i < count; ++i) {
            dst[i * WIDTH + b] = src[i];
        }
    }

    return out;
}
192
193} // namespace byte_stream_split
194} // namespace signet::forge
std::vector< uint8_t > encode_float(const float *values, size_t count)
Encode float values using the BYTE_STREAM_SPLIT algorithm.
std::vector< uint8_t > encode_double(const double *values, size_t count)
Encode double values using the BYTE_STREAM_SPLIT algorithm.
std::vector< float > decode_float(const uint8_t *data, size_t size, size_t count)
Decode float values from BYTE_STREAM_SPLIT encoding.
std::vector< double > decode_double(const uint8_t *data, size_t size, size_t count)
Decode double values from BYTE_STREAM_SPLIT encoding.