Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
delta.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3
13
14#pragma once
15
16// ---------------------------------------------------------------------------
17// delta.hpp -- DELTA_BINARY_PACKED encoding (Parquet encoding=5)
18//
19// Delta-encodes int32/int64 values for high compression on sorted or
20// monotonic sequences (timestamps, sequence IDs, etc.). Can achieve 90%+
21// compression on sorted time-series data.
22//
23// Wire format:
24//
25// 1. Header (all unsigned varints, first_value is zigzag-encoded):
26// - block_size: values per block (must be multiple of 128)
27// - miniblock_count: miniblocks per block
28// - total_value_count: total number of values encoded
29// - first_value: zigzag-encoded first value
30//
31// 2. Per block:
32// - min_delta: zigzag-encoded minimum delta in this block
33// - bit_widths: one byte per miniblock (bit width for that miniblock)
34// - miniblocks: each miniblock contains (block_size / miniblock_count)
35// values, bit-packed at the specified width. Stored values
36// are (delta - min_delta), always non-negative.
37//
38// Zigzag encoding maps signed integers to unsigned:
39// encode: (n << 1) ^ (n >> 63) for int64
40// decode: (v >> 1) ^ -(v & 1)
41// ---------------------------------------------------------------------------
42
43#include <algorithm>
44#include <cassert>
45#include <cstdint>
46#include <cstring>
47#include <limits>
48#include <vector>
49
50namespace signet::forge {
51namespace delta {
52
53// ---------------------------------------------------------------------------
54// Constants -- tuned for the Parquet default configuration
55// ---------------------------------------------------------------------------
56
58inline constexpr size_t DEFAULT_BLOCK_SIZE = 128;
59
61inline constexpr size_t DEFAULT_MINIBLOCK_COUNT = 4;
62
65
66// ---------------------------------------------------------------------------
67// Zigzag encoding/decoding
68// ---------------------------------------------------------------------------
69
79[[nodiscard]] inline uint64_t zigzag_encode(int64_t n) {
80 // Cast to unsigned before left shift to avoid signed overflow UB (CWE-190)
81 return (static_cast<uint64_t>(n) << 1) ^ static_cast<uint64_t>(n >> 63);
82}
83
91[[nodiscard]] inline uint32_t zigzag_encode32(int32_t n) {
92 return (static_cast<uint32_t>(n) << 1) ^ static_cast<uint32_t>(n >> 31);
93}
94
102[[nodiscard]] inline int64_t zigzag_decode(uint64_t v) {
103 return static_cast<int64_t>((v >> 1) ^ (~(v & 1) + 1));
104}
105
113[[nodiscard]] inline int32_t zigzag_decode32(uint32_t v) {
114 return static_cast<int32_t>((v >> 1) ^ (~(v & 1) + 1));
115}
116
117// ---------------------------------------------------------------------------
118// Unsigned varint encoding/decoding (LEB128, same as Thrift compact protocol)
119// ---------------------------------------------------------------------------
120
131inline size_t encode_uvarint(std::vector<uint8_t>& buf, uint64_t value) {
132 size_t start = buf.size();
133 while (value >= 0x80) {
134 buf.push_back(static_cast<uint8_t>(value & 0x7F) | 0x80);
135 value >>= 7;
136 }
137 buf.push_back(static_cast<uint8_t>(value));
138 return buf.size() - start;
139}
140
152[[nodiscard]] inline uint64_t decode_uvarint(const uint8_t* data, size_t& pos, size_t size) {
153 const size_t start_pos = pos;
154 uint64_t result = 0;
155 int shift = 0;
156 while (pos < size) {
157 uint8_t byte = data[pos++];
158 result |= static_cast<uint64_t>(byte & 0x7F) << shift;
159 if ((byte & 0x80) == 0) {
160 return result;
161 }
162 shift += 7;
163 if (shift >= 64) { pos = start_pos; break; } // overflow protection
164 }
165 pos = start_pos;
166 return result;
167}
168
169// ---------------------------------------------------------------------------
170// Bit width computation
171// ---------------------------------------------------------------------------
172
179[[nodiscard]] inline int bit_width_for(uint64_t value) {
180 if (value == 0) return 0;
181 int width = 0;
182 while (value > 0) {
183 value >>= 1;
184 ++width;
185 }
186 return width;
187}
188
189// ---------------------------------------------------------------------------
190// Bit-packing helpers for miniblocks
191// ---------------------------------------------------------------------------
192
204inline void bit_pack_values(std::vector<uint8_t>& out,
205 const uint64_t* values, size_t count,
206 int bit_width) {
207 if (bit_width == 0) return; // all values are zero, no bytes needed
208
209 // Total bytes = ceil(count * bit_width / 8)
210 size_t total_bits = count * static_cast<size_t>(bit_width);
211 size_t total_bytes = (total_bits + 7) / 8;
212
213 size_t start = out.size();
214 out.resize(start + total_bytes, 0);
215 uint8_t* dst = out.data() + start;
216
217 size_t bit_offset = 0;
218 for (size_t i = 0; i < count; ++i) {
219 uint64_t val = values[i];
220 int bits_remaining = bit_width;
221 size_t cur_bit = bit_offset;
222 while (bits_remaining > 0) {
223 size_t byte_idx = cur_bit / 8;
224 int bit_idx = static_cast<int>(cur_bit % 8);
225 int bits_to_write = (std::min)(bits_remaining, 8 - bit_idx);
226 uint8_t mask = static_cast<uint8_t>(
227 (val & ((uint64_t{1} << bits_to_write) - 1)) << bit_idx);
228 dst[byte_idx] |= mask;
229 val >>= bits_to_write;
230 cur_bit += static_cast<size_t>(bits_to_write);
231 bits_remaining -= bits_to_write;
232 }
233 bit_offset += static_cast<size_t>(bit_width);
234 }
235}
236
248inline void bit_unpack_values(const uint8_t* src,
249 uint64_t* values, size_t count,
250 int bit_width) {
251 if (bit_width == 0) {
252 for (size_t i = 0; i < count; ++i) values[i] = 0;
253 return;
254 }
255
256 uint64_t mask = (bit_width >= 64) ? ~uint64_t{0}
257 : (uint64_t{1} << bit_width) - 1;
258
259 size_t bit_offset = 0;
260 for (size_t i = 0; i < count; ++i) {
261 uint64_t val = 0;
262 int bits_remaining = bit_width;
263 size_t cur_bit = bit_offset;
264 int val_bit = 0;
265 while (bits_remaining > 0) {
266 size_t byte_idx = cur_bit / 8;
267 int bit_idx = static_cast<int>(cur_bit % 8);
268 int bits_avail = 8 - bit_idx;
269 int bits_to_read = (std::min)(bits_remaining, bits_avail);
270 uint64_t chunk = (src[byte_idx] >> bit_idx)
271 & ((uint64_t{1} << bits_to_read) - 1);
272 val |= chunk << val_bit;
273 cur_bit += static_cast<size_t>(bits_to_read);
274 val_bit += bits_to_read;
275 bits_remaining -= bits_to_read;
276 }
277 values[i] = val & mask;
278 bit_offset += static_cast<size_t>(bit_width);
279 }
280}
281
282// ===========================================================================
283// Encoder — DELTA_BINARY_PACKED for int64
284// ===========================================================================
285
298[[nodiscard]] inline std::vector<uint8_t> encode_int64(const int64_t* values,
299 size_t count) {
300 std::vector<uint8_t> out;
301
302 if (count == 0) {
303 // Header: block_size, miniblock_count, total_count=0, first_value=0
306 encode_uvarint(out, 0);
308 return out;
309 }
310
311 // Reserve a reasonable estimate (header + ~1 byte per delta on average)
312 out.reserve(32 + count);
313
314 // Write header
317 encode_uvarint(out, count);
318 encode_uvarint(out, zigzag_encode(values[0]));
319
320 if (count == 1) {
321 return out;
322 }
323
324 // Compute all deltas
325 size_t num_deltas = count - 1;
326 std::vector<int64_t> deltas(num_deltas);
327 for (size_t i = 0; i < num_deltas; ++i) {
328 // CWE-190: Integer Overflow — unsigned subtraction avoids signed overflow UB; C++ [expr.shift] §7.6.7
329 deltas[i] = static_cast<int64_t>(static_cast<uint64_t>(values[i + 1]) - static_cast<uint64_t>(values[i]));
330 }
331
332 // Process deltas in blocks of DEFAULT_BLOCK_SIZE
333 size_t delta_idx = 0;
334 while (delta_idx < num_deltas) {
335 // Determine how many deltas are in this block
336 size_t block_remaining = (std::min)(DEFAULT_BLOCK_SIZE, num_deltas - delta_idx);
337
338 // Pad the block to DEFAULT_BLOCK_SIZE with zeros for the last block
339 std::vector<int64_t> block_deltas(DEFAULT_BLOCK_SIZE, 0);
340 std::copy(deltas.begin() + static_cast<ptrdiff_t>(delta_idx),
341 deltas.begin() + static_cast<ptrdiff_t>(delta_idx + block_remaining),
342 block_deltas.begin());
343
344 // Find min_delta across the actual (non-padded) values in this block
345 int64_t min_delta = block_deltas[0];
346 for (size_t i = 1; i < block_remaining; ++i) {
347 if (block_deltas[i] < min_delta) {
348 min_delta = block_deltas[i];
349 }
350 }
351
352 // Write min_delta (zigzag-encoded)
353 encode_uvarint(out, zigzag_encode(min_delta));
354
355 // Compute (delta - min_delta) for all values in the block.
356 // For padding positions beyond block_remaining, use (0 - min_delta)
357 // if min_delta <= 0 (non-negative result), else just 0.
358 std::vector<uint64_t> adjusted(DEFAULT_BLOCK_SIZE);
359 for (size_t i = 0; i < block_remaining; ++i) {
360 // Use unsigned arithmetic to avoid signed overflow UB
361 adjusted[i] = static_cast<uint64_t>(block_deltas[i]) - static_cast<uint64_t>(min_delta);
362 }
363 // Pad positions: store 0 so padding doesn't inflate bit_width
364 for (size_t i = block_remaining; i < DEFAULT_BLOCK_SIZE; ++i) {
365 adjusted[i] = 0;
366 }
367
368 // Compute bit widths per miniblock
369 uint8_t bit_widths[DEFAULT_MINIBLOCK_COUNT];
370 for (size_t mb = 0; mb < DEFAULT_MINIBLOCK_COUNT; ++mb) {
371 size_t mb_start = mb * VALUES_PER_MINIBLOCK;
372 uint64_t max_val = 0;
373 for (size_t j = 0; j < VALUES_PER_MINIBLOCK; ++j) {
374 if (adjusted[mb_start + j] > max_val) {
375 max_val = adjusted[mb_start + j];
376 }
377 }
378 bit_widths[mb] = static_cast<uint8_t>(bit_width_for(max_val));
379 }
380
381 // Write bit widths (one byte per miniblock)
382 for (size_t mb = 0; mb < DEFAULT_MINIBLOCK_COUNT; ++mb) {
383 out.push_back(bit_widths[mb]);
384 }
385
386 // Write miniblock data (bit-packed)
387 for (size_t mb = 0; mb < DEFAULT_MINIBLOCK_COUNT; ++mb) {
388 size_t mb_start = mb * VALUES_PER_MINIBLOCK;
389 bit_pack_values(out, adjusted.data() + mb_start,
390 VALUES_PER_MINIBLOCK, bit_widths[mb]);
391 }
392
393 delta_idx += block_remaining;
394 }
395
396 return out;
397}
398
408[[nodiscard]] inline std::vector<uint8_t> encode_int32(const int32_t* values,
409 size_t count) {
410 // Widen int32 to int64 and use the same encoding
411 std::vector<int64_t> wide(count);
412 for (size_t i = 0; i < count; ++i) {
413 wide[i] = static_cast<int64_t>(values[i]);
414 }
415 return encode_int64(wide.data(), count);
416}
417
418// ===========================================================================
419// Decoder — DELTA_BINARY_PACKED for int64
420// ===========================================================================
421
438[[nodiscard]] inline std::vector<int64_t> decode_int64(const uint8_t* data,
439 size_t size,
440 size_t num_values) {
441 std::vector<int64_t> result;
442 if (num_values == 0 || size == 0) return result;
443 if (num_values > 256 * 1024 * 1024) return result; // 256M value cap
444 result.reserve(num_values);
445
446 size_t pos = 0;
447
448 // Read header
449 uint64_t block_size = decode_uvarint(data, pos, size);
450 uint64_t miniblock_count = decode_uvarint(data, pos, size);
451 uint64_t total_count = decode_uvarint(data, pos, size);
452 uint64_t first_value_zz = decode_uvarint(data, pos, size);
453
454 (void)total_count; // We use num_values from the caller
455
456 // Validate block structure
457 if (miniblock_count == 0 || block_size == 0) return result;
458 // Parquet spec: block_size must be a multiple of 128; cap to prevent
459 // absurd allocations from corrupted data (65536 values per block max).
460 static constexpr uint64_t MAX_DELTA_BLOCK_SIZE = 65536;
461 if (block_size > MAX_DELTA_BLOCK_SIZE) return result;
462 static constexpr uint64_t MAX_MINIBLOCK_COUNT = 256;
463 if (miniblock_count > MAX_MINIBLOCK_COUNT) return result;
464 if (block_size % miniblock_count != 0) return result;
465 size_t values_per_miniblock = static_cast<size_t>(block_size / miniblock_count);
466 if (values_per_miniblock == 0) return result;
467
468 // First value
469 int64_t prev = zigzag_decode(first_value_zz);
470 result.push_back(prev);
471
472 if (num_values == 1) return result;
473
474 // Decode blocks until we have enough values
475 size_t values_remaining = num_values - 1; // first value already emitted
476
477 while (values_remaining > 0 && pos < size) {
478 // Read min_delta (zigzag-encoded)
479 uint64_t min_delta_zz = decode_uvarint(data, pos, size);
480 int64_t min_delta = zigzag_decode(min_delta_zz);
481
482 // Read bit widths (one per miniblock)
483 std::vector<uint8_t> bit_widths(static_cast<size_t>(miniblock_count));
484 for (size_t mb = 0; mb < static_cast<size_t>(miniblock_count); ++mb) {
485 if (pos < size) {
486 bit_widths[mb] = data[pos++];
487 } else {
488 bit_widths[mb] = 0;
489 }
490 }
491
492 // Decode each miniblock (buffer allocated once, reused across miniblocks)
493 std::vector<uint64_t> unpacked(values_per_miniblock);
494 for (size_t mb = 0; mb < static_cast<size_t>(miniblock_count); ++mb) {
495 if (values_remaining == 0) break;
496
497 int bw = bit_widths[mb];
498 if (bw > 64) return result; // corrupt bit width — return partial
499
500 if (bw == 0) {
501 // All adjusted values are 0 => all deltas are min_delta
502 for (size_t j = 0; j < values_per_miniblock; ++j) {
503 unpacked[j] = 0;
504 }
505 } else {
506 // Calculate bytes needed for this miniblock
507 size_t miniblock_bytes = (values_per_miniblock * static_cast<size_t>(bw) + 7) / 8;
508
509 if (pos + miniblock_bytes > size) {
510 // Truncated data: decode what we can, pad rest with zeros
511 size_t avail = size - pos;
512 // Create a zero-padded copy
513 std::vector<uint8_t> padded(miniblock_bytes, 0);
514 std::memcpy(padded.data(), data + pos, avail);
515 bit_unpack_values(padded.data(), unpacked.data(),
516 values_per_miniblock, bw);
517 pos = size;
518 } else {
519 bit_unpack_values(data + pos, unpacked.data(),
520 values_per_miniblock, bw);
521 pos += miniblock_bytes;
522 }
523 }
524
525 // Convert adjusted values back to actual values
526 size_t to_emit = (std::min)(values_per_miniblock, values_remaining);
527 for (size_t j = 0; j < to_emit; ++j) {
528 // Reconstruct delta using unsigned arithmetic to avoid overflow UB
529 int64_t delta = static_cast<int64_t>(
530 unpacked[j] + static_cast<uint64_t>(min_delta));
531#if defined(__GNUC__) || defined(__clang__)
532 int64_t new_val;
533 if (__builtin_add_overflow(prev, delta, &new_val)) {
534 return result; // overflow — return partial result
535 }
536 prev = new_val;
537#else
538 // Manual overflow check for MSVC: detect sign-change
539 if ((delta > 0 && prev > (std::numeric_limits<int64_t>::max)() - delta) ||
540 (delta < 0 && prev < (std::numeric_limits<int64_t>::min)() - delta)) {
541 return result; // overflow — return partial result
542 }
543 prev += delta;
544#endif
545 result.push_back(prev);
546 }
547 values_remaining -= to_emit;
548 }
549 }
550
551 return result;
552}
553
564[[nodiscard]] inline std::vector<int32_t> decode_int32(const uint8_t* data,
565 size_t size,
566 size_t num_values) {
567 auto wide = decode_int64(data, size, num_values);
568 std::vector<int32_t> result(wide.size());
569 for (size_t i = 0; i < wide.size(); ++i) {
570 // CWE-681: Incorrect Conversion between Numeric Types — range check before int64→int32 narrowing
571 if (wide[i] < (std::numeric_limits<int32_t>::min)() || wide[i] > (std::numeric_limits<int32_t>::max)()) return {};
572 result[i] = static_cast<int32_t>(wide[i]);
573 }
574 return result;
575}
576
577} // namespace delta
578} // namespace signet::forge
std::vector< uint8_t > encode_int32(const int32_t *values, size_t count)
Encode int32 values using the DELTA_BINARY_PACKED algorithm.
Definition delta.hpp:408
constexpr size_t DEFAULT_MINIBLOCK_COUNT
Default number of miniblocks within each block.
Definition delta.hpp:61
int bit_width_for(uint64_t value)
Compute the minimum number of bits required to represent an unsigned value.
Definition delta.hpp:179
void bit_unpack_values(const uint8_t *src, uint64_t *values, size_t count, int bit_width)
Unpack an arbitrary number of values at a given bit width from packed data.
Definition delta.hpp:248
size_t encode_uvarint(std::vector< uint8_t > &buf, uint64_t value)
Encode an unsigned varint (LEB128) into a byte buffer.
Definition delta.hpp:131
uint64_t decode_uvarint(const uint8_t *data, size_t &pos, size_t size)
Decode an unsigned varint (LEB128) from a byte buffer.
Definition delta.hpp:152
uint32_t zigzag_encode32(int32_t n)
Zigzag-encode a signed 32-bit integer to an unsigned representation.
Definition delta.hpp:91
int64_t zigzag_decode(uint64_t v)
Zigzag-decode an unsigned 64-bit integer back to its signed representation.
Definition delta.hpp:102
std::vector< uint8_t > encode_int64(const int64_t *values, size_t count)
Encode int64 values using the DELTA_BINARY_PACKED algorithm.
Definition delta.hpp:298
std::vector< int32_t > decode_int32(const uint8_t *data, size_t size, size_t num_values)
Decode DELTA_BINARY_PACKED data back to int32 values.
Definition delta.hpp:564
constexpr size_t DEFAULT_BLOCK_SIZE
Default number of delta values per block (must be a multiple of 128).
Definition delta.hpp:58
int32_t zigzag_decode32(uint32_t v)
Zigzag-decode an unsigned 32-bit integer back to its signed representation.
Definition delta.hpp:113
std::vector< int64_t > decode_int64(const uint8_t *data, size_t size, size_t num_values)
Decode DELTA_BINARY_PACKED data back to int64 values.
Definition delta.hpp:438
constexpr size_t VALUES_PER_MINIBLOCK
Number of delta values per miniblock (DEFAULT_BLOCK_SIZE / DEFAULT_MINIBLOCK_COUNT).
Definition delta.hpp:64
void bit_pack_values(std::vector< uint8_t > &out, const uint64_t *values, size_t count, int bit_width)
Bit-pack an arbitrary number of values at a given bit width.
Definition delta.hpp:204
uint64_t zigzag_encode(int64_t n)
Zigzag-encode a signed 64-bit integer to an unsigned representation.
Definition delta.hpp:79