Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
compact.hpp
Go to the documentation of this file.
// SPDX-License-Identifier: AGPL-3.0-or-later
// Copyright 2026 Johnson Ogundeji
#pragma once

#include <cstdint>
#include <cstring>
#include <limits>
#include <optional>
#include <stack>
#include <stdexcept>
#include <string>
#include <vector>

#include "signet/error.hpp"

namespace signet::forge::thrift {

/// Wire type identifiers used by the Thrift Compact Protocol.
///
/// The value is stored in the low nibble of a field header byte and, for
/// lists and sets, in the low nibble of the collection header byte.
namespace compact_type {
    inline constexpr uint8_t STOP       = 0;   ///< Struct stop marker.
    inline constexpr uint8_t BOOL_TRUE  = 1;   ///< Boolean true (embedded in field header).
    inline constexpr uint8_t BOOL_FALSE = 2;   ///< Boolean false (embedded in field header).
    inline constexpr uint8_t I8         = 3;   ///< 8-bit signed integer.
    inline constexpr uint8_t I16        = 4;   ///< 16-bit signed integer (zigzag + varint).
    inline constexpr uint8_t I32        = 5;   ///< 32-bit signed integer (zigzag + varint).
    inline constexpr uint8_t I64        = 6;   ///< 64-bit signed integer (zigzag + varint).
    inline constexpr uint8_t DOUBLE     = 7;   ///< IEEE 754 double (8 bytes LE).
    inline constexpr uint8_t BINARY     = 8;   ///< Length-prefixed bytes (also used for STRING).
    inline constexpr uint8_t LIST       = 9;   ///< List container.
    inline constexpr uint8_t SET        = 10;  ///< Set container.
    inline constexpr uint8_t MAP        = 11;  ///< Map container.
    inline constexpr uint8_t STRUCT     = 12;  ///< Nested struct.
} // namespace compact_type
41
/// Decoded field header from the Thrift Compact Protocol.
struct FieldHeader {
    int16_t field_id;     ///< Thrift field identifier (from the schema).
    uint8_t thrift_type;  ///< Wire type (one of the compact_type constants).

    /// Check if this is the STOP marker (end of struct): both members are zero.
    [[nodiscard]] bool is_stop() const { return field_id == 0 && thrift_type == 0; }
};
53
/// Decoded list/set header from the Thrift Compact Protocol.
struct ListHeader {
    uint8_t elem_type;  ///< Wire type of each element (compact_type constant).
    int32_t size;       ///< Number of elements in the list/set.
};
62
73public:
75 static constexpr size_t MAX_STRING_BYTES = 64u * 1024u * 1024u;
76
78 CompactEncoder() { last_field_ids_.push(0); }
79
80 // -- field / struct helpers ------------------------------------------------
81
85 void write_field(int16_t field_id, uint8_t thrift_type) {
86 int16_t delta = field_id - last_field_ids_.top();
87 if (delta > 0 && delta <= 15) {
88 buf_.push_back(static_cast<uint8_t>((delta << 4) | thrift_type));
89 } else {
90 buf_.push_back(thrift_type);
91 write_zigzag_i16(field_id);
92 }
93 last_field_ids_.top() = field_id;
94 }
95
97 void write_stop() { buf_.push_back(0x00); }
98
100 void begin_struct() { last_field_ids_.push(0); }
101
103 void end_struct() { last_field_ids_.pop(); }
104
105 // -- primitive writers -----------------------------------------------------
106
108 void write_bool(bool val) {
109 buf_.push_back(val ? 0x01 : 0x00);
110 }
111
113 void write_i8(int8_t val) {
114 buf_.push_back(static_cast<uint8_t>(val));
115 }
116
120 void write_field_bool(int16_t field_id, bool val) {
121 uint8_t thrift_type = val ? compact_type::BOOL_TRUE
123 int16_t delta = field_id - last_field_ids_.top();
124 if (delta > 0 && delta <= 15) {
125 buf_.push_back(static_cast<uint8_t>((delta << 4) | thrift_type));
126 } else {
127 buf_.push_back(thrift_type);
128 write_zigzag_i16(field_id);
129 }
130 last_field_ids_.top() = field_id;
131 }
132
134 void write_i32(int32_t val) {
135 write_varint32(zigzag_encode_i32(val));
136 }
137
139 void write_i64(int64_t val) {
140 write_varint64(zigzag_encode_i64(val));
141 }
142
144 void write_double(double val) {
145 uint64_t bits;
146 std::memcpy(&bits, &val, 8);
147 for (int i = 0; i < 8; ++i)
148 buf_.push_back(static_cast<uint8_t>((bits >> (i * 8)) & 0xFF));
149 }
150
154 void write_float(float val) {
155 uint32_t bits;
156 std::memcpy(&bits, &val, 4);
157 for (int i = 0; i < 4; ++i)
158 buf_.push_back(static_cast<uint8_t>((bits >> (i * 8)) & 0xFF));
159 }
160
163 void write_string(const std::string& val) {
164 if (val.size() > MAX_STRING_BYTES) {
165 throw std::overflow_error("CompactEncoder::write_string: length "
166 + std::to_string(val.size()) + " exceeds MAX_STRING_BYTES");
167 }
168 write_varint32(static_cast<uint32_t>(val.size()));
169 buf_.insert(buf_.end(), val.begin(), val.end());
170 }
171
174 void write_binary(const uint8_t* data, size_t len) {
175 if (len > MAX_STRING_BYTES) {
176 throw std::overflow_error("CompactEncoder::write_binary: length "
177 + std::to_string(len) + " exceeds MAX_STRING_BYTES");
178 }
179 write_varint32(static_cast<uint32_t>(len));
180 buf_.insert(buf_.end(), data, data + len);
181 }
182
185 void write_list_header(uint8_t elem_type, int32_t size) {
186 if (size < 0) {
187 throw std::invalid_argument("write_list_header: negative list size");
188 }
189 if (size <= 14) {
190 buf_.push_back(static_cast<uint8_t>((size << 4) | elem_type));
191 } else {
192 buf_.push_back(static_cast<uint8_t>(0xF0 | elem_type));
193 write_varint32(static_cast<uint32_t>(size));
194 }
195 }
196
197 // -- access ---------------------------------------------------------------
198
200 [[nodiscard]] const std::vector<uint8_t>& data() const { return buf_; }
201
203 [[nodiscard]] size_t size() const { return buf_.size(); }
204
207 void clear() {
208 buf_.clear();
209 while (!last_field_ids_.empty()) last_field_ids_.pop();
210 last_field_ids_.push(0);
211 }
212
213private:
214 std::vector<uint8_t> buf_;
215 std::stack<int16_t> last_field_ids_;
216
217 // -- varint encoding ------------------------------------------------------
218
219 void write_varint32(uint32_t val) {
220 while (val > 0x7F) {
221 buf_.push_back(static_cast<uint8_t>((val & 0x7F) | 0x80));
222 val >>= 7;
223 }
224 buf_.push_back(static_cast<uint8_t>(val));
225 }
226
227 void write_varint64(uint64_t val) {
228 while (val > 0x7F) {
229 buf_.push_back(static_cast<uint8_t>((val & 0x7F) | 0x80));
230 val >>= 7;
231 }
232 buf_.push_back(static_cast<uint8_t>(val));
233 }
234
235 void write_zigzag_i16(int16_t val) {
236 uint32_t zz = zigzag_encode_i32(static_cast<int32_t>(val));
237 write_varint32(zz);
238 }
239
240 // -- zigzag encoding ------------------------------------------------------
241
242 // CWE-190, C++ [expr.shift] p7.6.7 — left shift on unsigned to avoid UB on signed overflow
243 static uint32_t zigzag_encode_i32(int32_t val) {
244 return (static_cast<uint32_t>(val) << 1) ^ static_cast<uint32_t>(val >> 31);
245 }
246
247 // CWE-190, C++ [expr.shift] p7.6.7 — left shift on unsigned to avoid UB
248 static uint64_t zigzag_encode_i64(int64_t val) {
249 return (static_cast<uint64_t>(val) << 1) ^ static_cast<uint64_t>(val >> 63);
250 }
251};
252
268public:
274 CompactDecoder(const uint8_t* data, size_t size)
275 : data_(data), size_(size), pos_(0), error_(false),
276 pending_bool_{}, pending_bool_valid_(false) {
277 last_field_ids_.push(0);
278 }
279
280 // -- field / struct helpers ------------------------------------------------
281
286 if (!ensure(1)) return {0, 0};
287
288 uint8_t byte = data_[pos_++];
289
290 // STOP marker
291 if (byte == 0x00) return {0, 0};
292
293 uint8_t type = byte & 0x0F;
294 int16_t delta = static_cast<int16_t>((byte >> 4) & 0x0F);
295
296 int16_t field_id;
297 if (delta != 0) {
298 // Delta-encoded field ID
299 field_id = last_field_ids_.top() + delta;
300 } else {
301 // Full field ID follows as zigzag varint
302 int32_t id32 = read_zigzag_i32();
303 field_id = static_cast<int16_t>(id32);
304 }
305 last_field_ids_.top() = field_id;
306
307 // For bool fields, the value is embedded in the type nibble.
308 // Cache it for the next read_bool() call.
309 if (type == compact_type::BOOL_TRUE) {
310 pending_bool_ = true;
311 pending_bool_valid_ = true;
312 } else if (type == compact_type::BOOL_FALSE) {
313 pending_bool_ = false;
314 pending_bool_valid_ = true;
315 }
316
317 if (++field_count_ > MAX_FIELD_COUNT) {
318 error_ = true;
319 return {0, 0};
320 }
321 // CWE-400: Uncontrolled Resource Consumption (DoS prevention)
322 if (++total_fields_read_ > MAX_TOTAL_FIELDS) {
323 error_ = true;
324 return {0, 0};
325 }
326
327 return {field_id, type};
328 }
329
332 [[nodiscard]] bool read_bool() {
333 if (pending_bool_valid_) {
334 pending_bool_valid_ = false;
335 return pending_bool_;
336 }
337 if (!ensure(1)) return false;
338 return data_[pos_++] != 0;
339 }
340
342 [[nodiscard]] int8_t read_i8() {
343 if (!ensure(1)) return 0;
344 return static_cast<int8_t>(data_[pos_++]);
345 }
346
348 [[nodiscard]] int32_t read_i32() {
349 return read_zigzag_i32();
350 }
351
353 [[nodiscard]] int64_t read_i64() {
354 return read_zigzag_i64();
355 }
356
358 [[nodiscard]] double read_double() {
359 if (!ensure(8)) return 0.0;
360 uint64_t bits = 0;
361 for (int i = 0; i < 8; ++i)
362 bits |= static_cast<uint64_t>(data_[pos_++]) << (i * 8);
363 double val;
364 std::memcpy(&val, &bits, 8);
365 return val;
366 }
367
369 [[nodiscard]] float read_float() {
370 if (!ensure(4)) return 0.0f;
371 uint32_t bits = 0;
372 for (int i = 0; i < 4; ++i)
373 bits |= static_cast<uint32_t>(data_[pos_++]) << (i * 8);
374 float val;
375 std::memcpy(&val, &bits, 4);
376 return val;
377 }
378
380 [[nodiscard]] std::string read_string() {
381 uint32_t len = read_varint32();
382 if (len > MAX_STRING_BYTES) { error_ = true; return {}; }
383 if (!ensure(len)) return {};
384 std::string result(reinterpret_cast<const char*>(data_ + pos_), len);
385 pos_ += len;
386 return result;
387 }
388
390 [[nodiscard]] std::vector<uint8_t> read_binary() {
391 uint32_t len = read_varint32();
392 if (len > MAX_STRING_BYTES) { error_ = true; return {}; }
393 if (!ensure(len)) return {};
394 std::vector<uint8_t> result(data_ + pos_, data_ + pos_ + len);
395 pos_ += len;
396 return result;
397 }
398
400 [[nodiscard]] ListHeader read_list_header() {
401 if (!ensure(1)) return {0, 0};
402 uint8_t byte = data_[pos_++];
403 uint8_t elem_type = byte & 0x0F;
404 int32_t size = (byte >> 4) & 0x0F;
405 if (size == 15) {
406 // Large list: size follows as varint
407 uint32_t raw = read_varint32();
408 if (raw > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
409 error_ = true;
410 return {0, 0};
411 }
412 size = static_cast<int32_t>(raw);
413 }
414 if (size < 0) {
415 // CWE-20: Improper Input Validation — negative list size (likely corrupt data)
416 error_ = true;
417 return {0, 0};
418 }
419 if (static_cast<uint32_t>(size) > MAX_COLLECTION_SIZE) {
420 error_ = true; return {0, 0};
421 }
422 return {elem_type, size};
423 }
424
427 void skip_field(uint8_t thrift_type) {
428 switch (thrift_type) {
431 // Value already consumed in field header; nothing to skip.
432 break;
433
434 case compact_type::I8:
435 // Single byte
436 if (ensure(1)) pos_ += 1;
437 break;
438
441 // Zigzag varint — just consume it
442 (void)read_varint32();
443 break;
444
446 // Zigzag varint64 — just consume it
447 (void)read_varint64();
448 break;
449
451 if (ensure(8)) pos_ += 8;
452 break;
453
455 // Length-prefixed bytes
456 uint32_t len = read_varint32();
457 if (ensure(len)) pos_ += len;
458 break;
459 }
460
462 case compact_type::SET: {
463 auto hdr = read_list_header();
464 if (hdr.size < 0 || static_cast<uint32_t>(hdr.size) > MAX_COLLECTION_SIZE) {
465 error_ = true; break;
466 }
467 for (int32_t i = 0; i < hdr.size && good(); ++i) {
468 skip_field(hdr.elem_type);
469 }
470 break;
471 }
472
473 case compact_type::MAP: {
474 uint32_t map_size = read_varint32();
475 if (map_size == 0) break;
476 if (map_size > MAX_COLLECTION_SIZE) { error_ = true; break; }
477 if (!ensure(1)) break;
478 uint8_t kv_types = data_[pos_++];
479 uint8_t key_type = (kv_types >> 4) & 0x0F;
480 uint8_t val_type = kv_types & 0x0F;
481 for (uint32_t i = 0; i < map_size && good(); ++i) {
482 skip_field(key_type);
483 skip_field(val_type);
484 }
485 break;
486 }
487
489 // Read fields until STOP
490 begin_struct();
491 while (good()) {
492 auto fh = read_field_header();
493 if (fh.is_stop()) break;
494 skip_field(fh.thrift_type);
495 }
496 end_struct();
497 break;
498 }
499
500 default:
501 // Unknown type — mark as error
502 error_ = true;
503 break;
504 }
505 }
506
509 if (last_field_ids_.size() >= MAX_NESTING_DEPTH) { error_ = true; return; }
510 last_field_ids_.push(0);
511 field_count_ = 0;
512 }
513
515 void end_struct() {
516 if (last_field_ids_.empty()) { error_ = true; return; } // CWE-124: Buffer Underwrite
517 last_field_ids_.pop();
518 }
519
520 // -- state queries --------------------------------------------------------
521
523 [[nodiscard]] size_t remaining() const {
524 return (pos_ <= size_) ? (size_ - pos_) : 0;
525 }
526
528 [[nodiscard]] size_t position() const { return pos_; }
529
531 [[nodiscard]] bool good() const { return !error_; }
532
533private:
534 const uint8_t* data_;
535 size_t size_;
536 size_t pos_;
537 bool error_;
538
539 // Bool values embedded in field headers are cached here.
540 bool pending_bool_;
541 bool pending_bool_valid_;
542
543 static constexpr size_t MAX_NESTING_DEPTH = 64;
544 static constexpr size_t MAX_FIELD_COUNT = 65536;
545 static constexpr size_t MAX_TOTAL_FIELDS = 1'000'000;
546 static constexpr uint32_t MAX_STRING_BYTES = 64u * 1024u * 1024u;
547 static constexpr uint32_t MAX_COLLECTION_SIZE = 1'000'000u;
548 std::stack<int16_t> last_field_ids_;
549 size_t field_count_ = 0;
550 size_t total_fields_read_ = 0;
551
552 // -- bounds checking ------------------------------------------------------
553
555 [[nodiscard]] bool ensure(size_t n) {
556 if (error_ || n > size_ || pos_ > size_ - n) {
557 error_ = true;
558 return false;
559 }
560 return true;
561 }
562
563 // -- varint decoding ------------------------------------------------------
564
565 [[nodiscard]] uint32_t read_varint32() {
566 uint32_t result = 0;
567 int shift = 0;
568 while (shift < 35) {
569 if (!ensure(1)) return 0;
570 uint8_t byte = data_[pos_++];
571 result |= static_cast<uint32_t>(byte & 0x7F) << shift;
572 if ((byte & 0x80) == 0) return result;
573 shift += 7;
574 }
575 // Varint too long — malformed
576 error_ = true;
577 return 0;
578 }
579
580 [[nodiscard]] uint64_t read_varint64() {
581 uint64_t result = 0;
582 int shift = 0;
583 while (shift < 70) {
584 if (!ensure(1)) return 0;
585 uint8_t byte = data_[pos_++];
586 result |= static_cast<uint64_t>(byte & 0x7F) << shift;
587 if ((byte & 0x80) == 0) return result;
588 shift += 7;
589 }
590 // Varint too long — malformed
591 error_ = true;
592 return 0;
593 }
594
595 // -- zigzag decoding ------------------------------------------------------
596
597 [[nodiscard]] int32_t read_zigzag_i32() {
598 uint32_t raw = read_varint32();
599 return static_cast<int32_t>((raw >> 1) ^ -(static_cast<int32_t>(raw & 1)));
600 }
601
602 [[nodiscard]] int64_t read_zigzag_i64() {
603 uint64_t raw = read_varint64();
604 return static_cast<int64_t>((raw >> 1) ^ -(static_cast<int64_t>(raw & 1)));
605 }
606};
607
} // namespace signet::forge::thrift
Thrift Compact Protocol reader.
Definition compact.hpp:267
void begin_struct()
Push a new field-ID context for reading a nested struct.
Definition compact.hpp:508
void end_struct()
Pop the field-ID context after finishing a nested struct.
Definition compact.hpp:515
double read_double()
Read a double (8 bytes little-endian, IEEE 754).
Definition compact.hpp:358
FieldHeader read_field_header()
Read a field header.
Definition compact.hpp:285
int64_t read_i64()
Read a 64-bit integer (zigzag + varint64 decode).
Definition compact.hpp:353
CompactDecoder(const uint8_t *data, size_t size)
Construct a decoder over a byte buffer.
Definition compact.hpp:274
int8_t read_i8()
Read an 8-bit signed integer (single raw byte, I8 wire type).
Definition compact.hpp:342
ListHeader read_list_header()
Read a list header. Returns element type and count.
Definition compact.hpp:400
void skip_field(uint8_t thrift_type)
Skip a field without parsing its value.
Definition compact.hpp:427
std::string read_string()
Read a string (varint-length-prefixed UTF-8 bytes).
Definition compact.hpp:380
float read_float()
Read a float (4 bytes little-endian, IEEE 754).
Definition compact.hpp:369
std::vector< uint8_t > read_binary()
Read raw binary data (varint-length-prefixed bytes).
Definition compact.hpp:390
size_t position() const
Returns the current read position (offset from start of buffer).
Definition compact.hpp:528
bool good() const
Returns true if no errors have occurred (no bounds violations).
Definition compact.hpp:531
bool read_bool()
Read a boolean value.
Definition compact.hpp:332
size_t remaining() const
Returns the number of bytes remaining in the buffer.
Definition compact.hpp:523
int32_t read_i32()
Read a 32-bit integer (zigzag + varint decode).
Definition compact.hpp:348
Thrift Compact Protocol writer.
Definition compact.hpp:72
void begin_struct()
Push a new field-ID context for a nested struct.
Definition compact.hpp:100
size_t size() const
Returns the current size of the encoded buffer in bytes.
Definition compact.hpp:203
void write_bool(bool val)
Write a standalone bool (not embedded in a field header).
Definition compact.hpp:108
void end_struct()
Pop the field-ID context after finishing a nested struct.
Definition compact.hpp:103
void write_string(const std::string &val)
Write a string as varint-length-prefixed UTF-8 bytes.
Definition compact.hpp:163
void write_field_bool(int16_t field_id, bool val)
Write a bool field where the value is embedded in the field header's type nibble (1 = true, 2 = false).
Definition compact.hpp:120
void write_float(float val)
Write a float as 4 bytes little-endian (IEEE 754).
Definition compact.hpp:154
const std::vector< uint8_t > & data() const
Returns a const reference to the underlying byte buffer.
Definition compact.hpp:200
void clear()
Resets the encoder to its initial state (empty buffer, field ID stack reset to a single zero entry).
Definition compact.hpp:207
CompactEncoder()
Default constructor. Initializes field-ID stack with a single zero entry.
Definition compact.hpp:78
void write_field(int16_t field_id, uint8_t thrift_type)
Write a field header.
Definition compact.hpp:85
void write_double(double val)
Write a double as 8 bytes little-endian (IEEE 754).
Definition compact.hpp:144
void write_i32(int32_t val)
Write a 32-bit integer as zigzag + varint.
Definition compact.hpp:134
void write_stop()
Write struct stop marker (0x00).
Definition compact.hpp:97
void write_i64(int64_t val)
Write a 64-bit integer as zigzag + varint.
Definition compact.hpp:139
void write_i8(int8_t val)
Write an 8-bit signed integer as a single raw byte (I8 wire type).
Definition compact.hpp:113
void write_binary(const uint8_t *data, size_t len)
Write raw binary data as varint-length-prefixed bytes.
Definition compact.hpp:174
static constexpr size_t MAX_STRING_BYTES
Maximum string/binary field size (matches CompactDecoder::MAX_STRING_BYTES).
Definition compact.hpp:75
void write_list_header(uint8_t elem_type, int32_t size)
Write a list header.
Definition compact.hpp:185
constexpr uint8_t STRUCT
Nested struct.
Definition compact.hpp:39
constexpr uint8_t I32
32-bit signed integer (zigzag + varint).
Definition compact.hpp:32
constexpr uint8_t DOUBLE
IEEE 754 double (8 bytes LE).
Definition compact.hpp:34
constexpr uint8_t BOOL_FALSE
Boolean false (embedded in field header).
Definition compact.hpp:29
constexpr uint8_t SET
Set container.
Definition compact.hpp:37
constexpr uint8_t BINARY
Length-prefixed bytes (also used for STRING).
Definition compact.hpp:35
constexpr uint8_t LIST
List container.
Definition compact.hpp:36
constexpr uint8_t STOP
Struct stop marker.
Definition compact.hpp:27
constexpr uint8_t I16
16-bit signed integer (zigzag + varint).
Definition compact.hpp:31
constexpr uint8_t BOOL_TRUE
Boolean true (embedded in field header).
Definition compact.hpp:28
constexpr uint8_t MAP
Map container.
Definition compact.hpp:38
constexpr uint8_t I64
64-bit signed integer (zigzag + varint).
Definition compact.hpp:33
constexpr uint8_t I8
8-bit signed integer.
Definition compact.hpp:30
Decoded field header from the Thrift Compact Protocol.
Definition compact.hpp:46
bool is_stop() const
Check if this is the STOP marker (end of struct).
Definition compact.hpp:51
uint8_t thrift_type
Wire type (one of the compact_type constants).
Definition compact.hpp:48
int16_t field_id
Thrift field identifier (from the schema).
Definition compact.hpp:47
Decoded list/set header from the Thrift Compact Protocol.
Definition compact.hpp:58
int32_t size
Number of elements in the list/set.
Definition compact.hpp:60
uint8_t elem_type
Wire type of each element (compact_type constant).
Definition compact.hpp:59