Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
snappy.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
26
28
29#include <array>
30#include <cstdint>
31#include <cstring>
32#include <string>
33#include <vector>
34
35namespace signet::forge {
36
37// ===========================================================================
38// Internal helpers -- not part of the public API
39// ===========================================================================
40namespace detail::snappy {
41
42// ---- Varint encoding/decoding (unsigned, up to 32-bit) --------------------
43
46inline size_t encode_varint32(uint8_t* dst, uint32_t value) {
47 size_t n = 0;
48 while (value >= 0x80) {
49 dst[n++] = static_cast<uint8_t>(value | 0x80);
50 value >>= 7;
51 }
52 dst[n++] = static_cast<uint8_t>(value);
53 return n;
54}
55
59inline bool decode_varint32(const uint8_t* data, size_t size, size_t& pos,
60 uint32_t& out) {
61 uint32_t result = 0;
62 uint32_t shift = 0;
63 for (int i = 0; i < 5; ++i) {
64 if (pos >= size) return false;
65 uint8_t byte = data[pos++];
66 result |= static_cast<uint32_t>(byte & 0x7F) << shift;
67 if ((byte & 0x80) == 0) {
68 out = result;
69 return true;
70 }
71 shift += 7;
72 }
73 // More than 5 bytes -- invalid for a 32-bit varint.
74 return false;
75}
76
77// ---- Little-endian helpers ------------------------------------------------
78
80inline uint32_t load_le32(const uint8_t* p) {
81 uint32_t v;
82 std::memcpy(&v, p, 4);
83#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
84 v = __builtin_bswap32(v);
85#endif
86 return v;
87}
88
90inline uint16_t load_le16(const uint8_t* p) {
91 uint16_t v;
92 std::memcpy(&v, p, 2);
93#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
94 v = __builtin_bswap16(v);
95#endif
96 return v;
97}
98
100inline void store_le16(uint8_t* p, uint16_t v) {
101#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
102 v = __builtin_bswap16(v);
103#endif
104 std::memcpy(p, &v, 2);
105}
106
108inline void store_le32(uint8_t* p, uint32_t v) {
109#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
110 v = __builtin_bswap32(v);
111#endif
112 std::memcpy(p, &v, 4);
113}
114
115// ---- Hash function for the compressor ------------------------------------
116
120inline uint32_t hash4(const uint8_t* p) {
121 uint32_t val = load_le32(p);
122 return (val * 0x1e35a7bd) >> 18; // Result is in [0, 16383]
123}
124
125// Hash table size: 2^14 = 16384 entries.
126static constexpr size_t kHashTableSize = 16384;
127static constexpr uint32_t kHashTableMask = kHashTableSize - 1;
128
129// ---- Literal / copy element emitters (compressor) -------------------------
130
133inline void emit_literal(std::vector<uint8_t>& out,
134 const uint8_t* data, size_t length) {
135 // Tag byte encoding depends on literal length.
136 if (length <= 60) {
137 // Lengths 1-60: tag encodes (length-1) in the upper 6 bits.
138 out.push_back(static_cast<uint8_t>((length - 1) << 2 | 0));
139 } else if (length <= 256) {
140 // Lengths 61-256: tag = (60 << 2 | 0), followed by 1 byte of (length-1).
141 out.push_back(static_cast<uint8_t>(60 << 2 | 0));
142 out.push_back(static_cast<uint8_t>(length - 1));
143 } else if (length <= 65536) {
144 // Lengths 257-65536: tag = (61 << 2 | 0), followed by 2 LE bytes of (length-1).
145 out.push_back(static_cast<uint8_t>(61 << 2 | 0));
146 uint16_t len_minus_1 = static_cast<uint16_t>(length - 1);
147 out.push_back(static_cast<uint8_t>(len_minus_1 & 0xFF));
148 out.push_back(static_cast<uint8_t>((len_minus_1 >> 8) & 0xFF));
149 } else if (length <= 16777216) {
150 // Lengths 65537-16777216: tag = (62 << 2 | 0), followed by 3 LE bytes.
151 out.push_back(static_cast<uint8_t>(62 << 2 | 0));
152 uint32_t len_minus_1 = static_cast<uint32_t>(length - 1);
153 out.push_back(static_cast<uint8_t>(len_minus_1 & 0xFF));
154 out.push_back(static_cast<uint8_t>((len_minus_1 >> 8) & 0xFF));
155 out.push_back(static_cast<uint8_t>((len_minus_1 >> 16) & 0xFF));
156 } else {
157 // Lengths up to 2^32: tag = (63 << 2 | 0), followed by 4 LE bytes.
158 out.push_back(static_cast<uint8_t>(63 << 2 | 0));
159 uint32_t len_minus_1 = static_cast<uint32_t>(length - 1);
160 out.push_back(static_cast<uint8_t>(len_minus_1 & 0xFF));
161 out.push_back(static_cast<uint8_t>((len_minus_1 >> 8) & 0xFF));
162 out.push_back(static_cast<uint8_t>((len_minus_1 >> 16) & 0xFF));
163 out.push_back(static_cast<uint8_t>((len_minus_1 >> 24) & 0xFF));
164 }
165
166 // Append the literal bytes themselves.
167 out.insert(out.end(), data, data + length);
168}
169
172inline void emit_copy(std::vector<uint8_t>& out, uint32_t offset,
173 uint32_t length) {
174 // Copy-1: offset 1-2047, length 4-11
175 // Encodes in 2 bytes total (tag + 1 extra).
176 while (length > 0) {
177 if (length >= 4 && length <= 11 && offset <= 2047) {
178 // Copy-1 encoding.
179 uint8_t tag = static_cast<uint8_t>(
180 ((offset >> 8) << 5) | ((length - 4) << 2) | 1);
181 out.push_back(tag);
182 out.push_back(static_cast<uint8_t>(offset & 0xFF));
183 return;
184 }
185
186 if (offset <= 65535) {
187 // Copy-2 encoding: offset in 2 LE bytes, length 1-64.
188 uint32_t chunk = (length > 64) ? 64 : length;
189 uint8_t tag = static_cast<uint8_t>(((chunk - 1) << 2) | 2);
190 out.push_back(tag);
191 out.push_back(static_cast<uint8_t>(offset & 0xFF));
192 out.push_back(static_cast<uint8_t>((offset >> 8) & 0xFF));
193 length -= chunk;
194 } else {
195 // Copy-4 encoding: offset in 4 LE bytes, length 1-64.
196 uint32_t chunk = (length > 64) ? 64 : length;
197 uint8_t tag = static_cast<uint8_t>(((chunk - 1) << 2) | 3);
198 out.push_back(tag);
199 out.push_back(static_cast<uint8_t>(offset & 0xFF));
200 out.push_back(static_cast<uint8_t>((offset >> 8) & 0xFF));
201 out.push_back(static_cast<uint8_t>((offset >> 16) & 0xFF));
202 out.push_back(static_cast<uint8_t>((offset >> 24) & 0xFF));
203 length -= chunk;
204 }
205 }
206}
207
211inline uint32_t match_length(const uint8_t* src, size_t s1, size_t s2,
212 size_t src_end) {
213 if (s1 >= src_end || s2 >= src_end) return 0; // CWE-125 guard
214 uint32_t len = 0;
215 size_t limit = src_end - ((s1 > s2) ? s1 : s2);
216 // Cap match length to avoid overly long copy chains. The Snappy format
217 // doesn't impose a per-copy maximum, but we emit in chunks anyway.
218 if (limit > 65535) limit = 65535;
219 while (len < limit && src[s1 + len] == src[s2 + len]) {
220 ++len;
221 }
222 return len;
223}
224
225} // namespace detail::snappy
226
227// ===========================================================================
228// SnappyCodec -- CompressionCodec implementation
229// ===========================================================================
230
242public:
252 const uint8_t* data, size_t size) const override {
253
254 using namespace detail::snappy;
255
256 // Pessimistic upper bound: varint(5) + tag overhead per literal +
257 // the data itself. Snappy's worst case is about 1.004x the input.
258 std::vector<uint8_t> out;
259 out.reserve(size + size / 64 + 16);
260
261 // Snappy uses a uint32 varint for uncompressed length; reject >4 GiB
262 // CWE-190: Integer Overflow (Snappy uses 32-bit lengths)
263 if (size > UINT32_MAX) {
265 "Snappy: input exceeds 4 GiB limit"};
266 }
267
268 // -- 1. Write the preamble: uncompressed length as varint ----------
269 uint8_t varint_buf[5];
270 size_t varint_len = encode_varint32(varint_buf,
271 static_cast<uint32_t>(size));
272 out.insert(out.end(), varint_buf, varint_buf + varint_len);
273
274 // Handle trivial cases.
275 if (size == 0) {
276 return out;
277 }
278 if (size < 4) {
279 // Too short for any match -- emit a single literal.
280 emit_literal(out, data, size);
281 return out;
282 }
283
284 // -- 2. Compress using greedy hash matching ------------------------
285 //
286 // The hash table maps 4-byte sequences (hashed to 14 bits) to their
287 // most recent position in the input. When a hash collision finds a
288 // genuine byte match, we emit any pending literals followed by a copy
289 // element.
290
291 // Use uint64_t positions to support inputs > 4 GB (L-1).
292 std::array<uint64_t, kHashTableSize> table{};
293 // Sentinel: UINT64_MAX means "no entry". We use this rather than 0
294 // because position 0 is a valid position.
295 table.fill(UINT64_MAX);
296
297 size_t pos = 0; // Current scan position in `data`.
298 size_t literal_start = 0; // Start of pending literal run.
299
300 while (pos + 4 <= size) {
301 uint32_t h = hash4(data + pos);
302 uint64_t candidate = table[h];
303 table[h] = pos;
304
305 // Check for a genuine match: the candidate must exist, the 4
306 // bytes must match, and the offset must be positive.
307 if (candidate != UINT64_MAX &&
308 pos - candidate <= 65535 &&
309 load_le32(data + pos) == load_le32(data + candidate)) {
310
311 // Emit any pending literals before the copy.
312 if (pos > literal_start) {
313 emit_literal(out, data + literal_start,
314 pos - literal_start);
315 }
316
317 // Determine how far the match extends.
318 uint32_t ml = match_length(data, pos, candidate, size);
319 if (ml < 4) ml = 4; // We already verified 4 bytes match.
320
321 uint32_t offset = static_cast<uint32_t>(pos - candidate);
322 emit_copy(out, offset, ml);
323
324 // Advance past the matched region. Insert intermediate
325 // positions into the hash table so future matches can find
326 // them (skip-1 heuristic: hash every position in the match).
327 size_t match_end = pos + ml;
328 pos++;
329 while (pos < match_end && pos + 4 <= size) {
330 table[hash4(data + pos)] = pos;
331 pos++;
332 }
333 pos = match_end;
334 literal_start = pos;
335 } else {
336 // No match -- advance one byte and keep accumulating literals.
337 ++pos;
338 }
339 }
340
341 // -- 3. Flush remaining literals -----------------------------------
342 if (literal_start < size) {
343 emit_literal(out, data + literal_start, size - literal_start);
344 }
345
346 return out;
347 }
348
361 const uint8_t* data, size_t size,
362 size_t uncompressed_size) const override {
363
364 using namespace detail::snappy;
365
366 if (size == 0) {
367 if (uncompressed_size == 0) {
368 return std::vector<uint8_t>{};
369 }
371 "Snappy: empty compressed stream but "
372 "expected non-zero output"};
373 }
374
375 // -- 1. Read the preamble: uncompressed length varint --------------
376 size_t pos = 0;
377 uint32_t declared_len = 0;
378 if (!decode_varint32(data, size, pos, declared_len)) {
380 "Snappy: failed to decode uncompressed length varint"};
381 }
382
383 // Validate against the Parquet-declared uncompressed size.
384 if (static_cast<size_t>(declared_len) != uncompressed_size) {
386 "Snappy: declared uncompressed length (" +
387 std::to_string(declared_len) +
388 ") does not match expected (" +
389 std::to_string(uncompressed_size) + ")"};
390 }
391
392 // -- 2. Allocate the output buffer ---------------------------------
393 // CWE-409: Improper Handling of Highly Compressed Data (Decompression Bomb)
394 static constexpr size_t MAX_SNAPPY_DECOMPRESS = 256ULL * 1024 * 1024;
395 if (uncompressed_size > MAX_SNAPPY_DECOMPRESS) {
397 "Snappy: decompressed size exceeds 256 MB"};
398 }
399 std::vector<uint8_t> out(uncompressed_size);
400 size_t out_pos = 0;
401
402 // -- 3. Decode elements --------------------------------------------
403 while (pos < size) {
404 uint8_t tag = data[pos++];
405 uint8_t element_type = tag & 0x03;
406
407 switch (element_type) {
408
409 // ---- Literal (type 0) ----------------------------------------
410 case 0: {
411 uint32_t literal_len = (tag >> 2) + 1;
412
413 // If the encoded length-1 is 60..63, the actual length follows
414 // as 1..4 extra bytes (little-endian).
415 uint32_t encoded_len_minus_1 = tag >> 2;
416 if (encoded_len_minus_1 >= 60) {
417 uint32_t extra_bytes = encoded_len_minus_1 - 59;
418 if (pos + extra_bytes > size) {
420 "Snappy: literal length bytes truncated"};
421 }
422 literal_len = 0;
423 for (uint32_t i = 0; i < extra_bytes; ++i) {
424 literal_len |= static_cast<uint32_t>(data[pos++]) << (8 * i);
425 }
426 literal_len += 1; // Stored as (length - 1).
427 }
428
429 // Bounds check.
430 if (pos + literal_len > size) {
432 "Snappy: literal data extends past end of "
433 "compressed stream"};
434 }
435 if (out_pos + literal_len > uncompressed_size) {
437 "Snappy: literal would overflow output buffer"};
438 }
439
440 std::memcpy(out.data() + out_pos, data + pos, literal_len);
441 pos += literal_len;
442 out_pos += literal_len;
443 break;
444 }
445
446 // ---- Copy-1 (type 1): 1 extra byte --------------------------
447 // Tag layout: [offset_hi_3 : 3][length-4 : 3][01 : 2]
448 // Extra byte: offset low 8 bits.
449 case 1: {
450 if (pos >= size) {
452 "Snappy: copy-1 truncated"};
453 }
454 uint32_t length = ((tag >> 2) & 0x07) + 4; // 4..11
455 uint32_t offset = (static_cast<uint32_t>(tag >> 5) << 8) |
456 data[pos++];
457 if (offset == 0) {
459 "Snappy: copy-1 with zero offset"};
460 }
461 if (offset > out_pos) {
463 "Snappy: copy-1 offset (" +
464 std::to_string(offset) +
465 ") exceeds output position (" +
466 std::to_string(out_pos) + ")"};
467 }
468 if (out_pos + length > uncompressed_size) {
470 "Snappy: copy-1 would overflow output buffer"};
471 }
472
473 // Copy from earlier output. Byte-by-byte to handle the
474 // overlapping case (length > offset), which is how Snappy
475 // encodes run-length patterns.
476 size_t src = out_pos - offset;
477 for (uint32_t i = 0; i < length; ++i) {
478 out[out_pos++] = out[src + i];
479 }
480 break;
481 }
482
483 // ---- Copy-2 (type 2): 2 extra bytes (LE offset) -------------
484 // Tag layout: [length-1 : 6][10 : 2]
485 case 2: {
486 if (pos + 2 > size) {
488 "Snappy: copy-2 truncated"};
489 }
490 uint32_t length = (tag >> 2) + 1; // 1..64
491 uint32_t offset = load_le16(data + pos);
492 pos += 2;
493
494 if (offset == 0) {
496 "Snappy: copy-2 with zero offset"};
497 }
498 if (offset > out_pos) {
500 "Snappy: copy-2 offset (" +
501 std::to_string(offset) +
502 ") exceeds output position (" +
503 std::to_string(out_pos) + ")"};
504 }
505 if (out_pos + length > uncompressed_size) {
507 "Snappy: copy-2 would overflow output buffer"};
508 }
509
510 size_t src = out_pos - offset;
511 if (length <= offset) {
512 // Non-overlapping: safe to memcpy.
513 std::memcpy(out.data() + out_pos, out.data() + src, length);
514 out_pos += length;
515 } else {
516 // Overlapping: byte-by-byte.
517 for (uint32_t i = 0; i < length; ++i) {
518 out[out_pos++] = out[src + i];
519 }
520 }
521 break;
522 }
523
524 // ---- Copy-4 (type 3): 4 extra bytes (LE offset) -------------
525 // Tag layout: [length-1 : 6][11 : 2]
526 case 3: {
527 if (pos + 4 > size) {
529 "Snappy: copy-4 truncated"};
530 }
531 uint32_t length = (tag >> 2) + 1; // 1..64
532 uint32_t offset = load_le32(data + pos);
533 pos += 4;
534
535 if (offset == 0) {
537 "Snappy: copy-4 with zero offset"};
538 }
539 if (static_cast<size_t>(offset) > out_pos) {
541 "Snappy: copy-4 offset (" +
542 std::to_string(offset) +
543 ") exceeds output position (" +
544 std::to_string(out_pos) + ")"};
545 }
546 if (out_pos + length > uncompressed_size) {
548 "Snappy: copy-4 would overflow output buffer"};
549 }
550
551 size_t src = out_pos - offset;
552 if (length <= offset) {
553 std::memcpy(out.data() + out_pos, out.data() + src, length);
554 out_pos += length;
555 } else {
556 for (uint32_t i = 0; i < length; ++i) {
557 out[out_pos++] = out[src + i];
558 }
559 }
560 break;
561 }
562
563 default:
564 // Unreachable: element_type is (tag & 0x03), always 0-3.
566 "Snappy: unknown element type"};
567 }
568 }
569
570 // -- 4. Validate final output size ---------------------------------
571 if (out_pos != uncompressed_size) {
573 "Snappy: decompressed " + std::to_string(out_pos) +
574 " bytes but expected " +
575 std::to_string(uncompressed_size)};
576 }
577
578 return out;
579 }
580
583
585 [[nodiscard]] Compression codec_type() const override {
586 return Compression::SNAPPY;
587 }
588
590 [[nodiscard]] const char* name() const override {
591 return "snappy";
592 }
593
595};
596
597// ===========================================================================
598// Auto-registration helper
599// ===========================================================================
600
609 CodecRegistry::instance().register_codec(std::make_unique<SnappyCodec>());
610}
611
612} // namespace signet::forge
void register_codec(std::unique_ptr< CompressionCodec > codec)
Register a codec, transferring ownership to the registry.
Definition codec.hpp:105
static CodecRegistry & instance()
Access the process-wide singleton instance.
Definition codec.hpp:94
Abstract base class for all compression/decompression codecs.
Definition codec.hpp:45
Bundled Snappy compression codec (header-only, no external dependency).
Definition snappy.hpp:241
expected< std::vector< uint8_t > > decompress(const uint8_t *data, size_t size, size_t uncompressed_size) const override
Snappy-decompress the input data.
Definition snappy.hpp:360
const char * name() const override
Return the codec name "snappy".
Definition snappy.hpp:590
expected< std::vector< uint8_t > > compress(const uint8_t *data, size_t size) const override
Snappy-compress the input data.
Definition snappy.hpp:251
Compression codec_type() const override
Return Compression::SNAPPY.
Definition snappy.hpp:585
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:145
Compression codec interface and registry for Signet Forge.
void store_le16(uint8_t *p, uint16_t v)
Write a 16-bit little-endian value.
Definition snappy.hpp:100
void store_le32(uint8_t *p, uint32_t v)
Write a 32-bit little-endian value.
Definition snappy.hpp:108
size_t encode_varint32(uint8_t *dst, uint32_t value)
Encode a 32-bit unsigned integer as a Snappy varint (1-5 bytes).
Definition snappy.hpp:46
uint16_t load_le16(const uint8_t *p)
Read a 16-bit little-endian value from a potentially unaligned pointer.
Definition snappy.hpp:90
void emit_literal(std::vector< uint8_t > &out, const uint8_t *data, size_t length)
Emit a literal element.
Definition snappy.hpp:133
void emit_copy(std::vector< uint8_t > &out, uint32_t offset, uint32_t length)
Emit a copy element.
Definition snappy.hpp:172
bool decode_varint32(const uint8_t *data, size_t size, size_t &pos, uint32_t &out)
Decode a Snappy varint from the input stream.
Definition snappy.hpp:59
uint32_t load_le32(const uint8_t *p)
Read a 32-bit little-endian value from a potentially unaligned pointer.
Definition snappy.hpp:80
uint32_t match_length(const uint8_t *src, size_t s1, size_t s2, size_t src_end)
Find the match length between src[s1..] and src[s2..], bounded by src_end.
Definition snappy.hpp:211
uint32_t hash4(const uint8_t *p)
14-bit hash of 4 bytes read as a little-endian uint32.
Definition snappy.hpp:120
Compression
Parquet compression codecs.
Definition types.hpp:115
@ SNAPPY
Snappy compression (bundled, header-only).
void register_snappy_codec()
Register the bundled Snappy codec with the global CodecRegistry.
Definition snappy.hpp:608
@ INTERNAL_ERROR
An unexpected internal error that does not fit any other category.
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated, or exceeds size limits).
uint32_t load_le32(const uint8_t *data) noexcept
Definition reader.hpp:129
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:101