40namespace detail::snappy {
48 while (value >= 0x80) {
49 dst[n++] =
static_cast<uint8_t
>(value | 0x80);
52 dst[n++] =
static_cast<uint8_t
>(value);
63 for (
int i = 0; i < 5; ++i) {
64 if (pos >= size)
return false;
65 uint8_t
byte = data[pos++];
66 result |=
static_cast<uint32_t
>(
byte & 0x7F) << shift;
67 if ((
byte & 0x80) == 0) {
82 std::memcpy(&v, p, 4);
83#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
84 v = __builtin_bswap32(v);
92 std::memcpy(&v, p, 2);
93#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
94 v = __builtin_bswap16(v);
101#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
102 v = __builtin_bswap16(v);
104 std::memcpy(p, &v, 2);
109#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
110 v = __builtin_bswap32(v);
112 std::memcpy(p, &v, 4);
120inline uint32_t
hash4(
const uint8_t* p) {
122 return (val * 0x1e35a7bd) >> 18;
126static constexpr size_t kHashTableSize = 16384;
127static constexpr uint32_t kHashTableMask = kHashTableSize - 1;
134 const uint8_t* data,
size_t length) {
138 out.push_back(
static_cast<uint8_t
>((length - 1) << 2 | 0));
139 }
else if (length <= 256) {
141 out.push_back(
static_cast<uint8_t
>(60 << 2 | 0));
142 out.push_back(
static_cast<uint8_t
>(length - 1));
143 }
else if (length <= 65536) {
145 out.push_back(
static_cast<uint8_t
>(61 << 2 | 0));
146 uint16_t len_minus_1 =
static_cast<uint16_t
>(length - 1);
147 out.push_back(
static_cast<uint8_t
>(len_minus_1 & 0xFF));
148 out.push_back(
static_cast<uint8_t
>((len_minus_1 >> 8) & 0xFF));
149 }
else if (length <= 16777216) {
151 out.push_back(
static_cast<uint8_t
>(62 << 2 | 0));
152 uint32_t len_minus_1 =
static_cast<uint32_t
>(length - 1);
153 out.push_back(
static_cast<uint8_t
>(len_minus_1 & 0xFF));
154 out.push_back(
static_cast<uint8_t
>((len_minus_1 >> 8) & 0xFF));
155 out.push_back(
static_cast<uint8_t
>((len_minus_1 >> 16) & 0xFF));
158 out.push_back(
static_cast<uint8_t
>(63 << 2 | 0));
159 uint32_t len_minus_1 =
static_cast<uint32_t
>(length - 1);
160 out.push_back(
static_cast<uint8_t
>(len_minus_1 & 0xFF));
161 out.push_back(
static_cast<uint8_t
>((len_minus_1 >> 8) & 0xFF));
162 out.push_back(
static_cast<uint8_t
>((len_minus_1 >> 16) & 0xFF));
163 out.push_back(
static_cast<uint8_t
>((len_minus_1 >> 24) & 0xFF));
167 out.insert(out.end(), data, data + length);
172inline void emit_copy(std::vector<uint8_t>& out, uint32_t offset,
177 if (length >= 4 && length <= 11 && offset <= 2047) {
179 uint8_t tag =
static_cast<uint8_t
>(
180 ((offset >> 8) << 5) | ((length - 4) << 2) | 1);
182 out.push_back(
static_cast<uint8_t
>(offset & 0xFF));
186 if (offset <= 65535) {
188 uint32_t chunk = (length > 64) ? 64 : length;
189 uint8_t tag =
static_cast<uint8_t
>(((chunk - 1) << 2) | 2);
191 out.push_back(
static_cast<uint8_t
>(offset & 0xFF));
192 out.push_back(
static_cast<uint8_t
>((offset >> 8) & 0xFF));
196 uint32_t chunk = (length > 64) ? 64 : length;
197 uint8_t tag =
static_cast<uint8_t
>(((chunk - 1) << 2) | 3);
199 out.push_back(
static_cast<uint8_t
>(offset & 0xFF));
200 out.push_back(
static_cast<uint8_t
>((offset >> 8) & 0xFF));
201 out.push_back(
static_cast<uint8_t
>((offset >> 16) & 0xFF));
202 out.push_back(
static_cast<uint8_t
>((offset >> 24) & 0xFF));
213 if (s1 >= src_end || s2 >= src_end)
return 0;
215 size_t limit = src_end - ((s1 > s2) ? s1 : s2);
218 if (limit > 65535) limit = 65535;
219 while (len < limit && src[s1 + len] == src[s2 + len]) {
252 const uint8_t* data,
size_t size)
const override {
254 using namespace detail::snappy;
258 std::vector<uint8_t> out;
259 out.reserve(size + size / 64 + 16);
263 if (size > UINT32_MAX) {
265 "Snappy: input exceeds 4 GiB limit"};
269 uint8_t varint_buf[5];
270 size_t varint_len = encode_varint32(varint_buf,
271 static_cast<uint32_t
>(size));
272 out.insert(out.end(), varint_buf, varint_buf + varint_len);
280 emit_literal(out, data, size);
292 std::array<uint64_t, kHashTableSize> table{};
295 table.fill(UINT64_MAX);
298 size_t literal_start = 0;
300 while (pos + 4 <= size) {
301 uint32_t h = hash4(data + pos);
302 uint64_t candidate = table[h];
307 if (candidate != UINT64_MAX &&
308 pos - candidate <= 65535 &&
312 if (pos > literal_start) {
313 emit_literal(out, data + literal_start,
314 pos - literal_start);
318 uint32_t ml = match_length(data, pos, candidate, size);
321 uint32_t offset =
static_cast<uint32_t
>(pos - candidate);
322 emit_copy(out, offset, ml);
327 size_t match_end = pos + ml;
329 while (pos < match_end && pos + 4 <= size) {
330 table[hash4(data + pos)] = pos;
342 if (literal_start < size) {
343 emit_literal(out, data + literal_start, size - literal_start);
361 const uint8_t* data,
size_t size,
362 size_t uncompressed_size)
const override {
364 using namespace detail::snappy;
367 if (uncompressed_size == 0) {
368 return std::vector<uint8_t>{};
371 "Snappy: empty compressed stream but "
372 "expected non-zero output"};
377 uint32_t declared_len = 0;
378 if (!decode_varint32(data, size, pos, declared_len)) {
380 "Snappy: failed to decode uncompressed length varint"};
384 if (
static_cast<size_t>(declared_len) != uncompressed_size) {
386 "Snappy: declared uncompressed length (" +
387 std::to_string(declared_len) +
388 ") does not match expected (" +
389 std::to_string(uncompressed_size) +
")"};
394 static constexpr size_t MAX_SNAPPY_DECOMPRESS = 256ULL * 1024 * 1024;
395 if (uncompressed_size > MAX_SNAPPY_DECOMPRESS) {
397 "Snappy: decompressed size exceeds 256 MB"};
399 std::vector<uint8_t> out(uncompressed_size);
404 uint8_t tag = data[pos++];
405 uint8_t element_type = tag & 0x03;
407 switch (element_type) {
411 uint32_t literal_len = (tag >> 2) + 1;
415 uint32_t encoded_len_minus_1 = tag >> 2;
416 if (encoded_len_minus_1 >= 60) {
417 uint32_t extra_bytes = encoded_len_minus_1 - 59;
418 if (pos + extra_bytes > size) {
420 "Snappy: literal length bytes truncated"};
423 for (uint32_t i = 0; i < extra_bytes; ++i) {
424 literal_len |=
static_cast<uint32_t
>(data[pos++]) << (8 * i);
430 if (pos + literal_len > size) {
432 "Snappy: literal data extends past end of "
433 "compressed stream"};
435 if (out_pos + literal_len > uncompressed_size) {
437 "Snappy: literal would overflow output buffer"};
440 std::memcpy(out.data() + out_pos, data + pos, literal_len);
442 out_pos += literal_len;
452 "Snappy: copy-1 truncated"};
454 uint32_t length = ((tag >> 2) & 0x07) + 4;
455 uint32_t offset = (
static_cast<uint32_t
>(tag >> 5) << 8) |
459 "Snappy: copy-1 with zero offset"};
461 if (offset > out_pos) {
463 "Snappy: copy-1 offset (" +
464 std::to_string(offset) +
465 ") exceeds output position (" +
466 std::to_string(out_pos) +
")"};
468 if (out_pos + length > uncompressed_size) {
470 "Snappy: copy-1 would overflow output buffer"};
476 size_t src = out_pos - offset;
477 for (uint32_t i = 0; i < length; ++i) {
478 out[out_pos++] = out[src + i];
486 if (pos + 2 > size) {
488 "Snappy: copy-2 truncated"};
490 uint32_t length = (tag >> 2) + 1;
491 uint32_t offset = load_le16(data + pos);
496 "Snappy: copy-2 with zero offset"};
498 if (offset > out_pos) {
500 "Snappy: copy-2 offset (" +
501 std::to_string(offset) +
502 ") exceeds output position (" +
503 std::to_string(out_pos) +
")"};
505 if (out_pos + length > uncompressed_size) {
507 "Snappy: copy-2 would overflow output buffer"};
510 size_t src = out_pos - offset;
511 if (length <= offset) {
513 std::memcpy(out.data() + out_pos, out.data() + src, length);
517 for (uint32_t i = 0; i < length; ++i) {
518 out[out_pos++] = out[src + i];
527 if (pos + 4 > size) {
529 "Snappy: copy-4 truncated"};
531 uint32_t length = (tag >> 2) + 1;
537 "Snappy: copy-4 with zero offset"};
539 if (
static_cast<size_t>(offset) > out_pos) {
541 "Snappy: copy-4 offset (" +
542 std::to_string(offset) +
543 ") exceeds output position (" +
544 std::to_string(out_pos) +
")"};
546 if (out_pos + length > uncompressed_size) {
548 "Snappy: copy-4 would overflow output buffer"};
551 size_t src = out_pos - offset;
552 if (length <= offset) {
553 std::memcpy(out.data() + out_pos, out.data() + src, length);
556 for (uint32_t i = 0; i < length; ++i) {
557 out[out_pos++] = out[src + i];
566 "Snappy: unknown element type"};
571 if (out_pos != uncompressed_size) {
573 "Snappy: decompressed " + std::to_string(out_pos) +
574 " bytes but expected " +
575 std::to_string(uncompressed_size)};
590 [[nodiscard]]
const char*
name()
const override {
void register_codec(std::unique_ptr< CompressionCodec > codec)
Register a codec, transferring ownership to the registry.
static CodecRegistry & instance()
Access the process-wide singleton instance.
CompressionCodec
Abstract base class for all compression/decompression codecs.
Bundled Snappy compression codec (header-only, no external dependency).
expected< std::vector< uint8_t > > decompress(const uint8_t *data, size_t size, size_t uncompressed_size) const override
Snappy-decompress the input data.
const char * name() const override
Return the codec name "snappy".
expected< std::vector< uint8_t > > compress(const uint8_t *data, size_t size) const override
Snappy-compress the input data.
Compression codec_type() const override
Return Compression::SNAPPY.
expected< T >
A lightweight result type that holds either a success value of type T or an Error.
Compression codec interface and registry for Signet Forge.
void store_le16(uint8_t *p, uint16_t v)
Write a 16-bit little-endian value.
void store_le32(uint8_t *p, uint32_t v)
Write a 32-bit little-endian value.
size_t encode_varint32(uint8_t *dst, uint32_t value)
Encode a 32-bit unsigned integer as a Snappy varint (1-5 bytes).
uint16_t load_le16(const uint8_t *p)
Read a 16-bit little-endian value from a potentially unaligned pointer.
void emit_literal(std::vector< uint8_t > &out, const uint8_t *data, size_t length)
Emit a literal element.
void emit_copy(std::vector< uint8_t > &out, uint32_t offset, uint32_t length)
Emit a copy element.
bool decode_varint32(const uint8_t *data, size_t size, size_t &pos, uint32_t &out)
Decode a Snappy varint from the input stream.
uint32_t load_le32(const uint8_t *p)
Read a 32-bit little-endian value from a potentially unaligned pointer.
uint32_t match_length(const uint8_t *src, size_t s1, size_t s2, size_t src_end)
Find the match length between src[s1..] and src[s2..], bounded by src_end.
uint32_t hash4(const uint8_t *p)
14-bit hash of 4 bytes read as a little-endian uint32.
Compression
Parquet compression codecs.
@ SNAPPY
Snappy compression (bundled, header-only).
void register_snappy_codec()
Register the bundled Snappy codec with the global CodecRegistry.
@ INTERNAL_ERROR
An unexpected internal error that does not fit any other category.
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated, or exceeds size limits).
uint32_t load_le32(const uint8_t *data) noexcept
Error
Lightweight error value carrying an ErrorCode and a human-readable message.