Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
codec.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
19
20#include "signet/types.hpp"
21#include "signet/error.hpp"
22
23#include <algorithm>
24#include <cstdint>
25#include <cstring>
26#include <memory>
27#include <mutex>
28#include <unordered_map>
29#include <vector>
30
31namespace signet::forge {
32
33// ===========================================================================
34// CompressionCodec — abstract interface for compression/decompression
35// ===========================================================================
36
46public:
48 virtual ~CompressionCodec() = default;
49
55 const uint8_t* data, size_t size) const = 0;
56
65 const uint8_t* data, size_t size, size_t uncompressed_size) const = 0;
66
69 [[nodiscard]] virtual Compression codec_type() const = 0;
70
74 [[nodiscard]] virtual const char* name() const = 0;
75};
76
77// ===========================================================================
78// CodecRegistry — singleton holding all registered compression codecs
79// ===========================================================================
80
91public:
95 static CodecRegistry registry;
96 return registry;
97 }
98
105 void register_codec(std::unique_ptr<CompressionCodec> codec) {
106 if (!codec) return;
107 std::lock_guard<std::mutex> lock(mutex_);
108 Compression type = codec->codec_type();
109 codecs_[type] = std::move(codec);
110 }
111
117 [[nodiscard]] const CompressionCodec* get(Compression type) const {
118 std::lock_guard<std::mutex> lock(mutex_);
119 auto it = codecs_.find(type);
120 if (it != codecs_.end()) {
121 return it->second.get();
122 }
123 return nullptr;
124 }
125
130 [[nodiscard]] bool has(Compression type) const {
131 if (type == Compression::UNCOMPRESSED) return true;
132 std::lock_guard<std::mutex> lock(mutex_);
133 return codecs_.find(type) != codecs_.end();
134 }
135
139 [[nodiscard]] std::vector<Compression> available() const {
140 std::lock_guard<std::mutex> lock(mutex_);
141 std::vector<Compression> result;
142 result.reserve(codecs_.size() + 1);
143 result.push_back(Compression::UNCOMPRESSED);
144 for (const auto& [type, _] : codecs_) {
145 result.push_back(type);
146 }
147 return result;
148 }
149
152 CodecRegistry(const CodecRegistry&) = delete;
157
158private:
160 CodecRegistry() = default;
161
163 mutable std::mutex mutex_;
165 std::unordered_map<Compression, std::unique_ptr<CompressionCodec>> codecs_;
166};
167
168// ===========================================================================
169// Convenience functions — compress / decompress via the registry
170// ===========================================================================
171
184 Compression codec, const uint8_t* data, size_t size) {
185
186 // UNCOMPRESSED: return a copy of the data
187 if (codec == Compression::UNCOMPRESSED) {
188 return std::vector<uint8_t>(data, data + size);
189 }
190
191 const CompressionCodec* impl = CodecRegistry::instance().get(codec);
192 if (!impl) {
194 "compression codec not registered"};
195 }
196
197 return impl->compress(data, size);
198}
199
214 Compression codec, const uint8_t* data, size_t size,
215 size_t uncompressed_size) {
216
217 // UNCOMPRESSED: return a copy of the data
218 if (codec == Compression::UNCOMPRESSED) {
219 return std::vector<uint8_t>(data, data + size);
220 }
221
222 // Absolute decompressed size cap (256 MB)
223 // CWE-409: Improper Handling of Highly Compressed Data (Decompression Bomb)
224 static constexpr size_t MAX_DECOMPRESS_SIZE = 256ULL * 1024 * 1024;
225 if (uncompressed_size > MAX_DECOMPRESS_SIZE) {
227 "Decompressed size exceeds 256 MB limit"};
228 }
229 // Reject zero-length compressed data claiming non-zero output
230 // CWE-20: Improper Input Validation
231 if (size == 0 && uncompressed_size > 0) {
233 "Zero-length compressed data with non-zero uncompressed size"};
234 }
235 // Decompression bomb guard: a ratio of 1024:1 or higher is rejected as suspicious
236 // CWE-409: Improper Handling of Highly Compressed Data (Decompression Bomb)
237 static constexpr size_t MAX_DECOMPRESSION_RATIO = 1024;
238 if (size > 0 && uncompressed_size / size >= MAX_DECOMPRESSION_RATIO) {
240 "Decompression ratio exceeds limit"};
241 }
242
243 const CompressionCodec* impl = CodecRegistry::instance().get(codec);
244 if (!impl) {
246 "compression codec not registered"};
247 }
248
249 return impl->decompress(data, size, uncompressed_size);
250}
251
252// ===========================================================================
253// auto_select_compression — heuristic codec selection
254// ===========================================================================
255
271 [[maybe_unused]] const uint8_t* sample_data,
272 [[maybe_unused]] size_t sample_size) {
273
274 const auto& registry = CodecRegistry::instance();
275
276 // Prefer ZSTD for best compression ratio
277 if (registry.has(Compression::ZSTD)) {
278 return Compression::ZSTD;
279 }
280
281 // Snappy: fast compression, moderate ratio
282 if (registry.has(Compression::SNAPPY)) {
283 return Compression::SNAPPY;
284 }
285
286 // LZ4 (raw): fast, decent ratio
287 if (registry.has(Compression::LZ4_RAW)) {
289 }
290
291 // LZ4 (framed, legacy enum value)
292 if (registry.has(Compression::LZ4)) {
293 return Compression::LZ4;
294 }
295
296 // No compression codecs available
298}
299
300} // namespace signet::forge
Thread-safe singleton registry of compression codec implementations.
Definition codec.hpp:90
CodecRegistry & operator=(const CodecRegistry &)=delete
std::vector< Compression > available() const
List all available compression types, including UNCOMPRESSED.
Definition codec.hpp:139
CodecRegistry(CodecRegistry &&)=delete
CodecRegistry(const CodecRegistry &)=delete
const CompressionCodec * get(Compression type) const
Look up a registered codec by its Compression type.
Definition codec.hpp:117
CodecRegistry & operator=(CodecRegistry &&)=delete
void register_codec(std::unique_ptr< CompressionCodec > codec)
Register a codec, transferring ownership to the registry.
Definition codec.hpp:105
static CodecRegistry & instance()
Access the process-wide singleton instance.
Definition codec.hpp:94
bool has(Compression type) const
Check whether a codec is available for the given compression type.
Definition codec.hpp:130
Abstract base class for all compression/decompression codecs.
Definition codec.hpp:45
virtual const char * name() const =0
Return a human-readable codec name.
virtual expected< std::vector< uint8_t > > decompress(const uint8_t *data, size_t size, size_t uncompressed_size) const =0
Decompress codec-specific data back to raw bytes.
virtual expected< std::vector< uint8_t > > compress(const uint8_t *data, size_t size) const =0
Compress raw data into codec-specific format.
virtual ~CompressionCodec()=default
Virtual destructor for safe polymorphic deletion.
virtual Compression codec_type() const =0
Return the Parquet Compression enum value identifying this codec.
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:145
Compression auto_select_compression(const uint8_t *sample_data, size_t sample_size)
Automatically select the best available compression codec.
Definition codec.hpp:270
Compression
Parquet compression codecs.
Definition types.hpp:115
@ SNAPPY
Snappy compression (bundled, header-only).
@ LZ4_RAW
LZ4 raw (unframed) block compression.
@ UNCOMPRESSED
No compression.
@ ZSTD
Zstandard compression (requires SIGNET_ENABLE_ZSTD).
@ LZ4
LZ4 block compression (requires SIGNET_ENABLE_LZ4).
expected< std::vector< uint8_t > > decompress(Compression codec, const uint8_t *data, size_t size, size_t uncompressed_size)
Decompress data using the specified codec via the global CodecRegistry.
Definition codec.hpp:213
expected< std::vector< uint8_t > > compress(Compression codec, const uint8_t *data, size_t size)
Compress data using the specified codec via the global CodecRegistry.
Definition codec.hpp:183
@ UNSUPPORTED_COMPRESSION
The file uses a compression codec not linked into this build (ZSTD, LZ4, Gzip).
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated, or exceeds size limits).
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:101
Parquet format enumerations, type traits, and statistics structs.