28#include <unordered_map>
55 const uint8_t* data,
size_t size)
const = 0;
65 const uint8_t* data,
size_t size,
size_t uncompressed_size)
const = 0;
74 [[nodiscard]]
virtual const char*
name()
const = 0;
107 std::lock_guard<std::mutex> lock(mutex_);
109 codecs_[type] = std::move(codec);
118 std::lock_guard<std::mutex> lock(mutex_);
119 auto it = codecs_.find(type);
120 if (it != codecs_.end()) {
121 return it->second.get();
132 std::lock_guard<std::mutex> lock(mutex_);
133 return codecs_.find(type) != codecs_.end();
139 [[nodiscard]] std::vector<Compression>
available()
const {
140 std::lock_guard<std::mutex> lock(mutex_);
141 std::vector<Compression> result;
142 result.reserve(codecs_.size() + 1);
144 for (
const auto& [type, _] : codecs_) {
145 result.push_back(type);
163 mutable std::mutex mutex_;
165 std::unordered_map<Compression, std::unique_ptr<CompressionCodec>> codecs_;
184 Compression codec,
const uint8_t* data,
size_t size) {
188 return std::vector<uint8_t>(data, data + size);
194 "compression codec not registered"};
214 Compression codec,
const uint8_t* data,
size_t size,
215 size_t uncompressed_size) {
219 return std::vector<uint8_t>(data, data + size);
224 static constexpr size_t MAX_DECOMPRESS_SIZE = 256ULL * 1024 * 1024;
225 if (uncompressed_size > MAX_DECOMPRESS_SIZE) {
227 "Decompressed size exceeds 256 MB limit"};
231 if (size == 0 && uncompressed_size > 0) {
233 "Zero-length compressed data with non-zero uncompressed size"};
237 static constexpr size_t MAX_DECOMPRESSION_RATIO = 1024;
238 if (size > 0 && uncompressed_size / size >= MAX_DECOMPRESSION_RATIO) {
240 "Decompression ratio exceeds limit"};
246 "compression codec not registered"};
249 return impl->
decompress(data, size, uncompressed_size);
271 [[maybe_unused]]
const uint8_t* sample_data,
272 [[maybe_unused]]
size_t sample_size) {
Thread-safe singleton registry of compression codec implementations.
CodecRegistry & operator=(const CodecRegistry &)=delete
std::vector< Compression > available() const
List all available compression types, including UNCOMPRESSED.
CodecRegistry(CodecRegistry &&)=delete
CodecRegistry(const CodecRegistry &)=delete
const CompressionCodec * get(Compression type) const
Look up a registered codec by its Compression type.
CodecRegistry & operator=(CodecRegistry &&)=delete
void register_codec(std::unique_ptr< CompressionCodec > codec)
Register a codec, transferring ownership to the registry.
static CodecRegistry & instance()
Access the process-wide singleton instance.
bool has(Compression type) const
Check whether a codec is available for the given compression type.
Abstract base class for all compression/decompression codecs.
virtual const char * name() const =0
Return a human-readable codec name.
virtual expected< std::vector< uint8_t > > decompress(const uint8_t *data, size_t size, size_t uncompressed_size) const =0
Decompress codec-specific data back to raw bytes.
virtual expected< std::vector< uint8_t > > compress(const uint8_t *data, size_t size) const =0
Compress raw data into codec-specific format.
virtual ~CompressionCodec()=default
Virtual destructor for safe polymorphic deletion.
virtual Compression codec_type() const =0
Return the Parquet Compression enum value identifying this codec.
A lightweight result type that holds either a success value of type T or an Error.
Compression auto_select_compression(const uint8_t *sample_data, size_t sample_size)
Automatically select the best available compression codec.
Compression
Parquet compression codecs.
@ SNAPPY
Snappy compression (bundled, header-only).
@ LZ4_RAW
LZ4 raw (unframed) block compression.
@ UNCOMPRESSED
No compression.
@ ZSTD
Zstandard compression (requires SIGNET_ENABLE_ZSTD).
@ LZ4
LZ4 block compression (requires SIGNET_ENABLE_LZ4).
expected< std::vector< uint8_t > > decompress(Compression codec, const uint8_t *data, size_t size, size_t uncompressed_size)
Decompress data using the specified codec via the global CodecRegistry.
expected< std::vector< uint8_t > > compress(Compression codec, const uint8_t *data, size_t size)
Compress data using the specified codec via the global CodecRegistry.
@ UNSUPPORTED_COMPRESSION
The file uses a compression codec not linked into this build (ZSTD, LZ4, Gzip).
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated, or exceeds size limits).
Lightweight error value carrying an ErrorCode and a human-readable message.
Parquet format enumerations, type traits, and statistics structs.