Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
dictionary.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3
14
15#pragma once
16
17// ---------------------------------------------------------------------------
18// dictionary.hpp -- Dictionary encoding for Parquet
19//
20// Implements PLAIN_DICTIONARY (encoding=2) for dictionary pages and
21// RLE_DICTIONARY (encoding=8) for data pages. Critical for low-cardinality
22// string columns (symbols, sides, exchanges) where 10-50x compression is
23// typical.
24//
25// Wire format:
26// Dictionary page: PLAIN-encoded unique values
27// - BYTE_ARRAY: 4-byte LE length + bytes per entry
28// - INT32/FLOAT: 4-byte LE per entry
29// - INT64/DOUBLE: 8-byte LE per entry
30//
31// Data page: 1-byte bit_width + RLE/Bit-Pack hybrid encoded indices
32// - bit_width = 0 for dict_size==1, else ceil(log2(dict_size))
33// - Indices encoded using RLE/Bit-Packing Hybrid (see rle.hpp)
34// ---------------------------------------------------------------------------
35
37#include "signet/error.hpp"
38#include "signet/types.hpp"
39
40#include <cassert>
41#include <cstdint>
42#include <cstring>
43#include <limits>
44#include <stdexcept>
45#include <string>
46#include <unordered_map>
47#include <vector>
48
49namespace signet::forge {
50
51// ---------------------------------------------------------------------------
52// Internal helpers
53// ---------------------------------------------------------------------------
54
56namespace detail {
57
66inline int dict_bit_width(size_t dict_size) {
67 if (dict_size <= 1) return 0;
68 // ceil(log2(n)) = number of bits needed to represent 0..n-1
69 int bits = 0;
70 size_t n = dict_size - 1; // max index value
71 while (n > 0) {
72 ++bits;
73 n >>= 1;
74 }
75 return bits;
76}
77
78// -- PLAIN encoding helpers for dictionary page entries ----------------------
79
84inline void plain_encode_value(std::vector<uint8_t>& buf, const std::string& val) {
85 if (val.size() > UINT32_MAX) throw std::length_error("String exceeds Parquet BYTE_ARRAY 4 GB limit");
86 uint32_t len = static_cast<uint32_t>(val.size());
87 buf.push_back(static_cast<uint8_t>((len ) & 0xFF));
88 buf.push_back(static_cast<uint8_t>((len >> 8) & 0xFF));
89 buf.push_back(static_cast<uint8_t>((len >> 16) & 0xFF));
90 buf.push_back(static_cast<uint8_t>((len >> 24) & 0xFF));
91 buf.insert(buf.end(),
92 reinterpret_cast<const uint8_t*>(val.data()),
93 reinterpret_cast<const uint8_t*>(val.data()) + val.size());
94}
95
100inline void plain_encode_value(std::vector<uint8_t>& buf, int32_t val) {
101 uint32_t bits;
102 std::memcpy(&bits, &val, sizeof(bits));
103 buf.push_back(static_cast<uint8_t>((bits ) & 0xFF));
104 buf.push_back(static_cast<uint8_t>((bits >> 8) & 0xFF));
105 buf.push_back(static_cast<uint8_t>((bits >> 16) & 0xFF));
106 buf.push_back(static_cast<uint8_t>((bits >> 24) & 0xFF));
107}
108
113inline void plain_encode_value(std::vector<uint8_t>& buf, int64_t val) {
114 uint64_t bits;
115 std::memcpy(&bits, &val, sizeof(bits));
116 for (int i = 0; i < 8; ++i) {
117 buf.push_back(static_cast<uint8_t>(bits & 0xFF));
118 bits >>= 8;
119 }
120}
121
126inline void plain_encode_value(std::vector<uint8_t>& buf, float val) {
127 uint32_t bits;
128 std::memcpy(&bits, &val, sizeof(bits));
129 buf.push_back(static_cast<uint8_t>((bits ) & 0xFF));
130 buf.push_back(static_cast<uint8_t>((bits >> 8) & 0xFF));
131 buf.push_back(static_cast<uint8_t>((bits >> 16) & 0xFF));
132 buf.push_back(static_cast<uint8_t>((bits >> 24) & 0xFF));
133}
134
139inline void plain_encode_value(std::vector<uint8_t>& buf, double val) {
140 uint64_t bits;
141 std::memcpy(&bits, &val, sizeof(bits));
142 for (int i = 0; i < 8; ++i) {
143 buf.push_back(static_cast<uint8_t>(bits & 0xFF));
144 bits >>= 8;
145 }
146}
147
148// -- PLAIN decoding helpers for dictionary page entries ----------------------
149
159inline std::string plain_decode_value(const uint8_t* data, size_t& pos,
160 size_t size, std::string* /*tag*/) {
161 if (pos + 4 > size) return {};
162 uint32_t len;
163 std::memcpy(&len, data + pos, 4);
164 pos += 4;
165 if (pos + len > size) return {};
166 std::string val(reinterpret_cast<const char*>(data + pos), len);
167 pos += len;
168 return val;
169}
170
177inline int32_t plain_decode_value(const uint8_t* data, size_t& pos,
178 size_t size, int32_t* /*tag*/) {
179 if (pos + 4 > size) return 0;
180 int32_t val;
181 std::memcpy(&val, data + pos, 4);
182 pos += 4;
183 return val;
184}
185
192inline int64_t plain_decode_value(const uint8_t* data, size_t& pos,
193 size_t size, int64_t* /*tag*/) {
194 if (pos + 8 > size) return 0;
195 int64_t val;
196 std::memcpy(&val, data + pos, 8);
197 pos += 8;
198 return val;
199}
200
207inline float plain_decode_value(const uint8_t* data, size_t& pos,
208 size_t size, float* /*tag*/) {
209 if (pos + 4 > size) return 0.0f;
210 float val;
211 std::memcpy(&val, data + pos, 4);
212 pos += 4;
213 return val;
214}
215
222inline double plain_decode_value(const uint8_t* data, size_t& pos,
223 size_t size, double* /*tag*/) {
224 if (pos + 8 > size) return 0.0;
225 double val;
226 std::memcpy(&val, data + pos, 8);
227 pos += 8;
228 return val;
229}
230
231} // namespace detail
232
233// ===========================================================================
234// DictionaryEncoder
235// ===========================================================================
236//
237// Builds a dictionary of unique values and encodes data as indices into
238// that dictionary. The dictionary page is PLAIN-encoded, and the indices
239// page uses RLE_DICTIONARY (bit_width byte + RLE/Bit-Pack hybrid).
240//
241// Usage:
242// DictionaryEncoder<std::string> enc;
243// for (auto& s : strings) enc.put(s);
244// enc.flush();
245// auto dict_page = enc.dictionary_page(); // PLAIN-encoded unique values
246// auto idx_page = enc.indices_page(); // bit_width + RLE-encoded indices
247//
259template <typename T>
261public:
263 DictionaryEncoder() = default;
264
267 static constexpr size_t MAX_DICTIONARY_ENTRIES = 1 << 20; // 1M entries
268
272 [[nodiscard]] bool is_full() const { return dict_values_.size() >= MAX_DICTIONARY_ENTRIES; }
273
283 bool put(const T& value) {
284 auto it = dict_map_.find(value);
285 uint32_t index;
286 if (it == dict_map_.end()) {
287 if (dict_values_.size() >= MAX_DICTIONARY_ENTRIES) {
288 return false; // dictionary full — caller should fall back to PLAIN
289 }
290 index = static_cast<uint32_t>(dict_values_.size());
291 dict_map_.emplace(value, index);
292 dict_values_.push_back(value);
293 } else {
294 index = it->second;
295 }
296 indices_.push_back(index);
297 return true;
298 }
299
304 void flush() {
305 // Nothing to accumulate -- indices are stored incrementally.
306 // This exists for API symmetry with other encoders.
307 }
308
317 [[nodiscard]] std::vector<uint8_t> dictionary_page() const {
318 std::vector<uint8_t> buf;
319 for (const auto& val : dict_values_) {
321 }
322 return buf;
323 }
324
333 [[nodiscard]] std::vector<uint8_t> indices_page() const {
334 int bw = bit_width();
335
336 // RLE-encode the indices
337 auto rle_payload = RleEncoder::encode(indices_.data(),
338 indices_.size(), bw);
339
340 // Prepend the bit_width byte
341 std::vector<uint8_t> result;
342 result.reserve(1 + rle_payload.size());
343 result.push_back(static_cast<uint8_t>(bw));
344 result.insert(result.end(), rle_payload.begin(), rle_payload.end());
345 return result;
346 }
347
351 [[nodiscard]] size_t dictionary_size() const { return dict_values_.size(); }
352
356 [[nodiscard]] size_t num_values() const { return indices_.size(); }
357
361 [[nodiscard]] int bit_width() const {
362 return detail::dict_bit_width(dict_values_.size());
363 }
364
368 void reset() {
369 dict_map_.clear();
370 dict_values_.clear();
371 indices_.clear();
372 }
373
381 [[nodiscard]] bool is_worthwhile() const {
382 if (indices_.empty()) return false;
383 return dict_values_.size() < indices_.size() * 4 / 10;
384 }
385
386private:
387 std::unordered_map<T, uint32_t> dict_map_;
388 std::vector<T> dict_values_;
389 std::vector<uint32_t> indices_;
390};
391
392// ===========================================================================
393// DictionaryDecoder
394// ===========================================================================
395//
396// Decodes a dictionary page (PLAIN-encoded values) and an indices page
397// (bit_width byte + RLE-encoded indices) back to the original typed values.
398//
399// Usage:
400// DictionaryDecoder<std::string> dec(dict_data, dict_size, num_entries, type);
401// auto values = dec.decode(idx_data, idx_size, num_values);
402//
412template <typename T>
414public:
426 DictionaryDecoder(const uint8_t* dict_data, size_t dict_size,
427 size_t num_dict_entries, PhysicalType type)
428 : type_(type)
429 {
430 // Decode dictionary entries from PLAIN-encoded data
431 dict_values_.reserve((std::min)(static_cast<size_t>(num_dict_entries), dict_size));
432 size_t pos = 0;
433 for (size_t i = 0; i < num_dict_entries; ++i) {
434 T* tag = nullptr;
435 dict_values_.push_back(
436 detail::plain_decode_value(dict_data, pos, dict_size, tag));
437 }
438 }
439
451 [[nodiscard]] expected<std::vector<T>> decode(const uint8_t* indices_data,
452 size_t indices_size,
453 size_t num_values) const {
454 if (indices_size == 0 || num_values == 0) return std::vector<T>{};
455
456 // First byte is the bit_width
457 int bw = static_cast<int>(indices_data[0]);
458
459 // Remaining bytes are the RLE-encoded indices
460 auto indices = RleDecoder::decode(indices_data + 1,
461 indices_size - 1,
462 bw, num_values);
463
464 // Map indices back to dictionary values
465 std::vector<T> result;
466 result.reserve(indices.size());
467 for (uint32_t idx : indices) {
468 if (static_cast<size_t>(idx) >= dict_values_.size()) {
470 "dictionary index " + std::to_string(idx)
471 + " out of range (dict size="
472 + std::to_string(dict_values_.size()) + ")"};
473 }
474 result.push_back(dict_values_[idx]);
475 }
476 return result;
477 }
478
482 [[nodiscard]] size_t dictionary_size() const { return dict_values_.size(); }
483
484private:
485 PhysicalType type_;
486 std::vector<T> dict_values_;
487};
488
489} // namespace signet::forge
Dictionary decoder for Parquet PLAIN_DICTIONARY / RLE_DICTIONARY encoding.
size_t dictionary_size() const
Number of entries in the dictionary.
DictionaryDecoder(const uint8_t *dict_data, size_t dict_size, size_t num_dict_entries, PhysicalType type)
Construct a decoder by parsing the raw PLAIN-encoded dictionary page.
expected< std::vector< T > > decode(const uint8_t *indices_data, size_t indices_size, size_t num_values) const
Decode an RLE_DICTIONARY indices page into original typed values.
Dictionary encoder for Parquet PLAIN_DICTIONARY / RLE_DICTIONARY encoding.
bool is_worthwhile() const
Heuristic check: is dictionary encoding worthwhile for this data?
int bit_width() const
Bits per dictionary index (ceil(log2(dictionary_size))).
size_t dictionary_size() const
Number of unique values in the dictionary.
void flush()
Finalize the encoding.
DictionaryEncoder()=default
Default-construct an empty dictionary encoder.
std::vector< uint8_t > indices_page() const
Get the data page as RLE_DICTIONARY-encoded indices.
size_t num_values() const
Total number of values encoded (including duplicates).
static constexpr size_t MAX_DICTIONARY_ENTRIES
Maximum number of dictionary entries before fallback to PLAIN encoding.
std::vector< uint8_t > dictionary_page() const
Get the dictionary page as PLAIN-encoded unique values.
void reset()
Reset the encoder, clearing the dictionary, indices, and all internal state.
bool put(const T &value)
Add a value to the encoding stream.
bool is_full() const
Check whether the dictionary has reached its maximum capacity.
static std::vector< uint32_t > decode(const uint8_t *data, size_t size, int bit_width, size_t num_values)
Decode values from an RLE-encoded buffer without a length prefix.
Definition rle.hpp:626
static std::vector< uint8_t > encode(const uint32_t *values, size_t count, int bit_width)
Encode an array of uint32 values using the RLE/Bit-Pack Hybrid scheme.
Definition rle.hpp:342
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:145
void plain_encode_value(std::vector< uint8_t > &buf, const std::string &val)
Append a string value in PLAIN BYTE_ARRAY format (4-byte LE length prefix + raw bytes).
std::string plain_decode_value(const uint8_t *data, size_t &pos, size_t size, std::string *)
Decode a string from PLAIN BYTE_ARRAY format at data[pos].
int dict_bit_width(size_t dict_size)
Compute the minimum bit width needed to represent dictionary indices.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
@ CORRUPT_DATA
Decoded data is corrupt or inconsistent (e.g. out-of-range dictionary index).
RLE/Bit-Packing Hybrid encoding and decoding (Parquet spec).
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:101
Parquet format enumerations, type traits, and statistics structs.