46#include <unordered_map>
67 if (dict_size <= 1)
return 0;
70 size_t n = dict_size - 1;
85 if (val.size() > UINT32_MAX)
throw std::length_error(
"String exceeds Parquet BYTE_ARRAY 4 GB limit");
86 uint32_t len =
static_cast<uint32_t
>(val.size());
87 buf.push_back(
static_cast<uint8_t
>((len ) & 0xFF));
88 buf.push_back(
static_cast<uint8_t
>((len >> 8) & 0xFF));
89 buf.push_back(
static_cast<uint8_t
>((len >> 16) & 0xFF));
90 buf.push_back(
static_cast<uint8_t
>((len >> 24) & 0xFF));
92 reinterpret_cast<const uint8_t*
>(val.data()),
93 reinterpret_cast<const uint8_t*
>(val.data()) + val.size());
102 std::memcpy(&bits, &val,
sizeof(bits));
103 buf.push_back(
static_cast<uint8_t
>((bits ) & 0xFF));
104 buf.push_back(
static_cast<uint8_t
>((bits >> 8) & 0xFF));
105 buf.push_back(
static_cast<uint8_t
>((bits >> 16) & 0xFF));
106 buf.push_back(
static_cast<uint8_t
>((bits >> 24) & 0xFF));
115 std::memcpy(&bits, &val,
sizeof(bits));
116 for (
int i = 0; i < 8; ++i) {
117 buf.push_back(
static_cast<uint8_t
>(bits & 0xFF));
128 std::memcpy(&bits, &val,
sizeof(bits));
129 buf.push_back(
static_cast<uint8_t
>((bits ) & 0xFF));
130 buf.push_back(
static_cast<uint8_t
>((bits >> 8) & 0xFF));
131 buf.push_back(
static_cast<uint8_t
>((bits >> 16) & 0xFF));
132 buf.push_back(
static_cast<uint8_t
>((bits >> 24) & 0xFF));
141 std::memcpy(&bits, &val,
sizeof(bits));
142 for (
int i = 0; i < 8; ++i) {
143 buf.push_back(
static_cast<uint8_t
>(bits & 0xFF));
160 size_t size, std::string* ) {
161 if (pos + 4 > size)
return {};
163 std::memcpy(&len, data + pos, 4);
165 if (pos + len > size)
return {};
166 std::string val(
reinterpret_cast<const char*
>(data + pos), len);
178 size_t size, int32_t* ) {
179 if (pos + 4 > size)
return 0;
181 std::memcpy(&val, data + pos, 4);
193 size_t size, int64_t* ) {
194 if (pos + 8 > size)
return 0;
196 std::memcpy(&val, data + pos, 8);
208 size_t size,
float* ) {
209 if (pos + 4 > size)
return 0.0f;
211 std::memcpy(&val, data + pos, 4);
223 size_t size,
double* ) {
224 if (pos + 8 > size)
return 0.0;
226 std::memcpy(&val, data + pos, 8);
283 bool put(
const T& value) {
284 auto it = dict_map_.find(value);
286 if (it == dict_map_.end()) {
290 index =
static_cast<uint32_t
>(dict_values_.size());
291 dict_map_.emplace(value, index);
292 dict_values_.push_back(value);
296 indices_.push_back(index);
318 std::vector<uint8_t> buf;
319 for (
const auto& val : dict_values_) {
338 indices_.size(), bw);
341 std::vector<uint8_t> result;
342 result.reserve(1 + rle_payload.size());
343 result.push_back(
static_cast<uint8_t
>(bw));
344 result.insert(result.end(), rle_payload.begin(), rle_payload.end());
356 [[nodiscard]]
size_t num_values()
const {
return indices_.size(); }
370 dict_values_.clear();
382 if (indices_.empty())
return false;
383 return dict_values_.size() < indices_.size() * 4 / 10;
387 std::unordered_map<T, uint32_t> dict_map_;
388 std::vector<T> dict_values_;
389 std::vector<uint32_t> indices_;
431 dict_values_.reserve((std::min)(
static_cast<size_t>(num_dict_entries), dict_size));
433 for (
size_t i = 0; i < num_dict_entries; ++i) {
435 dict_values_.push_back(
453 size_t num_values)
const {
454 if (indices_size == 0 || num_values == 0)
return std::vector<T>{};
457 int bw =
static_cast<int>(indices_data[0]);
465 std::vector<T> result;
466 result.reserve(indices.size());
467 for (uint32_t idx : indices) {
468 if (
static_cast<size_t>(idx) >= dict_values_.size()) {
470 "dictionary index " + std::to_string(idx)
471 +
" out of range (dict size="
472 + std::to_string(dict_values_.size()) +
")"};
474 result.push_back(dict_values_[idx]);
486 std::vector<T> dict_values_;
Dictionary decoder for Parquet PLAIN_DICTIONARY / RLE_DICTIONARY encoding.
size_t dictionary_size() const
Number of entries in the dictionary.
DictionaryDecoder(const uint8_t *dict_data, size_t dict_size, size_t num_dict_entries, PhysicalType type)
Construct a decoder by parsing the raw PLAIN-encoded dictionary page.
expected< std::vector< T > > decode(const uint8_t *indices_data, size_t indices_size, size_t num_values) const
Decode an RLE_DICTIONARY indices page into original typed values.
Dictionary encoder for Parquet PLAIN_DICTIONARY / RLE_DICTIONARY encoding.
bool is_worthwhile() const
Heuristic check: is dictionary encoding worthwhile for this data?
int bit_width() const
Bits per dictionary index (ceil(log2(dictionary_size))).
size_t dictionary_size() const
Number of unique values in the dictionary.
void flush()
Finalize the encoding.
DictionaryEncoder()=default
Default-construct an empty dictionary encoder.
std::vector< uint8_t > indices_page() const
Get the data page as RLE_DICTIONARY-encoded indices.
size_t num_values() const
Total number of values encoded (including duplicates).
static constexpr size_t MAX_DICTIONARY_ENTRIES
Maximum number of dictionary entries before fallback to PLAIN encoding.
std::vector< uint8_t > dictionary_page() const
Get the dictionary page as PLAIN-encoded unique values.
void reset()
Reset the encoder, clearing the dictionary, indices, and all internal state.
bool put(const T &value)
Add a value to the encoding stream.
bool is_full() const
Check whether the dictionary has reached its maximum capacity.
static std::vector< uint32_t > decode(const uint8_t *data, size_t size, int bit_width, size_t num_values)
Decode values from an RLE-encoded buffer without a length prefix.
static std::vector< uint8_t > encode(const uint32_t *values, size_t count, int bit_width)
Encode an array of uint32 values using the RLE/Bit-Pack Hybrid scheme.
A lightweight result type that holds either a success value of type T or an Error.
void plain_encode_value(std::vector< uint8_t > &buf, const std::string &val)
Append a string value in PLAIN BYTE_ARRAY format (4-byte LE length prefix + raw bytes).
std::string plain_decode_value(const uint8_t *data, size_t &pos, size_t size, std::string *)
Decode a string from PLAIN BYTE_ARRAY format at data[pos].
int dict_bit_width(size_t dict_size)
Compute the minimum bit width needed to represent dictionary indices.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
@ CORRUPT_DATA
Decoded data is corrupt or inconsistent (e.g. out-of-range dictionary index).
RLE/Bit-Packing Hybrid encoding and decoding (Parquet spec).
Lightweight error value carrying an ErrorCode and a human-readable message.
Parquet format enumerations, type traits, and statistics structs.