325 if (
auto r =
decimal->deserialize(dec); !r.has_value())
return r.error();
333 if (
auto r =
timestamp->deserialize(dec); !r.has_value())
return r.error();
341 if (
auto r =
integer->deserialize(dec); !r.has_value())
return r.error();
370 std::optional<std::string>
max;
371 std::optional<std::string>
min;
381 if (
max.has_value()) {
385 if (
min.has_value()) {
474 :
key(std::move(k)),
value(std::move(v)) {}
480 if (
value.has_value()) {
518 if ((seen & 0x01u) == 0u) {
531 std::optional<PhysicalType>
type;
546 if (
type.has_value()) {
569 if (
scale.has_value()) {
656 if (
auto r =
logical_type->deserialize(dec); !r.has_value())
return r.error();
727 "DataPageHeader.definition_level_encoding: expected I32"};
735 "DataPageHeader.repetition_level_encoding: expected I32"};
745 if (
auto r =
statistics->deserialize(dec); !r.has_value())
return r.error();
756 if ((seen & 0x0Fu) != 0x0Fu) {
809 "DictionaryPageHeader.is_sorted: expected BOOL"};
895 "DataPageHeaderV2.definition_levels_byte_length: expected I32"};
902 "DataPageHeaderV2.repetition_levels_byte_length: expected I32"};
937 std::optional<int32_t>
crc;
953 if (
crc.has_value()) {
1009 "PageHeader.data_page_header: expected STRUCT"};
1012 if (
auto r =
data_page_header->deserialize(dec); !r.has_value())
return r.error();
1021 "PageHeader.dictionary_page_header: expected STRUCT"};
1030 "PageHeader.data_page_header_v2: expected STRUCT"};
1153 static constexpr int32_t MAX_ENCODINGS = 10000;
1154 if (count < 0 || count > MAX_ENCODINGS) {
1156 "ColumnMetaData.encodings: list exceeds maximum size"};
1158 encodings.resize(
static_cast<size_t>(count));
1159 for (int32_t i = 0; i < count; ++i) {
1169 "ColumnMetaData.path_in_schema: expected LIST"};
1172 static constexpr int32_t MAX_PATH_ELEMS = 10000;
1173 if (count < 0 || count > MAX_PATH_ELEMS) {
1175 "ColumnMetaData.path_in_schema: list exceeds maximum size"};
1178 for (int32_t i = 0; i < count; ++i) {
1201 "ColumnMetaData.total_uncompressed_size: expected I64"};
1209 "ColumnMetaData.total_compressed_size: expected I64"};
1217 "ColumnMetaData.key_value_metadata: expected LIST"};
1220 static constexpr int32_t MAX_STRUCT_LIST_SIZE = 10000;
1221 if (count < 0 || count > MAX_STRUCT_LIST_SIZE) {
1223 "ColumnMetaData.key_value_metadata: list exceeds maximum size"};
1227 for (int32_t i = 0; i < count; ++i) {
1238 "ColumnMetaData.data_page_offset: expected I64"};
1246 "ColumnMetaData.index_page_offset: expected I64"};
1253 "ColumnMetaData.dictionary_page_offset: expected I64"};
1260 "ColumnMetaData.statistics: expected STRUCT"};
1263 if (
auto r =
statistics->deserialize(dec); !r.has_value())
return r.error();
1274 if ((seen & 0xFFu) != 0xFFu) {
1276 "ColumnMetaData: missing one or more required fields (1-7, 9)"};
1448 "EncryptionAlgorithm.AES_GCM_V1: expected STRUCT"};
1452 if (
auto r =
aes_gcm_v1->deserialize(dec); !r.has_value())
return r.error();
1458 "EncryptionAlgorithm.AES_GCM_CTR_V1: expected STRUCT"};
1462 if (
auto r =
aes_gcm_ctr_v1->deserialize(dec); !r.has_value())
return r.error();
1509 "EncryptionWithColumnKey.path_in_schema: expected LIST"};
1512 static constexpr int32_t MAX_PATH = 10000;
1513 if (count < 0 || count > MAX_PATH) {
1515 "EncryptionWithColumnKey.path_in_schema: list exceeds maximum size"};
1518 for (int32_t i = 0; i < count; ++i) {
1526 "EncryptionWithColumnKey.key_metadata: expected BINARY"};
1585 "ColumnCryptoMetaData.FOOTER_KEY: expected STRUCT"};
1594 "ColumnCryptoMetaData.COLUMN_KEY: expected STRUCT"};
1598 if (
auto r =
column_key->deserialize(dec); !r.has_value())
return r.error();
1646 "FileCryptoMetaData.encryption_algorithm: expected STRUCT"};
1654 "FileCryptoMetaData.key_metadata: expected BINARY"};
1701 "BloomFilterAlgorithm.BLOCK: expected STRUCT"};
1788 "BloomFilterCompression.UNCOMPRESSED: expected STRUCT"};
1968 if (
auto r =
meta_data->deserialize(dec); !r.has_value())
return r.error();
1976 if (
auto r =
crypto_metadata->deserialize(dec); !r.has_value())
return r.error();
1982 "ColumnChunk.bloom_filter_offset: expected I64"};
1989 "ColumnChunk.bloom_filter_length: expected I32"};
1996 "ColumnChunk.column_index_offset: expected I64"};
2003 "ColumnChunk.column_index_length: expected I32"};
2010 "ColumnChunk.offset_index_offset: expected I64"};
2017 "ColumnChunk.offset_index_length: expected I32"};
2168 for (
const auto& col :
columns) {
2203 static constexpr int32_t MAX_STRUCT_LIST_SIZE = 10000;
2204 if (count < 0 || count > MAX_STRUCT_LIST_SIZE) {
2206 "RowGroup.columns: list exceeds maximum size"};
2208 columns.resize(
static_cast<size_t>(count));
2209 for (int32_t i = 0; i < count; ++i) {
2234 static constexpr int32_t MAX_SORT_COLS = 10000;
2235 if (count < 0 || count > MAX_SORT_COLS) {
2237 "RowGroup.sorting_columns: list exceeds maximum size"};
2240 for (int32_t i = 0; i < count; ++i) {
2283 for (
const auto& elem :
schema) {
2284 elem.serialize(enc);
2342 static constexpr int32_t MAX_STRUCT_LIST_SIZE = 10000;
2343 if (count < 0 || count > MAX_STRUCT_LIST_SIZE) {
2345 "FileMetaData.schema: list exceeds maximum size"};
2347 schema.resize(
static_cast<size_t>(count));
2348 for (int32_t i = 0; i < count; ++i) {
2367 static constexpr int32_t MAX_STRUCT_LIST_SIZE_RG = 10000;
2368 if (count < 0 || count > MAX_STRUCT_LIST_SIZE_RG) {
2370 "FileMetaData.row_groups: list exceeds maximum size"};
2372 row_groups.resize(
static_cast<size_t>(count));
2373 for (int32_t i = 0; i < count; ++i) {
2384 "FileMetaData.key_value_metadata: expected LIST"};
2387 static constexpr int32_t MAX_KV_LIST_SIZE = 1'000'000;
2388 if (count < 0 || count > MAX_KV_LIST_SIZE) {
2390 "FileMetaData.key_value_metadata: list exceeds maximum size"};
2394 for (int32_t i = 0; i < count; ++i) {
2411 "FileMetaData.column_orders: expected LIST"};
2414 static constexpr int32_t MAX_COL_ORDERS = 10000;
2415 if (count < 0 || count > MAX_COL_ORDERS) {
2417 "FileMetaData.column_orders: list exceeds maximum size"};
2421 for (int32_t i = 0; i < count; ++i) {
bool has_value() const
Return true if the result represents success (no error).
A lightweight result type that holds either a success value of type T or an Error.
Thrift Compact Protocol reader.
void begin_struct()
Push a new field-ID context for reading a nested struct.
void end_struct()
Pop the field-ID context after finishing a nested struct.
FieldHeader read_field_header()
Read a field header.
int64_t read_i64()
Read a 64-bit integer (zigzag + varint64 decode).
int8_t read_i8()
Read an 8-bit signed integer (single raw byte, I8 wire type).
ListHeader read_list_header()
Read a list header. Returns element type and count.
void skip_field(uint8_t thrift_type)
Skip a field without parsing its value.
std::string read_string()
Read a string (varint-length-prefixed UTF-8 bytes).
std::vector< uint8_t > read_binary()
Read raw binary data (varint-length-prefixed bytes).
bool good() const
Returns true if no errors have occurred (no bounds violations).
bool read_bool()
Read a boolean value.
int32_t read_i32()
Read a 32-bit integer (zigzag + varint decode).
Thrift Compact Protocol writer.
void begin_struct()
Push a new field-ID context for a nested struct.
void end_struct()
Pop the field-ID context after finishing a nested struct.
void write_string(const std::string &val)
Write a string as varint-length-prefixed UTF-8 bytes.
void write_field_bool(int16_t field_id, bool val)
Write a bool field where the value is embedded in the field header's type nibble (1 = true, 2 = false).
void write_field(int16_t field_id, uint8_t thrift_type)
Write a field header.
void write_i32(int32_t val)
Write a 32-bit integer as zigzag + varint.
void write_stop()
Write struct stop marker (0x00).
void write_i64(int64_t val)
Write a 64-bit integer as zigzag + varint.
void write_i8(int8_t val)
Write an 8-bit signed integer as a single raw byte (I8 wire type).
void write_binary(const uint8_t *data, size_t len)
Write raw binary data as varint-length-prefixed bytes.
void write_list_header(uint8_t elem_type, int32_t size)
Write a list header.
Thrift Compact Protocol encoder and decoder for Parquet metadata serialization.
constexpr uint8_t STRUCT
Nested struct.
constexpr uint8_t I32
32-bit signed integer (zigzag + varint).
constexpr uint8_t BOOL_FALSE
Boolean false (embedded in field header).
constexpr uint8_t BINARY
Length-prefixed bytes (also used for STRING).
constexpr uint8_t LIST
List container.
constexpr uint8_t STOP
Struct stop marker.
constexpr uint8_t BOOL_TRUE
Boolean true (embedded in field header).
constexpr uint8_t I64
64-bit signed integer (zigzag + varint).
constexpr uint8_t I8
8-bit signed integer.
SortOrder
Sort order for column statistics (parquet.thrift SortOrder enum).
@ SIGNED
Values compared as signed integers or IEEE 754 floats.
@ UNKNOWN
Sort order unknown or inapplicable.
@ UNSIGNED
Values compared as unsigned integers or bytes.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
@ BYTE_ARRAY
Variable-length byte sequence (strings, binary).
constexpr int32_t PARQUET_VERSION
Parquet format version written to the file footer.
Compression
Parquet compression codecs.
@ UNCOMPRESSED
No compression.
ConvertedType
Legacy Parquet converted types for backward compatibility with older readers.
@ THRIFT_DECODE_ERROR
The Thrift Compact Protocol decoder encountered invalid or malicious input.
Encoding
Parquet page encoding types.
@ RLE
Run-length / bit-packed hybrid (used for booleans and def/rep levels).
@ PLAIN_DICTIONARY
Legacy dictionary encoding (Parquet 1.0).
@ PLAIN
Values stored back-to-back in their native binary layout.
PageType
Parquet page types within a column chunk.
@ DATA_PAGE
Data page (Parquet 1.0 format).
Repetition
Parquet field repetition types (nullability / cardinality).
AES-GCM-CTR-V1 encryption algorithm parameters (parquet.thrift AesGcmCtrV1).
std::optional< bool > supply_aad_prefix
Caller supplies AAD prefix (field 3).
std::optional< std::vector< uint8_t > > aad_prefix
AAD prefix bytes (field 1).
std::optional< bool > aad_file_unique
Unique AAD per file (field 2).
void serialize(CompactEncoder &enc) const
expected< void > deserialize(CompactDecoder &dec)
AES-GCM-V1 encryption algorithm parameters (parquet.thrift AesGcmV1).
std::optional< std::vector< uint8_t > > aad_prefix
AAD prefix bytes (field 1).
expected< void > deserialize(CompactDecoder &dec)
std::optional< bool > aad_file_unique
Unique AAD per file (field 2).
void serialize(CompactEncoder &enc) const
std::optional< bool > supply_aad_prefix
Caller supplies AAD prefix (field 3).
BloomFilterAlgorithm union: BLOCK (SplitBlock) is the only defined algorithm.
BloomFilterAlgorithm()=default
enum signet::forge::thrift::BloomFilterAlgorithm::Kind kind
void serialize(CompactEncoder &enc) const
expected< void > deserialize(CompactDecoder &dec)
BloomFilterCompression union: UNCOMPRESSED (field 1) is the only defined mode.
expected< void > deserialize(CompactDecoder &dec)
enum signet::forge::thrift::BloomFilterCompression::Kind kind
BloomFilterCompression()=default
void serialize(CompactEncoder &enc) const
BloomFilterHash union: XXHASH (field 1) is the only defined hash function.
void serialize(CompactEncoder &enc) const
enum signet::forge::thrift::BloomFilterHash::Kind kind
expected< void > deserialize(CompactDecoder &dec)
BloomFilterHash()=default
Parquet column chunk descriptor (parquet.thrift fields 1-13).
std::optional< int64_t > column_index_offset
Column index offset (field 10).
expected< void > deserialize(CompactDecoder &dec)
std::optional< std::string > file_path
External file path (field 1).
void serialize(CompactEncoder &enc) const
int64_t file_offset
Byte offset in file (field 2).
std::optional< int64_t > bloom_filter_offset
Bloom filter offset (field 8).
std::optional< int64_t > offset_index_offset
Offset index offset (field 12).
std::optional< int32_t > offset_index_length
Offset index byte length (field 13).
std::optional< int32_t > column_index_length
Column index byte length (field 11).
std::optional< ColumnMetaData > meta_data
Inline column metadata (field 3).
std::optional< ColumnCryptoMetaData > crypto_metadata
PME crypto metadata (field 4).
std::optional< int32_t > bloom_filter_length
Bloom filter byte length (field 9).
ColumnOrder union: describes how a column's values are compared for statistics.
void serialize(CompactEncoder &enc) const
expected< void > deserialize(CompactDecoder &dec)
enum signet::forge::thrift::ColumnOrder::Kind kind
DecimalType: fixed-point decimal logical type.
int32_t precision
Total number of significant decimal digits.
expected< void > deserialize(CompactDecoder &dec)
DecimalType(int32_t s, int32_t p)
int32_t scale
Number of digits to the right of the decimal point.
void serialize(CompactEncoder &enc) const
EncryptionAlgorithm union: AES-GCM-V1 (field 1) or AES-GCM-CTR-V1 (field 2).
std::optional< AesGcmV1 > aes_gcm_v1
Populated when kind == AES_GCM_V1.
enum signet::forge::thrift::EncryptionAlgorithm::Kind kind
expected< void > deserialize(CompactDecoder &dec)
void serialize(CompactEncoder &enc) const
EncryptionAlgorithm()=default
std::optional< AesGcmCtrV1 > aes_gcm_ctr_v1
Populated when kind == AES_GCM_CTR_V1.
EncryptionWithColumnKey: per-column encryption key binding (parquet.thrift).
std::optional< std::vector< uint8_t > > key_metadata
Serialized key metadata (field 2).
void serialize(CompactEncoder &enc) const
EncryptionWithColumnKey()=default
std::vector< std::string > path_in_schema
Schema path of the encrypted column (field 1).
expected< void > deserialize(CompactDecoder &dec)
IntType: integer logical type with explicit width and signedness.
bool is_signed
True for signed integers; false for unsigned.
expected< void > deserialize(CompactDecoder &dec)
IntType(int8_t bw, bool s)
int8_t bit_width
Bit width: 8, 16, 32, or 64.
void serialize(CompactEncoder &enc) const
Parquet KeyValue metadata entry (parquet.thrift field IDs 1-2).
KeyValue(std::string k, std::string v)
std::optional< std::string > value
Metadata value (field 2, optional).
void serialize(CompactEncoder &enc) const
expected< void > deserialize(CompactDecoder &dec)
std::string key
Metadata key (field 1, required).
LogicalTypeUnion: Thrift wire union for parquet.thrift LogicalType (field 10 of SchemaElement).
void serialize(CompactEncoder &enc) const
expected< void > deserialize(CompactDecoder &dec)
LogicalTypeUnion()=default
std::optional< TimestampType > timestamp
Populated when kind == TIMESTAMP.
std::optional< DecimalType > decimal
Populated when kind == DECIMAL.
@ DECIMAL
DecimalType (field 5 of union).
@ INT
IntType (field 11 of union).
@ UUID
UUIDType (field 15 of union).
@ STRING
StringType (field 1 of union).
@ TIMESTAMP
TimestampType (field 9 of union).
@ NONE
No logical type annotation.
std::optional< IntType > integer
Populated when kind == INT.
enum signet::forge::thrift::LogicalTypeUnion::Kind kind
Parquet row group (parquet.thrift fields 1-4).
std::vector< ColumnChunk > columns
Column chunks (field 1).
int64_t total_byte_size
Total byte size (field 2).
void serialize(CompactEncoder &enc) const
int64_t num_rows
Number of rows (field 3).
expected< void > deserialize(CompactDecoder &dec)
std::vector< SortingColumn > sorting_columns
Sort keys (field 4, optional).
Parquet schema element (parquet.thrift fields 1-10).
void serialize(CompactEncoder &enc) const
std::optional< int32_t > type_length
Type length for FIXED_LEN_BYTE_ARRAY (field 2).
std::optional< ConvertedType > converted_type
Legacy converted type (field 6).
expected< void > deserialize(CompactDecoder &dec)
std::string name
Column or group name (field 4, required).
std::optional< LogicalTypeUnion > logical_type
LogicalType union (field 10, preferred).
std::optional< Repetition > repetition_type
REQUIRED/OPTIONAL/REPEATED (field 3).
std::optional< int32_t > num_children
Number of children for group nodes (field 5).
std::optional< int32_t > scale
Decimal scale (field 7).
std::optional< int32_t > precision
Decimal precision (field 8).
std::optional< int32_t > field_id
Field ID for nested type evolution (field 9).
std::optional< PhysicalType > type
Physical type (field 1, absent for group nodes).
SortingColumn: describes sort key for a column within a RowGroup.
SortingColumn(int32_t idx, bool desc, bool nf)
bool nulls_first
True if nulls sort before non-null values (field 3).
void serialize(CompactEncoder &enc) const
expected< void > deserialize(CompactDecoder &dec)
int32_t column_idx
Zero-based column index within the schema (field 1).
bool descending
True for descending sort order (field 2).
Parquet column statistics (parquet.thrift fields 1-6).
std::optional< std::string > min
Old-style min (field 2, deprecated).
std::optional< std::string > max_value
New-style max value (field 5, preferred).
std::optional< int64_t > null_count
Number of null values (field 3).
expected< void > deserialize(CompactDecoder &dec)
void serialize(CompactEncoder &enc) const
std::optional< int64_t > distinct_count
Approximate distinct count (field 4).
std::optional< std::string > min_value
New-style min value (field 6, preferred).
std::optional< std::string > max
Old-style max (field 1, deprecated).
Time unit discriminator for TimestampType (parquet.thrift TimeUnit union).
enum signet::forge::thrift::TimeUnit::Kind kind
void serialize(CompactEncoder &enc) const
expected< void > deserialize(CompactDecoder &dec)
TimestampType: timestamp logical type with UTC adjustment and time unit.
TimestampType(bool utc, TimeUnit u)
expected< void > deserialize(CompactDecoder &dec)
void serialize(CompactEncoder &enc) const
TimeUnit unit
Time unit (MILLIS, MICROS, or NANOS).
bool is_adjusted_to_utc
True if the timestamp is UTC-normalized.
Parquet format enumerations, type traits, and statistics structs.