121 if (count < 0 ||
static_cast<size_t>(count) > 10'000'000) {
126 for (int32_t i = 0; i < count; ++i) {
225 if (count < 0 ||
static_cast<size_t>(count) > 10'000'000) {
228 null_pages.resize(
static_cast<size_t>(count));
229 for (int32_t i = 0; i < count; ++i) {
238 if (count < 0 ||
static_cast<size_t>(count) > 10'000'000) {
241 min_values.resize(
static_cast<size_t>(count));
242 for (int32_t i = 0; i < count; ++i) {
251 if (count < 0 ||
static_cast<size_t>(count) > 10'000'000) {
254 max_values.resize(
static_cast<size_t>(count));
255 for (int32_t i = 0; i < count; ++i) {
267 if (count < 0 ||
static_cast<size_t>(count) > 10'000'000) {
271 for (int32_t i = 0; i < count; ++i) {
295 const std::string& min_val,
296 const std::string& max_val,
300 auto typed_compare = [&](
const std::string& a,
const std::string& b) ->
int {
301 switch (physical_type) {
303 if (a.size() >=
sizeof(int32_t) && b.size() >=
sizeof(int32_t)) {
304 auto va = from_le_bytes<int32_t>({
reinterpret_cast<const uint8_t*
>(a.data()),
305 reinterpret_cast<const uint8_t*
>(a.data()) + a.size()});
306 auto vb = from_le_bytes<int32_t>({
reinterpret_cast<const uint8_t*
>(b.data()),
307 reinterpret_cast<const uint8_t*
>(b.data()) + b.size()});
308 return (va < vb) ? -1 : (va > vb) ? 1 : 0;
313 if (a.size() >=
sizeof(int64_t) && b.size() >=
sizeof(int64_t)) {
314 auto va = from_le_bytes<int64_t>({
reinterpret_cast<const uint8_t*
>(a.data()),
315 reinterpret_cast<const uint8_t*
>(a.data()) + a.size()});
316 auto vb = from_le_bytes<int64_t>({
reinterpret_cast<const uint8_t*
>(b.data()),
317 reinterpret_cast<const uint8_t*
>(b.data()) + b.size()});
318 return (va < vb) ? -1 : (va > vb) ? 1 : 0;
323 if (a.size() >=
sizeof(
float) && b.size() >=
sizeof(
float)) {
324 auto va = from_le_bytes<float>({
reinterpret_cast<const uint8_t*
>(a.data()),
325 reinterpret_cast<const uint8_t*
>(a.data()) + a.size()});
326 auto vb = from_le_bytes<float>({
reinterpret_cast<const uint8_t*
>(b.data()),
327 reinterpret_cast<const uint8_t*
>(b.data()) + b.size()});
328 return (va < vb) ? -1 : (va > vb) ? 1 : 0;
333 if (a.size() >=
sizeof(
double) && b.size() >=
sizeof(
double)) {
334 auto va = from_le_bytes<double>({
reinterpret_cast<const uint8_t*
>(a.data()),
335 reinterpret_cast<const uint8_t*
>(a.data()) + a.size()});
336 auto vb = from_le_bytes<double>({
reinterpret_cast<const uint8_t*
>(b.data()),
337 reinterpret_cast<const uint8_t*
>(b.data()) + b.size()});
338 return (va < vb) ? -1 : (va > vb) ? 1 : 0;
346 return (a < b) ? -1 : (a > b) ? 1 : 0;
349 std::vector<size_t> matching;
352 for (
size_t i = 0; i < num_pages; ++i) {
368 matching.push_back(i);
399 pages_.emplace_back();
405 if (!pages_.empty()) {
406 pages_.back().min_value = min_val;
413 if (!pages_.empty()) {
414 pages_.back().max_value = max_val;
421 if (!pages_.empty()) {
422 pages_.back().null_page = is_null;
429 if (!pages_.empty()) {
430 pages_.back().null_count = count;
437 if (!pages_.empty()) {
438 pages_.back().first_row_index = row_index;
446 if (!pages_.empty()) {
447 pages_.back().offset = offset;
448 pages_.back().compressed_size = compressed_size;
467 bool has_any_null_counts =
false;
468 for (
const auto& p : pages_) {
473 if (p.null_count > 0) {
474 has_any_null_counts =
true;
485 (void)has_any_null_counts;
496 for (
const auto& p : pages_) {
513 [[nodiscard]]
size_t num_pages()
const {
return pages_.size(); }
518 std::string min_value;
519 std::string max_value;
520 bool null_page =
false;
521 int64_t null_count = 0;
522 int64_t first_row_index = 0;
524 int32_t compressed_size = 0;
527 std::vector<PageInfo> pages_;
535 const std::vector<std::string>& values,
538 if (values.size() <= 1) {
543 auto typed_cmp = [pt](
const std::string& a,
const std::string& b) ->
int {
546 if (a.size() >=
sizeof(int32_t) && b.size() >=
sizeof(int32_t)) {
548 std::memcpy(&va, a.data(),
sizeof(int32_t));
549 std::memcpy(&vb, b.data(),
sizeof(int32_t));
550 return (va < vb) ? -1 : (va > vb) ? 1 : 0;
554 if (a.size() >=
sizeof(int64_t) && b.size() >=
sizeof(int64_t)) {
556 std::memcpy(&va, a.data(),
sizeof(int64_t));
557 std::memcpy(&vb, b.data(),
sizeof(int64_t));
558 return (va < vb) ? -1 : (va > vb) ? 1 : 0;
562 if (a.size() >=
sizeof(
float) && b.size() >=
sizeof(
float)) {
564 std::memcpy(&va, a.data(),
sizeof(
float));
565 std::memcpy(&vb, b.data(),
sizeof(
float));
566 return (va < vb) ? -1 : (va > vb) ? 1 : 0;
570 if (a.size() >=
sizeof(
double) && b.size() >=
sizeof(
double)) {
572 std::memcpy(&va, a.data(),
sizeof(
double));
573 std::memcpy(&vb, b.data(),
sizeof(
double));
574 return (va < vb) ? -1 : (va > vb) ? 1 : 0;
584 bool ascending =
true;
585 bool descending =
true;
587 for (
size_t i = 1; i < values.size(); ++i) {
588 int cmp = typed_cmp(values[i], values[i - 1]);
589 if (cmp < 0) ascending =
false;
590 if (cmp > 0) descending =
false;
591 if (!ascending && !descending)
break;
Builder that accumulates per-page statistics during column writing.
void set_page_location(int64_t offset, int32_t compressed_size)
Record the page location (file offset and compressed size) for the current page.
size_t num_pages() const
Number of pages accumulated so far.
void set_min(const std::string &min_val)
Record the minimum value for the current page (binary-encoded).
void set_max(const std::string &max_val)
Record the maximum value for the current page (binary-encoded).
void set_null_page(bool is_null)
Mark the current page as all-nulls (or not).
void start_page()
Start a new page. Must be called before set_min(), set_max(), and the other per-page setters, which all write to the most recently started page.
void set_first_row_index(int64_t row_index)
Record the first row index for the current page (relative to row group).
ColumnIndex build_column_index(PhysicalType pt=PhysicalType::BYTE_ARRAY) const
Finalize and return the ColumnIndex from accumulated page info.
void reset()
Reset the builder, discarding all accumulated page info.
OffsetIndex build_offset_index() const
Finalize and return the OffsetIndex from accumulated page info.
void set_null_count(int64_t count)
Record the null count for the current page.
Thrift Compact Protocol reader.
void begin_struct()
Push a new field-ID context for reading a nested struct.
void end_struct()
Pop the field-ID context after finishing a nested struct.
FieldHeader read_field_header()
Read a field header.
int64_t read_i64()
Read a 64-bit integer (zigzag + varint64 decode).
ListHeader read_list_header()
Read a list header. Returns element type and count.
void skip_field(uint8_t thrift_type)
Skip a field without parsing its value.
std::string read_string()
Read a string (varint-length-prefixed UTF-8 bytes).
bool read_bool()
Read a boolean value.
int32_t read_i32()
Read a 32-bit integer (zigzag + varint decode).
Thrift Compact Protocol writer.
void begin_struct()
Push a new field-ID context for a nested struct.
void write_bool(bool val)
Write a standalone bool (not embedded in a field header).
void end_struct()
Pop the field-ID context after finishing a nested struct.
void write_string(const std::string &val)
Write a string as varint-length-prefixed UTF-8 bytes.
void write_field(int16_t field_id, uint8_t thrift_type)
Write a field header.
void write_i32(int32_t val)
Write a 32-bit integer as zigzag + varint.
void write_stop()
Write struct stop marker (0x00).
void write_i64(int64_t val)
Write a 64-bit integer as zigzag + varint.
void write_list_header(uint8_t elem_type, int32_t size)
Write a list header.
Thrift Compact Protocol encoder and decoder for Parquet metadata serialization.
constexpr uint8_t STRUCT
Nested struct.
constexpr uint8_t I32
32-bit signed integer (zigzag + varint).
constexpr uint8_t BINARY
Length-prefixed bytes (also used for STRING).
constexpr uint8_t LIST
List container.
constexpr uint8_t STOP
Struct stop marker.
constexpr uint8_t BOOL_TRUE
Boolean true (embedded in field header).
constexpr uint8_t I64
64-bit signed integer (zigzag + varint).
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BYTE_ARRAY
Variable-length byte sequence (strings, binary).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
Per-column-chunk statistics tracker and little-endian byte helpers.
Per-page min/max statistics for predicate pushdown.
void deserialize(thrift::CompactDecoder &dec)
Deserialize this ColumnIndex from a Thrift compact decoder.
bool valid_
False if deserialization failed (M-V7).
bool valid() const
Check if deserialization was successful.
BoundaryOrder boundary_order
Boundary order of min values.
std::vector< bool > null_pages
True if the corresponding page is all nulls.
std::vector< size_t > filter_pages(const std::string &min_val, const std::string &max_val, PhysicalType physical_type=PhysicalType::BYTE_ARRAY) const
Filter pages by a value range for predicate pushdown.
std::vector< std::string > max_values
Binary-encoded maximum value per page.
BoundaryOrder
Ordering of min values across pages, used to short-circuit filtering.
@ UNORDERED
Min values have no particular order.
@ ASCENDING
Min values are non-decreasing across pages.
@ DESCENDING
Min values are non-increasing across pages.
void serialize(thrift::CompactEncoder &enc) const
Serialize this ColumnIndex to a Thrift compact encoder.
std::vector< std::string > min_values
Binary-encoded minimum value per page.
std::vector< int64_t > null_counts
Null count per page (optional).
Page locations for random access within a column chunk.
void deserialize(thrift::CompactDecoder &dec)
Deserialize this OffsetIndex from a Thrift compact decoder.
void serialize(thrift::CompactEncoder &enc) const
Serialize this OffsetIndex to a Thrift compact encoder.
bool valid_
False if deserialization failed (M-V7).
bool valid() const
Check if deserialization was successful.
std::vector< PageLocation > page_locations
One entry per data page.
File offset and size descriptor for a single data page.
int32_t compressed_page_size
Compressed size of the page, in bytes.
void serialize(thrift::CompactEncoder &enc) const
Serialize this PageLocation to a Thrift compact encoder.
int64_t first_row_index
First row in this page (relative to row group).
int64_t offset
Absolute file offset of the page header.
void deserialize(thrift::CompactDecoder &dec)
Deserialize this PageLocation from a Thrift compact decoder.