60 int32_t type_length = -1)
65 , num_values_(num_values)
67 , type_length_(type_length)
68 , bool_bit_offset_(0) {}
79 "read_bool() called on non-BOOLEAN column"};
81 if (values_read_ >= num_values_) {
85 size_t byte_index = bool_bit_offset_ / 8;
86 size_t bit_index = bool_bit_offset_ % 8;
88 if (byte_index >= size_) {
90 "boolean read past end of page data"};
93 bool val = (data_[byte_index] >> bit_index) & 1;
104 "read_int32() called on non-INT32 column"};
106 if (values_read_ >= num_values_) {
109 if (pos_ + 4 > size_) {
111 "INT32 read past end of page data"};
115 std::memcpy(&val, data_ + pos_, 4);
126 "read_int64() called on non-INT64 column"};
128 if (values_read_ >= num_values_) {
131 if (pos_ + 8 > size_) {
133 "INT64 read past end of page data"};
137 std::memcpy(&val, data_ + pos_, 8);
148 "read_float() called on non-FLOAT column"};
150 if (values_read_ >= num_values_) {
153 if (pos_ + 4 > size_) {
155 "FLOAT read past end of page data"};
159 std::memcpy(&val, data_ + pos_, 4);
170 "read_double() called on non-DOUBLE column"};
172 if (values_read_ >= num_values_) {
175 if (pos_ + 8 > size_) {
177 "DOUBLE read past end of page data"};
181 std::memcpy(&val, data_ + pos_, 8);
195 "read_string() called on non-BYTE_ARRAY column"};
197 if (values_read_ >= num_values_) {
200 if (pos_ + 4 > size_) {
202 "BYTE_ARRAY length prefix read past end of page data"};
206 std::memcpy(&len, data_ + pos_, 4);
210 if (
static_cast<size_t>(len) > size_ - pos_ - 4) {
212 "BYTE_ARRAY data read past end of page data"};
216 std::string val(
reinterpret_cast<const char*
>(data_ + pos_), len);
231 "read_string_view() called on non-BYTE_ARRAY column"};
233 if (values_read_ >= num_values_) {
236 if (pos_ + 4 > size_) {
238 "BYTE_ARRAY length prefix read past end of page data"};
242 std::memcpy(&len, data_ + pos_, 4);
246 if (
static_cast<size_t>(len) > size_ - pos_ - 4) {
248 "BYTE_ARRAY data read past end of page data"};
252 std::string_view val(
reinterpret_cast<const char*
>(data_ + pos_), len);
268 "read_bytes() requires BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY"};
270 if (values_read_ >= num_values_) {
275 if (type_length_ <= 0) {
277 "FIXED_LEN_BYTE_ARRAY requires positive type_length"};
279 size_t len =
static_cast<size_t>(type_length_);
280 if (pos_ + len > size_) {
282 "FIXED_LEN_BYTE_ARRAY read past end of page data"};
284 std::vector<uint8_t> val(data_ + pos_, data_ + pos_ + len);
291 if (pos_ + 4 > size_) {
293 "BYTE_ARRAY length prefix read past end of page data"};
296 std::memcpy(&len, data_ + pos_, 4);
300 if (
static_cast<size_t>(len) > size_ - pos_ - 4) {
302 "BYTE_ARRAY data read past end of page data"};
305 std::vector<uint8_t> val(data_ + pos_, data_ + pos_ + len);
322 "read_batch_bool() called on non-BOOLEAN column"};
324 if (values_read_ +
static_cast<int64_t
>(count) > num_values_) {
326 "batch read exceeds available values"};
329 for (
size_t i = 0; i < count; ++i) {
330 size_t byte_index = bool_bit_offset_ / 8;
331 size_t bit_index = bool_bit_offset_ % 8;
333 if (byte_index >= size_) {
335 "boolean batch read past end of page data"};
338 out[i] = (data_[byte_index] >> bit_index) & 1;
352 "read_batch_int32() called on non-INT32 column"};
354 if (count > SIZE_MAX / 4) {
357 size_t total_bytes = count * 4;
358 if (values_read_ +
static_cast<int64_t
>(count) > num_values_) {
360 "batch read exceeds available values"};
362 if (pos_ + total_bytes > size_) {
364 "INT32 batch read past end of page data"};
367 std::memcpy(out, data_ + pos_, total_bytes);
369 values_read_ +=
static_cast<int64_t
>(count);
380 "read_batch_int64() called on non-INT64 column"};
382 if (count > SIZE_MAX / 8) {
385 size_t total_bytes = count * 8;
386 if (values_read_ +
static_cast<int64_t
>(count) > num_values_) {
388 "batch read exceeds available values"};
390 if (pos_ + total_bytes > size_) {
392 "INT64 batch read past end of page data"};
395 std::memcpy(out, data_ + pos_, total_bytes);
397 values_read_ +=
static_cast<int64_t
>(count);
408 "read_batch_float() called on non-FLOAT column"};
410 if (count > SIZE_MAX / 4) {
413 size_t total_bytes = count * 4;
414 if (values_read_ +
static_cast<int64_t
>(count) > num_values_) {
416 "batch read exceeds available values"};
418 if (pos_ + total_bytes > size_) {
420 "FLOAT batch read past end of page data"};
423 std::memcpy(out, data_ + pos_, total_bytes);
425 values_read_ +=
static_cast<int64_t
>(count);
436 "read_batch_double() called on non-DOUBLE column"};
438 if (count > SIZE_MAX / 8) {
441 size_t total_bytes = count * 8;
442 if (values_read_ +
static_cast<int64_t
>(count) > num_values_) {
444 "batch read exceeds available values"};
446 if (pos_ + total_bytes > size_) {
448 "DOUBLE batch read past end of page data"};
451 std::memcpy(out, data_ + pos_, total_bytes);
453 values_read_ +=
static_cast<int64_t
>(count);
464 "read_batch_string() called on non-BYTE_ARRAY column"};
466 if (values_read_ +
static_cast<int64_t
>(count) > num_values_) {
468 "batch read exceeds available values"};
471 for (
size_t i = 0; i < count; ++i) {
472 if (pos_ + 4 > size_) {
474 "BYTE_ARRAY length prefix read past end of page data"};
477 std::memcpy(&len, data_ + pos_, 4);
481 if (pos_ > size_ || len > size_ - pos_) {
483 "BYTE_ARRAY data read past end of page data"};
485 out[i].assign(
reinterpret_cast<const char*
>(data_ + pos_), len);
503 template <
typename T>
505 if constexpr (std::is_same_v<T, bool>) {
507 }
else if constexpr (std::is_same_v<T, int32_t>) {
509 }
else if constexpr (std::is_same_v<T, int64_t>) {
511 }
else if constexpr (std::is_same_v<T, float>) {
513 }
else if constexpr (std::is_same_v<T, double>) {
515 }
else if constexpr (std::is_same_v<T, std::string>) {
517 }
else if constexpr (std::is_same_v<T, std::string_view>) {
519 }
else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) {
522 static_assert(!std::is_same_v<T, T>,
523 "ColumnReader::read<T>: unsupported type");
536 template <
typename T>
538 if constexpr (std::is_same_v<T, bool>) {
540 }
else if constexpr (std::is_same_v<T, int32_t>) {
542 }
else if constexpr (std::is_same_v<T, int64_t>) {
544 }
else if constexpr (std::is_same_v<T, float>) {
546 }
else if constexpr (std::is_same_v<T, double>) {
548 }
else if constexpr (std::is_same_v<T, std::string>) {
551 static_assert(!std::is_same_v<T, T>,
552 "ColumnReader::read_batch<T>: unsupported type");
562 return num_values_ - values_read_;
567 return values_read_ < num_values_;
582 const uint8_t* data_;
586 int64_t values_read_;
587 int32_t type_length_;
591 size_t bool_bit_offset_;
PLAIN-encoded Parquet column decoder.
expected< void > read_batch_double(double *out, size_t count)
Read a batch of DOUBLE values via bulk memcpy.
ColumnReader(PhysicalType type, const uint8_t *data, size_t size, int64_t num_values, int32_t type_length=-1)
Construct a reader over raw PLAIN-encoded page data.
expected< int32_t > read_int32()
Read a single INT32 value (4 bytes little-endian).
expected< int64_t > read_int64()
Read a single INT64 value (8 bytes little-endian).
int64_t values_remaining() const
Number of values not yet read from this page.
expected< void > read_batch_float(float *out, size_t count)
Read a batch of FLOAT values via bulk memcpy.
expected< T > read()
Read a single value of type T, dispatching to the correct typed reader.
expected< bool > read_bool()
Read a single BOOLEAN value (bit-packed, LSB first).
expected< double > read_double()
Read a single DOUBLE value (8 bytes little-endian, IEEE 754).
expected< void > read_batch(T *out, size_t count)
Read a batch of count values of type T into out.
expected< void > read_batch_int32(int32_t *out, size_t count)
Read a batch of INT32 values via bulk memcpy.
PhysicalType type() const
The Parquet physical type of this column.
expected< void > read_batch_int64(int64_t *out, size_t count)
Read a batch of INT64 values via bulk memcpy.
expected< void > read_batch_bool(bool *out, size_t count)
Read a batch of BOOLEAN values into out.
bool has_next() const
Whether there is at least one more value to read.
expected< std::vector< uint8_t > > read_bytes()
Read a single BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY value as raw bytes.
expected< std::string > read_string()
Read a single BYTE_ARRAY value as a std::string.
expected< std::string_view > read_string_view()
Read a single BYTE_ARRAY value as a non-owning std::string_view.
expected< void > read_batch_string(std::string *out, size_t count)
Read a batch of BYTE_ARRAY values as strings.
expected< float > read_float()
Read a single FLOAT value (4 bytes little-endian, IEEE 754).
size_t position() const
Current byte offset within the page data buffer.
A lightweight result type that holds either a success value of type T or an Error.
Arena (bump-pointer) allocator for batch Parquet reads.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
@ FIXED_LEN_BYTE_ARRAY
Fixed-length byte array (UUID, vectors, decimals).
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BOOLEAN
1-bit boolean, bit-packed in pages.
@ BYTE_ARRAY
Variable-length byte sequence (strings, binary).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
@ UNSUPPORTED_TYPE
The file contains a Parquet physical or logical type that is not implemented.
@ OUT_OF_RANGE
An index, offset, or size value is outside the valid range.
@ SCHEMA_MISMATCH
The requested column name or type does not match the file schema.
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated data, or size limits exceeded).
Lightweight error value carrying an ErrorCode and a human-readable message.
Parquet format enumerations, type traits, and statistics structs.