Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
column_reader.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
22
23#include "signet/types.hpp"
24#include "signet/error.hpp"
25#include "signet/memory.hpp"
26
27#include <cstdint>
28#include <cstring>
29#include <string>
30#include <string_view>
31#include <vector>
32
33namespace signet::forge {
34
47public:
57 const uint8_t* data,
58 size_t size,
59 int64_t num_values,
60 int32_t type_length = -1)
61 : type_(type)
62 , data_(data)
63 , size_(size)
64 , pos_(0)
65 , num_values_(num_values)
66 , values_read_(0)
67 , type_length_(type_length)
68 , bool_bit_offset_(0) {}
69
70 // ===================================================================
71 // Single-value reads
72 // ===================================================================
73
77 if (type_ != PhysicalType::BOOLEAN) {
79 "read_bool() called on non-BOOLEAN column"};
80 }
81 if (values_read_ >= num_values_) {
82 return Error{ErrorCode::OUT_OF_RANGE, "no more values to read"};
83 }
84
85 size_t byte_index = bool_bit_offset_ / 8;
86 size_t bit_index = bool_bit_offset_ % 8;
87
88 if (byte_index >= size_) {
90 "boolean read past end of page data"};
91 }
92
93 bool val = (data_[byte_index] >> bit_index) & 1;
94 ++bool_bit_offset_;
95 ++values_read_;
96 return val;
97 }
98
102 if (type_ != PhysicalType::INT32) {
104 "read_int32() called on non-INT32 column"};
105 }
106 if (values_read_ >= num_values_) {
107 return Error{ErrorCode::OUT_OF_RANGE, "no more values to read"};
108 }
109 if (pos_ + 4 > size_) {
111 "INT32 read past end of page data"};
112 }
113
114 int32_t val;
115 std::memcpy(&val, data_ + pos_, 4);
116 pos_ += 4;
117 ++values_read_;
118 return val;
119 }
120
124 if (type_ != PhysicalType::INT64) {
126 "read_int64() called on non-INT64 column"};
127 }
128 if (values_read_ >= num_values_) {
129 return Error{ErrorCode::OUT_OF_RANGE, "no more values to read"};
130 }
131 if (pos_ + 8 > size_) {
133 "INT64 read past end of page data"};
134 }
135
136 int64_t val;
137 std::memcpy(&val, data_ + pos_, 8);
138 pos_ += 8;
139 ++values_read_;
140 return val;
141 }
142
146 if (type_ != PhysicalType::FLOAT) {
148 "read_float() called on non-FLOAT column"};
149 }
150 if (values_read_ >= num_values_) {
151 return Error{ErrorCode::OUT_OF_RANGE, "no more values to read"};
152 }
153 if (pos_ + 4 > size_) {
155 "FLOAT read past end of page data"};
156 }
157
158 float val;
159 std::memcpy(&val, data_ + pos_, 4);
160 pos_ += 4;
161 ++values_read_;
162 return val;
163 }
164
168 if (type_ != PhysicalType::DOUBLE) {
170 "read_double() called on non-DOUBLE column"};
171 }
172 if (values_read_ >= num_values_) {
173 return Error{ErrorCode::OUT_OF_RANGE, "no more values to read"};
174 }
175 if (pos_ + 8 > size_) {
177 "DOUBLE read past end of page data"};
178 }
179
180 double val;
181 std::memcpy(&val, data_ + pos_, 8);
182 pos_ += 8;
183 ++values_read_;
184 return val;
185 }
186
193 if (type_ != PhysicalType::BYTE_ARRAY) {
195 "read_string() called on non-BYTE_ARRAY column"};
196 }
197 if (values_read_ >= num_values_) {
198 return Error{ErrorCode::OUT_OF_RANGE, "no more values to read"};
199 }
200 if (pos_ + 4 > size_) {
202 "BYTE_ARRAY length prefix read past end of page data"};
203 }
204
205 uint32_t len;
206 std::memcpy(&len, data_ + pos_, 4);
207
208 // Bounds check before advancing pos_: use subtraction to avoid
209 // pos_+len wraparound on crafted files (CWE-125, CWE-190).
210 if (static_cast<size_t>(len) > size_ - pos_ - 4) {
212 "BYTE_ARRAY data read past end of page data"};
213 }
214 pos_ += 4;
215
216 std::string val(reinterpret_cast<const char*>(data_ + pos_), len);
217 pos_ += len;
218 ++values_read_;
219 return val;
220 }
221
229 if (type_ != PhysicalType::BYTE_ARRAY) {
231 "read_string_view() called on non-BYTE_ARRAY column"};
232 }
233 if (values_read_ >= num_values_) {
234 return Error{ErrorCode::OUT_OF_RANGE, "no more values to read"};
235 }
236 if (pos_ + 4 > size_) {
238 "BYTE_ARRAY length prefix read past end of page data"};
239 }
240
241 uint32_t len;
242 std::memcpy(&len, data_ + pos_, 4);
243
244 // Bounds check before advancing pos_: use subtraction to avoid
245 // pos_+len wraparound on crafted files (CWE-125, CWE-190).
246 if (static_cast<size_t>(len) > size_ - pos_ - 4) {
248 "BYTE_ARRAY data read past end of page data"};
249 }
250 pos_ += 4;
251
252 std::string_view val(reinterpret_cast<const char*>(data_ + pos_), len);
253 pos_ += len;
254 ++values_read_;
255 return val;
256 }
257
265 if (type_ != PhysicalType::BYTE_ARRAY &&
268 "read_bytes() requires BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY"};
269 }
270 if (values_read_ >= num_values_) {
271 return Error{ErrorCode::OUT_OF_RANGE, "no more values to read"};
272 }
273
275 if (type_length_ <= 0) {
277 "FIXED_LEN_BYTE_ARRAY requires positive type_length"};
278 }
279 size_t len = static_cast<size_t>(type_length_);
280 if (pos_ + len > size_) {
282 "FIXED_LEN_BYTE_ARRAY read past end of page data"};
283 }
284 std::vector<uint8_t> val(data_ + pos_, data_ + pos_ + len);
285 pos_ += len;
286 ++values_read_;
287 return val;
288 }
289
290 // BYTE_ARRAY: 4-byte LE length prefix
291 if (pos_ + 4 > size_) {
293 "BYTE_ARRAY length prefix read past end of page data"};
294 }
295 uint32_t len;
296 std::memcpy(&len, data_ + pos_, 4);
297
298 // Bounds check before advancing pos_: use subtraction to avoid
299 // pos_+len wraparound on crafted files (CWE-125, CWE-190).
300 if (static_cast<size_t>(len) > size_ - pos_ - 4) {
302 "BYTE_ARRAY data read past end of page data"};
303 }
304 pos_ += 4;
305 std::vector<uint8_t> val(data_ + pos_, data_ + pos_ + len);
306 pos_ += len;
307 ++values_read_;
308 return val;
309 }
310
311 // ===================================================================
312 // Batch reads -- read @p count values into a caller-provided buffer
313 // ===================================================================
314
/// Read a batch of BOOLEAN values into @p out.
///
/// Each value is one bit of the page data, taken least-significant bit first
/// within each byte, at the running bit cursor `bool_bit_offset_`.
///
/// @param out   Caller-provided buffer; `out[0] .. out[count-1]` are written.
/// @param count Number of values to decode.
/// @return An empty `expected<void>` on success, otherwise an error.
///
/// NOTE(review): the lines that open the error returns (presumably
/// `return Error{ErrorCode::...,`) are not present in this listing; only their
/// message strings are visible.
319 expected<void> read_batch_bool(bool* out, size_t count) {
// Reject calls made on a column whose physical type is not BOOLEAN.
320 if (type_ != PhysicalType::BOOLEAN) {
322 "read_batch_bool() called on non-BOOLEAN column"};
323 }
// The batch must fit in the values still unread on this page.
// NOTE(review): `count` is converted from size_t to int64_t; a count above
// INT64_MAX would convert to a negative value and could pass this check —
// confirm whether callers can ever supply such a count.
324 if (values_read_ + static_cast<int64_t>(count) > num_values_) {
326 "batch read exceeds available values"};
327 }
328
329 for (size_t i = 0; i < count; ++i) {
// Split the bit cursor into a byte position and a bit position in that byte.
330 size_t byte_index = bool_bit_offset_ / 8;
331 size_t bit_index = bool_bit_offset_ % 8;
332
// Bounds check per value. If this fails part-way through the batch, the
// values already written to `out` and the advanced counters are kept.
333 if (byte_index >= size_) {
335 "boolean batch read past end of page data"};
336 }
337
// Shift the wanted bit down to position 0 and mask it.
338 out[i] = (data_[byte_index] >> bit_index) & 1;
339 ++bool_bit_offset_;
340 ++values_read_;
341 }
342 return expected<void>{};
343 }
344
/// Read a batch of INT32 values via a single bulk memcpy.
///
/// All checks run before any state changes, so a failed call leaves `pos_`
/// and `values_read_` untouched and writes nothing to @p out.
///
/// @param out   Caller-provided buffer receiving `count` values.
/// @param count Number of 4-byte values to copy.
/// @return An empty `expected<void>` on success, otherwise an error.
///
/// NOTE(review): the lines that open the error returns (presumably
/// `return Error{ErrorCode::...,`) are not present in this listing; only their
/// message strings are visible.
349 expected<void> read_batch_int32(int32_t* out, size_t count) {
// Reject calls made on a column whose physical type is not INT32.
350 if (type_ != PhysicalType::INT32) {
352 "read_batch_int32() called on non-INT32 column"};
353 }
// Guard the multiplication below: count * 4 must not overflow size_t.
354 if (count > SIZE_MAX / 4) {
355 return Error{ErrorCode::OUT_OF_RANGE, "count too large, would overflow"};
356 }
357 size_t total_bytes = count * 4;
// The batch must fit in the values still unread on this page.
358 if (values_read_ + static_cast<int64_t>(count) > num_values_) {
360 "batch read exceeds available values"};
361 }
// NOTE(review): this check adds `pos_ + total_bytes`, which could wrap when
// total_bytes is close to SIZE_MAX. read_batch_string in this file compares
// against `size_ - pos_` instead; consider the same subtraction form here.
362 if (pos_ + total_bytes > size_) {
364 "INT32 batch read past end of page data"};
365 }
366
// Bytes are copied verbatim from the page into `out`.
// NOTE(review): that matches a little-endian page encoding only on a
// little-endian host — confirm the supported targets.
367 std::memcpy(out, data_ + pos_, total_bytes);
368 pos_ += total_bytes;
369 values_read_ += static_cast<int64_t>(count);
370 return expected<void>{};
371 }
372
/// Read a batch of INT64 values via a single bulk memcpy.
///
/// All checks run before any state changes, so a failed call leaves `pos_`
/// and `values_read_` untouched and writes nothing to @p out.
///
/// @param out   Caller-provided buffer receiving `count` values.
/// @param count Number of 8-byte values to copy.
/// @return An empty `expected<void>` on success, otherwise an error.
///
/// NOTE(review): the lines that open the error returns (presumably
/// `return Error{ErrorCode::...,`) are not present in this listing; only their
/// message strings are visible.
377 expected<void> read_batch_int64(int64_t* out, size_t count) {
// Reject calls made on a column whose physical type is not INT64.
378 if (type_ != PhysicalType::INT64) {
380 "read_batch_int64() called on non-INT64 column"};
381 }
// Guard the multiplication below: count * 8 must not overflow size_t.
382 if (count > SIZE_MAX / 8) {
383 return Error{ErrorCode::OUT_OF_RANGE, "count too large, would overflow"};
384 }
385 size_t total_bytes = count * 8;
// The batch must fit in the values still unread on this page.
386 if (values_read_ + static_cast<int64_t>(count) > num_values_) {
388 "batch read exceeds available values"};
389 }
// NOTE(review): this check adds `pos_ + total_bytes`, which could wrap when
// total_bytes is close to SIZE_MAX. read_batch_string in this file compares
// against `size_ - pos_` instead; consider the same subtraction form here.
390 if (pos_ + total_bytes > size_) {
392 "INT64 batch read past end of page data"};
393 }
394
// Bytes are copied verbatim from the page into `out`.
// NOTE(review): that matches a little-endian page encoding only on a
// little-endian host — confirm the supported targets.
395 std::memcpy(out, data_ + pos_, total_bytes);
396 pos_ += total_bytes;
397 values_read_ += static_cast<int64_t>(count);
398 return expected<void>{};
399 }
400
/// Read a batch of FLOAT values via a single bulk memcpy.
///
/// All checks run before any state changes, so a failed call leaves `pos_`
/// and `values_read_` untouched and writes nothing to @p out.
///
/// @param out   Caller-provided buffer receiving `count` values.
/// @param count Number of 4-byte values to copy.
/// @return An empty `expected<void>` on success, otherwise an error.
///
/// NOTE(review): the lines that open the error returns (presumably
/// `return Error{ErrorCode::...,`) are not present in this listing; only their
/// message strings are visible.
405 expected<void> read_batch_float(float* out, size_t count) {
// Reject calls made on a column whose physical type is not FLOAT.
406 if (type_ != PhysicalType::FLOAT) {
408 "read_batch_float() called on non-FLOAT column"};
409 }
// Guard the multiplication below: count * 4 must not overflow size_t.
410 if (count > SIZE_MAX / 4) {
411 return Error{ErrorCode::OUT_OF_RANGE, "count too large, would overflow"};
412 }
413 size_t total_bytes = count * 4;
// The batch must fit in the values still unread on this page.
414 if (values_read_ + static_cast<int64_t>(count) > num_values_) {
416 "batch read exceeds available values"};
417 }
// NOTE(review): this check adds `pos_ + total_bytes`, which could wrap when
// total_bytes is close to SIZE_MAX. read_batch_string in this file compares
// against `size_ - pos_` instead; consider the same subtraction form here.
418 if (pos_ + total_bytes > size_) {
420 "FLOAT batch read past end of page data"};
421 }
422
// Bytes are copied verbatim from the page into `out`, 4 per value.
// NOTE(review): this presumes the host `float` has the same 4-byte layout as
// the page encoding — confirm the supported targets.
423 std::memcpy(out, data_ + pos_, total_bytes);
424 pos_ += total_bytes;
425 values_read_ += static_cast<int64_t>(count);
426 return expected<void>{};
427 }
428
/// Read a batch of DOUBLE values via a single bulk memcpy.
///
/// All checks run before any state changes, so a failed call leaves `pos_`
/// and `values_read_` untouched and writes nothing to @p out.
///
/// @param out   Caller-provided buffer receiving `count` values.
/// @param count Number of 8-byte values to copy.
/// @return An empty `expected<void>` on success, otherwise an error.
///
/// NOTE(review): the lines that open the error returns (presumably
/// `return Error{ErrorCode::...,`) are not present in this listing; only their
/// message strings are visible.
433 expected<void> read_batch_double(double* out, size_t count) {
// Reject calls made on a column whose physical type is not DOUBLE.
434 if (type_ != PhysicalType::DOUBLE) {
436 "read_batch_double() called on non-DOUBLE column"};
437 }
// Guard the multiplication below: count * 8 must not overflow size_t.
438 if (count > SIZE_MAX / 8) {
439 return Error{ErrorCode::OUT_OF_RANGE, "count too large, would overflow"};
440 }
441 size_t total_bytes = count * 8;
// The batch must fit in the values still unread on this page.
442 if (values_read_ + static_cast<int64_t>(count) > num_values_) {
444 "batch read exceeds available values"};
445 }
// NOTE(review): this check adds `pos_ + total_bytes`, which could wrap when
// total_bytes is close to SIZE_MAX. read_batch_string in this file compares
// against `size_ - pos_` instead; consider the same subtraction form here.
446 if (pos_ + total_bytes > size_) {
448 "DOUBLE batch read past end of page data"};
449 }
450
// Bytes are copied verbatim from the page into `out`, 8 per value.
// NOTE(review): this presumes the host `double` has the same 8-byte layout as
// the page encoding — confirm the supported targets.
451 std::memcpy(out, data_ + pos_, total_bytes);
452 pos_ += total_bytes;
453 values_read_ += static_cast<int64_t>(count);
454 return expected<void>{};
455 }
456
/// Read a batch of BYTE_ARRAY values as strings.
///
/// Each value is a 4-byte length prefix followed by that many payload bytes;
/// the payload is copied into the corresponding element of @p out.
///
/// @param out   Caller-provided array; `out[0] .. out[count-1]` are assigned.
/// @param count Number of values to decode.
/// @return An empty `expected<void>` on success, otherwise an error.
///
/// A failure part-way through the batch keeps the strings already assigned
/// and the advanced `pos_` / `values_read_`.
///
/// NOTE(review): the lines that open the error returns (presumably
/// `return Error{ErrorCode::...,`) are not present in this listing; only their
/// message strings are visible.
461 expected<void> read_batch_string(std::string* out, size_t count) {
// Reject calls made on a column whose physical type is not BYTE_ARRAY.
462 if (type_ != PhysicalType::BYTE_ARRAY) {
464 "read_batch_string() called on non-BYTE_ARRAY column"};
465 }
// The batch must fit in the values still unread on this page.
466 if (values_read_ + static_cast<int64_t>(count) > num_values_) {
468 "batch read exceeds available values"};
469 }
470
471 for (size_t i = 0; i < count; ++i) {
// The 4-byte length prefix must lie inside the page.
472 if (pos_ + 4 > size_) {
474 "BYTE_ARRAY length prefix read past end of page data"};
475 }
// memcpy avoids an unaligned load of the prefix.
476 uint32_t len;
477 std::memcpy(&len, data_ + pos_, 4);
// NOTE(review): the cursor moves past the prefix before the payload bounds
// check, so a failure below leaves pos_ advanced by 4. The single-value
// BYTE_ARRAY readers in this file check first and advance afterwards.
478 pos_ += 4;
479
480 // CWE-190: Integer Overflow — subtraction-based bounds check avoids unsigned wrap
481 if (pos_ > size_ || len > size_ - pos_) {
483 "BYTE_ARRAY data read past end of page data"};
484 }
// Copy the payload bytes into the caller's string (an owning copy).
485 out[i].assign(reinterpret_cast<const char*>(data_ + pos_), len);
486 pos_ += len;
487 ++values_read_;
488 }
489 return expected<void>{};
490 }
491
492 // ===================================================================
493 // Template dispatch -- read<T>() and read_batch<T>()
494 // ===================================================================
495
/// Read a single value of type T, dispatching at compile time to the matching
/// typed reader.
///
/// Supported T and the reader each one forwards to:
///   bool -> read_bool, int32_t -> read_int32, int64_t -> read_int64,
///   float -> read_float, double -> read_double, std::string -> read_string,
///   std::string_view -> read_string_view, std::vector<uint8_t> -> read_bytes.
/// Any other T is rejected at compile time by the static_assert in the final
/// branch.
///
/// NOTE(review): the signature line is not present in this listing; the member
/// index on this page gives it as `expected<T> read()`.
503 template <typename T>
505 if constexpr (std::is_same_v<T, bool>) {
506 return read_bool();
507 } else if constexpr (std::is_same_v<T, int32_t>) {
508 return read_int32();
509 } else if constexpr (std::is_same_v<T, int64_t>) {
510 return read_int64();
511 } else if constexpr (std::is_same_v<T, float>) {
512 return read_float();
513 } else if constexpr (std::is_same_v<T, double>) {
514 return read_double();
515 } else if constexpr (std::is_same_v<T, std::string>) {
516 return read_string();
517 } else if constexpr (std::is_same_v<T, std::string_view>) {
518 return read_string_view();
519 } else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) {
520 return read_bytes();
521 } else {
// The condition is always false but depends on T, so the assertion fires
// only when this branch is instantiated, i.e. for an unsupported T.
522 static_assert(!std::is_same_v<T, T>,
523 "ColumnReader::read<T>: unsupported type");
524 }
525 }
526
536 template <typename T>
537 expected<void> read_batch(T* out, size_t count) {
538 if constexpr (std::is_same_v<T, bool>) {
539 return read_batch_bool(out, count);
540 } else if constexpr (std::is_same_v<T, int32_t>) {
541 return read_batch_int32(out, count);
542 } else if constexpr (std::is_same_v<T, int64_t>) {
543 return read_batch_int64(out, count);
544 } else if constexpr (std::is_same_v<T, float>) {
545 return read_batch_float(out, count);
546 } else if constexpr (std::is_same_v<T, double>) {
547 return read_batch_double(out, count);
548 } else if constexpr (std::is_same_v<T, std::string>) {
549 return read_batch_string(out, count);
550 } else {
551 static_assert(!std::is_same_v<T, T>,
552 "ColumnReader::read_batch<T>: unsupported type");
553 }
554 }
555
556 // ===================================================================
557 // Status queries
558 // ===================================================================
559
561 [[nodiscard]] int64_t values_remaining() const {
562 return num_values_ - values_read_;
563 }
564
566 [[nodiscard]] bool has_next() const {
567 return values_read_ < num_values_;
568 }
569
571 [[nodiscard]] PhysicalType type() const {
572 return type_;
573 }
574
576 [[nodiscard]] size_t position() const {
577 return pos_;
578 }
579
580private:
581 PhysicalType type_;
582 const uint8_t* data_;
583 size_t size_;
584 size_t pos_;
585 int64_t num_values_;
586 int64_t values_read_;
587 int32_t type_length_;
588
589 // For BOOLEAN: bit offset within the data buffer (since booleans are
590 // bit-packed rather than byte-aligned)
591 size_t bool_bit_offset_;
592};
593
594} // namespace signet::forge
PLAIN-encoded Parquet column decoder.
expected< void > read_batch_double(double *out, size_t count)
Read a batch of DOUBLE values via bulk memcpy.
ColumnReader(PhysicalType type, const uint8_t *data, size_t size, int64_t num_values, int32_t type_length=-1)
Construct a reader over raw PLAIN-encoded page data.
expected< int32_t > read_int32()
Read a single INT32 value (4 bytes little-endian).
expected< int64_t > read_int64()
Read a single INT64 value (8 bytes little-endian).
int64_t values_remaining() const
Number of values not yet read from this page.
expected< void > read_batch_float(float *out, size_t count)
Read a batch of FLOAT values via bulk memcpy.
expected< T > read()
Read a single value of type T, dispatching to the correct typed reader.
expected< bool > read_bool()
Read a single BOOLEAN value (bit-packed, LSB first).
expected< double > read_double()
Read a single DOUBLE value (8 bytes little-endian, IEEE 754).
expected< void > read_batch(T *out, size_t count)
Read a batch of count values of type T into out.
expected< void > read_batch_int32(int32_t *out, size_t count)
Read a batch of INT32 values via bulk memcpy.
PhysicalType type() const
The Parquet physical type of this column.
expected< void > read_batch_int64(int64_t *out, size_t count)
Read a batch of INT64 values via bulk memcpy.
expected< void > read_batch_bool(bool *out, size_t count)
Read a batch of BOOLEAN values into out.
bool has_next() const
Whether there is at least one more value to read.
expected< std::vector< uint8_t > > read_bytes()
Read a single BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY value as raw bytes.
expected< std::string > read_string()
Read a single BYTE_ARRAY value as a std::string.
expected< std::string_view > read_string_view()
Read a single BYTE_ARRAY value as a non-owning std::string_view.
expected< void > read_batch_string(std::string *out, size_t count)
Read a batch of BYTE_ARRAY values as strings.
expected< float > read_float()
Read a single FLOAT value (4 bytes little-endian, IEEE 754).
size_t position() const
Current byte offset within the page data buffer.
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:145
Arena (bump-pointer) allocator for batch Parquet reads.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
@ FIXED_LEN_BYTE_ARRAY
Fixed-length byte array (UUID, vectors, decimals).
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BOOLEAN
1-bit boolean, bit-packed in pages.
@ BYTE_ARRAY
Variable-length byte sequence (strings, binary).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
@ UNSUPPORTED_TYPE
The file contains a Parquet physical or logical type that is not implemented.
@ OUT_OF_RANGE
An index, offset, or size value is outside the valid range.
@ SCHEMA_MISMATCH
The requested column name or type does not match the file schema.
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated, or exceeds size limits).
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:101
Parquet format enumerations, type traits, and statistics structs.