/// Append a uint32_t in little-endian byte order to a byte buffer.
///
/// @param buf  Destination buffer; exactly four bytes are appended.
/// @param val  Value to serialize, least-significant byte first.
inline void append_le32(std::vector<uint8_t>& buf, uint32_t val) {
    // Shifting and masking (rather than copying the object representation)
    // yields the same byte order regardless of host endianness.
    for (unsigned shift = 0; shift < 32; shift += 8) {
        buf.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
    }
}
/// Append a uint64_t in little-endian byte order to a byte buffer.
///
/// @param buf  Destination buffer; exactly eight bytes are appended.
/// @param val  Value to serialize, least-significant byte first.
inline void append_le64(std::vector<uint8_t>& buf, uint64_t val) {
    // Shifting and masking (rather than copying the object representation)
    // yields the same byte order regardless of host endianness.
    for (unsigned shift = 0; shift < 64; shift += 8) {
        buf.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
    }
}
71 : type_(
type), num_values_(0) {}
82 size_t bit_index =
static_cast<size_t>(num_values_);
83 size_t byte_index = bit_index / 8;
84 size_t bit_offset = bit_index % 8;
87 if (byte_index >= buf_.size()) {
92 buf_[byte_index] |=
static_cast<uint8_t
>(1u << bit_offset);
103 std::memcpy(&bits, &val,
sizeof(bits));
114 std::memcpy(&bits, &val,
sizeof(bits));
125 std::memcpy(&bits, &val,
sizeof(bits));
136 std::memcpy(&bits, &val,
sizeof(bits));
161 if (len >
static_cast<size_t>(UINT32_MAX)) {
164 throw std::length_error(
"BYTE_ARRAY value exceeds 4 GiB limit");
167 buf_.insert(buf_.end(),
data,
data + len);
170 std::string str_val(
reinterpret_cast<const char*
>(
data), len);
186 buf_.insert(buf_.end(),
data,
data + len);
189 std::string str_val(
reinterpret_cast<const char*
>(
data), len);
201 template <
typename T>
203 if constexpr (std::is_same_v<T, bool>) {
205 }
else if constexpr (std::is_same_v<T, int32_t>) {
207 }
else if constexpr (std::is_same_v<T, int64_t>) {
209 }
else if constexpr (std::is_same_v<T, float>) {
211 }
else if constexpr (std::is_same_v<T, double>) {
213 }
else if constexpr (std::is_same_v<T, std::string>) {
216 static_assert(!std::is_same_v<T, T>,
217 "ColumnWriter::write: unsupported type");
228 template <
typename T>
230 for (
size_t i = 0; i < count; ++i) {
239 for (
size_t i = 0; i < count; ++i) {
247 [[nodiscard]]
const std::vector<uint8_t>&
data()
const {
return buf_; }
253 [[nodiscard]] int64_t
num_values()
const {
return num_values_; }
274 std::vector<uint8_t> buf_;
Per-column-chunk statistics tracker.
void reset()
Reset all statistics to initial state.
void update(const T &value)
Update statistics with a non-null typed value.
PLAIN encoding writer for a single Parquet column.
const ColumnStatistics & statistics() const
Returns a const reference to the column statistics.
void write_int64(int64_t val)
Write a single INT64 value (8 bytes little-endian).
void write_batch(const std::string *values, size_t count)
Write a batch of string values (BYTE_ARRAY).
size_t encoded_size() const
Returns the total encoded data size in bytes.
void write_byte_array(const std::string &val)
Write a single BYTE_ARRAY value from a std::string.
void write_bool(bool val)
Write a single boolean value.
void reset()
Reset the writer for the next column chunk. Clears all data and statistics.
int64_t num_values() const
Returns the number of values written so far.
void write_batch(const T *values, size_t count)
Write a batch of typed values.
const std::vector< uint8_t > & data() const
Returns a const reference to the encoded byte buffer.
void write_float(float val)
Write a single FLOAT value (4 bytes little-endian, IEEE 754).
PhysicalType type() const
Returns the physical type this writer encodes.
void write_byte_array(const uint8_t *data, size_t len)
Write a single BYTE_ARRAY value from raw bytes.
void write_int32(int32_t val)
Write a single INT32 value (4 bytes little-endian).
ColumnWriter(PhysicalType type)
Construct a writer for the given Parquet physical type.
void write_double(double val)
Write a single DOUBLE value (8 bytes little-endian, IEEE 754).
void write(const T &val)
Write a single value, dispatching to the correct typed write method.
void write_fixed_len_byte_array(const uint8_t *data, size_t len)
Write a single FIXED_LEN_BYTE_ARRAY value from raw bytes.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
void append_le64(std::vector< uint8_t > &buf, uint64_t val)
Append a uint64_t in little-endian byte order to a byte buffer.
void append_le32(std::vector< uint8_t > &buf, uint32_t val)
Append a uint32_t in little-endian byte order to a byte buffer.
Per-column-chunk statistics tracker and little-endian byte helpers.
Parquet format enumerations, type traits, and statistics structs.