/// @brief Append a uint32_t in little-endian byte order to a byte buffer.
///
/// Exactly four bytes are pushed onto the end of @p buf, least-significant
/// byte first. Bytes are extracted with shifts and masks, so the output does
/// not depend on the byte order of the host.
///
/// @param buf Destination buffer; existing contents are left untouched.
/// @param val Value to serialize.
inline void append_le32(std::vector<uint8_t>& buf, uint32_t val) {
    for (unsigned shift = 0; shift < 32; shift += 8) {
        buf.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
    }
}
/// @brief Append a uint64_t in little-endian byte order to a byte buffer.
///
/// Exactly eight bytes are pushed onto the end of @p buf, least-significant
/// byte first. Bytes are extracted with shifts and masks, so the output does
/// not depend on the byte order of the host.
///
/// @param buf Destination buffer; existing contents are left untouched.
/// @param val Value to serialize.
inline void append_le64(std::vector<uint8_t>& buf, uint64_t val) {
    for (unsigned shift = 0; shift < 64; shift += 8) {
        buf.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
    }
}
73 : type_(
type), type_length_(type_length), num_values_(0) {}
84 size_t bit_index =
static_cast<size_t>(num_values_);
85 size_t byte_index = bit_index / 8;
86 size_t bit_offset = bit_index % 8;
89 if (byte_index >= buf_.size()) {
94 buf_[byte_index] |=
static_cast<uint8_t
>(1u << bit_offset);
105 std::memcpy(&bits, &val,
sizeof(bits));
116 std::memcpy(&bits, &val,
sizeof(bits));
127 std::memcpy(&bits, &val,
sizeof(bits));
138 std::memcpy(&bits, &val,
sizeof(bits));
163 if (len >
static_cast<size_t>(UINT32_MAX)) {
166 throw std::length_error(
"BYTE_ARRAY value exceeds 4 GiB limit");
169 buf_.insert(buf_.end(),
data,
data + len);
172 std::string str_val(
reinterpret_cast<const char*
>(
data), len);
191 if (type_length_ > 0 && len !=
static_cast<size_t>(type_length_)) {
192 throw std::length_error(
193 "FIXED_LEN_BYTE_ARRAY value length " + std::to_string(len) +
194 " != schema type_length " + std::to_string(type_length_));
196 buf_.insert(buf_.end(),
data,
data + len);
199 std::string str_val(
reinterpret_cast<const char*
>(
data), len);
211 template <
typename T>
213 if constexpr (std::is_same_v<T, bool>) {
215 }
else if constexpr (std::is_same_v<T, int32_t>) {
217 }
else if constexpr (std::is_same_v<T, int64_t>) {
219 }
else if constexpr (std::is_same_v<T, float>) {
221 }
else if constexpr (std::is_same_v<T, double>) {
223 }
else if constexpr (std::is_same_v<T, std::string>) {
226 static_assert(!std::is_same_v<T, T>,
227 "ColumnWriter::write: unsupported type");
238 template <
typename T>
240 for (
size_t i = 0; i < count; ++i) {
249 for (
size_t i = 0; i < count; ++i) {
257 [[nodiscard]]
const std::vector<uint8_t>&
data()
const {
return buf_; }
263 [[nodiscard]] int64_t
num_values()
const {
return num_values_; }
284 int32_t type_length_;
285 std::vector<uint8_t> buf_;
Per-column-chunk statistics tracker.
void reset()
Reset all statistics to initial state.
void update(const T &value)
Update statistics with a non-null typed value.
PLAIN encoding writer for a single Parquet column.
const ColumnStatistics & statistics() const
Returns a const reference to the column statistics.
void write_int64(int64_t val)
Write a single INT64 value (8 bytes little-endian).
void write_batch(const std::string *values, size_t count)
Write a batch of string values (BYTE_ARRAY).
size_t encoded_size() const
Returns the total encoded data size in bytes.
void write_byte_array(const std::string &val)
Write a single BYTE_ARRAY value from a std::string.
void write_bool(bool val)
Write a single boolean value.
ColumnWriter(PhysicalType type, int32_t type_length=-1)
Construct a writer for the given Parquet physical type.
void reset()
Reset the writer for the next column chunk. Clears all data and statistics.
int64_t num_values() const
Returns the number of values written so far.
void write_batch(const T *values, size_t count)
Write a batch of typed values.
const std::vector< uint8_t > & data() const
Returns a const reference to the encoded byte buffer.
void write_float(float val)
Write a single FLOAT value (4 bytes little-endian, IEEE 754).
PhysicalType type() const
Returns the physical type this writer encodes.
void write_byte_array(const uint8_t *data, size_t len)
Write a single BYTE_ARRAY value from raw bytes.
void write_int32(int32_t val)
Write a single INT32 value (4 bytes little-endian).
void write_double(double val)
Write a single DOUBLE value (8 bytes little-endian, IEEE 754).
void write(const T &val)
Write a single value, dispatching to the correct typed write method.
void write_fixed_len_byte_array(const uint8_t *data, size_t len)
Write a single FIXED_LEN_BYTE_ARRAY value from raw bytes.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
void append_le64(std::vector< uint8_t > &buf, uint64_t val)
Append a uint64_t in little-endian byte order to a byte buffer.
void append_le32(std::vector< uint8_t > &buf, uint32_t val)
Append a uint32_t in little-endian byte order to a byte buffer.
Per-column-chunk statistics tracker and little-endian byte helpers.
Parquet format enumerations, type traits, and statistics structs.