Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
column_writer.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
12
13#include "signet/types.hpp"
14#include "signet/statistics.hpp"
15
16#include <cstdint>
17#include <cstring>
18#include <stdexcept>
19#include <string>
20#include <type_traits>
21#include <vector>
22
23namespace signet::forge {
24
25// ---------------------------------------------------------------------------
26// Little-endian append helpers
27// ---------------------------------------------------------------------------
28
/// Append a uint32_t in little-endian byte order to a byte buffer.
///
/// @param buf  Destination buffer; four bytes are pushed onto its end.
/// @param val  Value to serialize, least-significant byte first.
inline void append_le32(std::vector<uint8_t>& buf, uint32_t val) {
    // Emit one byte per iteration, starting with the low-order byte.
    for (unsigned shift = 0; shift < 32; shift += 8) {
        buf.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
    }
}
38
/// Append a uint64_t in little-endian byte order to a byte buffer.
///
/// @param buf  Destination buffer; eight bytes are pushed onto its end.
/// @param val  Value to serialize, least-significant byte first.
inline void append_le64(std::vector<uint8_t>& buf, uint64_t val) {
    // Emit one byte per iteration, starting with the low-order byte.
    for (unsigned shift = 0; shift < 64; shift += 8) {
        buf.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
    }
}
52
67public:
71 : type_(type), num_values_(0) {}
72
73 // -- Typed write methods --------------------------------------------------
74
81 void write_bool(bool val) {
82 size_t bit_index = static_cast<size_t>(num_values_);
83 size_t byte_index = bit_index / 8;
84 size_t bit_offset = bit_index % 8;
85
86 // Extend the buffer if we need a new byte
87 if (byte_index >= buf_.size()) {
88 buf_.push_back(0);
89 }
90
91 if (val) {
92 buf_[byte_index] |= static_cast<uint8_t>(1u << bit_offset);
93 }
94
95 stats_.update(val);
96 ++num_values_;
97 }
98
101 void write_int32(int32_t val) {
102 uint32_t bits;
103 std::memcpy(&bits, &val, sizeof(bits));
104 append_le32(buf_, bits);
105
106 stats_.update(val);
107 ++num_values_;
108 }
109
112 void write_int64(int64_t val) {
113 uint64_t bits;
114 std::memcpy(&bits, &val, sizeof(bits));
115 append_le64(buf_, bits);
116
117 stats_.update(val);
118 ++num_values_;
119 }
120
123 void write_float(float val) {
124 uint32_t bits;
125 std::memcpy(&bits, &val, sizeof(bits));
126 append_le32(buf_, bits);
127
128 stats_.update(val);
129 ++num_values_;
130 }
131
134 void write_double(double val) {
135 uint64_t bits;
136 std::memcpy(&bits, &val, sizeof(bits));
137 append_le64(buf_, bits);
138
139 stats_.update(val);
140 ++num_values_;
141 }
142
148 void write_byte_array(const std::string& val) {
149 write_byte_array(reinterpret_cast<const uint8_t*>(val.data()), val.size());
150 }
151
158 void write_byte_array(const uint8_t* data, size_t len) {
159 // CWE-190: Integer Overflow — Parquet BYTE_ARRAY uses 4-byte LE length prefix;
160 // payloads > 4 GiB would truncate the length field, corrupting the output.
161 if (len > static_cast<size_t>(UINT32_MAX)) {
162 // H14: BYTE_ARRAY length prefix is a 4-byte LE uint32; reject > 4 GiB payloads.
163 // Throw instead of silently dropping the value, which would corrupt the output.
164 throw std::length_error("BYTE_ARRAY value exceeds 4 GiB limit");
165 }
166 append_le32(buf_, static_cast<uint32_t>(len));
167 buf_.insert(buf_.end(), data, data + len);
168
169 // Update statistics with the string representation
170 std::string str_val(reinterpret_cast<const char*>(data), len);
171 stats_.update(str_val);
172 ++num_values_;
173 }
174
185 void write_fixed_len_byte_array(const uint8_t* data, size_t len) {
186 buf_.insert(buf_.end(), data, data + len);
187
188 // Update statistics with the string representation
189 std::string str_val(reinterpret_cast<const char*>(data), len);
190 stats_.update(str_val);
191 ++num_values_;
192 }
193
194 // -- Template convenience dispatchers ------------------------------------
195
/// Write a single value, dispatching to the correct typed write method.
///
/// Supported types are bool, int32_t, int64_t, float, double and
/// std::string; any other T is rejected at compile time.
///
/// @tparam T   Type of the value being written.
/// @param val  The value to append.
template <typename T>
void write(const T& val) {
    constexpr bool is_boolean = std::is_same_v<T, bool>;
    constexpr bool is_i32     = std::is_same_v<T, int32_t>;
    constexpr bool is_i64     = std::is_same_v<T, int64_t>;
    constexpr bool is_f32     = std::is_same_v<T, float>;
    constexpr bool is_f64     = std::is_same_v<T, double>;
    constexpr bool is_string  = std::is_same_v<T, std::string>;

    // The predicates are mutually exclusive, so branch order is irrelevant.
    if constexpr (is_string) {
        write_byte_array(val);
    } else if constexpr (is_f64) {
        write_double(val);
    } else if constexpr (is_f32) {
        write_float(val);
    } else if constexpr (is_i64) {
        write_int64(val);
    } else if constexpr (is_i32) {
        write_int32(val);
    } else if constexpr (is_boolean) {
        write_bool(val);
    } else {
        // Always-false but dependent on T, so it only fires when this branch
        // is actually instantiated.
        static_assert(!std::is_same_v<T, T>,
                      "ColumnWriter::write: unsupported type");
    }
}
220
221 // -- Batch write methods -------------------------------------------------
222
228 template <typename T>
229 void write_batch(const T* values, size_t count) {
230 for (size_t i = 0; i < count; ++i) {
231 write<T>(values[i]);
232 }
233 }
234
238 void write_batch(const std::string* values, size_t count) {
239 for (size_t i = 0; i < count; ++i) {
240 write_byte_array(values[i]);
241 }
242 }
243
244 // -- Access encoded data -------------------------------------------------
245
247 [[nodiscard]] const std::vector<uint8_t>& data() const { return buf_; }
248
250 [[nodiscard]] size_t encoded_size() const { return buf_.size(); }
251
253 [[nodiscard]] int64_t num_values() const { return num_values_; }
254
255 // -- Statistics -----------------------------------------------------------
256
258 [[nodiscard]] const ColumnStatistics& statistics() const { return stats_; }
259
260 // -- Reset ----------------------------------------------------------------
261
263 void reset() {
264 buf_.clear();
265 stats_.reset();
266 num_values_ = 0;
267 }
268
270 [[nodiscard]] PhysicalType type() const { return type_; }
271
272private:
273 PhysicalType type_;
274 std::vector<uint8_t> buf_;
275 ColumnStatistics stats_;
276 int64_t num_values_;
277};
278
279} // namespace signet::forge
Per-column-chunk statistics tracker.
void reset()
Reset all statistics to initial state.
void update(const T &value)
Update statistics with a non-null typed value.
PLAIN encoding writer for a single Parquet column.
const ColumnStatistics & statistics() const
Returns a const reference to the column statistics.
void write_int64(int64_t val)
Write a single INT64 value (8 bytes little-endian).
void write_batch(const std::string *values, size_t count)
Write a batch of string values (BYTE_ARRAY).
size_t encoded_size() const
Returns the total encoded data size in bytes.
void write_byte_array(const std::string &val)
Write a single BYTE_ARRAY value from a std::string.
void write_bool(bool val)
Write a single boolean value.
void reset()
Reset the writer for the next column chunk. Clears all data and statistics.
int64_t num_values() const
Returns the number of values written so far.
void write_batch(const T *values, size_t count)
Write a batch of typed values.
const std::vector< uint8_t > & data() const
Returns a const reference to the encoded byte buffer.
void write_float(float val)
Write a single FLOAT value (4 bytes little-endian, IEEE 754).
PhysicalType type() const
Returns the physical type this writer encodes.
void write_byte_array(const uint8_t *data, size_t len)
Write a single BYTE_ARRAY value from raw bytes.
void write_int32(int32_t val)
Write a single INT32 value (4 bytes little-endian).
ColumnWriter(PhysicalType type)
Construct a writer for the given Parquet physical type.
void write_double(double val)
Write a single DOUBLE value (8 bytes little-endian, IEEE 754).
void write(const T &val)
Write a single value, dispatching to the correct typed write method.
void write_fixed_len_byte_array(const uint8_t *data, size_t len)
Write a single FIXED_LEN_BYTE_ARRAY value from raw bytes.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
void append_le64(std::vector< uint8_t > &buf, uint64_t val)
Append a uint64_t in little-endian byte order to a byte buffer.
void append_le32(std::vector< uint8_t > &buf, uint32_t val)
Append a uint32_t in little-endian byte order to a byte buffer.
Per-column-chunk statistics tracker and little-endian byte helpers.
Parquet format enumerations, type traits, and statistics structs.