Signet Forge 0.1.1
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
column_writer.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
12
13#include "signet/types.hpp"
14#include "signet/statistics.hpp"
15
16#include <cstdint>
17#include <cstring>
18#include <stdexcept>
19#include <string>
20#include <type_traits>
21#include <vector>
22
23namespace signet::forge {
24
25// ---------------------------------------------------------------------------
26// Little-endian append helpers
27// ---------------------------------------------------------------------------
28
/// Append a uint32_t in little-endian byte order to a byte buffer.
///
/// @param buf Destination buffer; grows by exactly four bytes.
/// @param val Value to serialise, least-significant byte first.
inline void append_le32(std::vector<uint8_t>& buf, uint32_t val) {
    // Walk the value one octet at a time, starting with the low byte.
    for (unsigned shift = 0; shift < 32; shift += 8) {
        buf.push_back(static_cast<uint8_t>((val >> shift) & 0xFFu));
    }
}
38
/// Append a uint64_t in little-endian byte order to a byte buffer.
///
/// @param buf Destination buffer; grows by exactly eight bytes.
/// @param val Value to serialise, least-significant byte first.
inline void append_le64(std::vector<uint8_t>& buf, uint64_t val) {
    // Peel off the low byte, then shift the next one into position.
    for (int remaining = 8; remaining > 0; --remaining) {
        buf.push_back(static_cast<uint8_t>(val & 0xFFu));
        val >>= 8;
    }
}
52
67public:
72 explicit ColumnWriter(PhysicalType type, int32_t type_length = -1)
73 : type_(type), type_length_(type_length), num_values_(0) {}
74
75 // -- Typed write methods --------------------------------------------------
76
83 void write_bool(bool val) {
84 size_t bit_index = static_cast<size_t>(num_values_);
85 size_t byte_index = bit_index / 8;
86 size_t bit_offset = bit_index % 8;
87
88 // Extend the buffer if we need a new byte
89 if (byte_index >= buf_.size()) {
90 buf_.push_back(0);
91 }
92
93 if (val) {
94 buf_[byte_index] |= static_cast<uint8_t>(1u << bit_offset);
95 }
96
97 stats_.update(val);
98 ++num_values_;
99 }
100
103 void write_int32(int32_t val) {
104 uint32_t bits;
105 std::memcpy(&bits, &val, sizeof(bits));
106 append_le32(buf_, bits);
107
108 stats_.update(val);
109 ++num_values_;
110 }
111
114 void write_int64(int64_t val) {
115 uint64_t bits;
116 std::memcpy(&bits, &val, sizeof(bits));
117 append_le64(buf_, bits);
118
119 stats_.update(val);
120 ++num_values_;
121 }
122
125 void write_float(float val) {
126 uint32_t bits;
127 std::memcpy(&bits, &val, sizeof(bits));
128 append_le32(buf_, bits);
129
130 stats_.update(val);
131 ++num_values_;
132 }
133
136 void write_double(double val) {
137 uint64_t bits;
138 std::memcpy(&bits, &val, sizeof(bits));
139 append_le64(buf_, bits);
140
141 stats_.update(val);
142 ++num_values_;
143 }
144
150 void write_byte_array(const std::string& val) {
151 write_byte_array(reinterpret_cast<const uint8_t*>(val.data()), val.size());
152 }
153
160 void write_byte_array(const uint8_t* data, size_t len) {
161 // CWE-190: Integer Overflow — Parquet BYTE_ARRAY uses 4-byte LE length prefix;
162 // payloads > 4 GiB would truncate the length field, corrupting the output.
163 if (len > static_cast<size_t>(UINT32_MAX)) {
164 // H14: BYTE_ARRAY length prefix is a 4-byte LE uint32; reject > 4 GiB payloads.
165 // Throw instead of silently dropping the value, which would corrupt the output.
166 throw std::length_error("BYTE_ARRAY value exceeds 4 GiB limit");
167 }
168 append_le32(buf_, static_cast<uint32_t>(len));
169 buf_.insert(buf_.end(), data, data + len);
170
171 // Update statistics with the string representation
172 std::string str_val(reinterpret_cast<const char*>(data), len);
173 stats_.update(str_val);
174 ++num_values_;
175 }
176
187 void write_fixed_len_byte_array(const uint8_t* data, size_t len) {
188 // CWE-130: Improper Handling of Length Parameter Inconsistency —
189 // Validate that len matches the schema's type_length to prevent
190 // silent data corruption in the column chunk.
191 if (type_length_ > 0 && len != static_cast<size_t>(type_length_)) {
192 throw std::length_error(
193 "FIXED_LEN_BYTE_ARRAY value length " + std::to_string(len) +
194 " != schema type_length " + std::to_string(type_length_));
195 }
196 buf_.insert(buf_.end(), data, data + len);
197
198 // Update statistics with the string representation
199 std::string str_val(reinterpret_cast<const char*>(data), len);
200 stats_.update(str_val);
201 ++num_values_;
202 }
203
204 // -- Template convenience dispatchers ------------------------------------
205
/// Write a single value, dispatching to the correct typed write method.
///
/// Supported T: bool, int32_t, int64_t, float, double and std::string
/// (std::string goes to write_byte_array). Any other T fails to compile.
///
/// @tparam T  Value type; selects the typed writer at compile time.
/// @param val Value to append.
template <typename T>
void write(const T& val) {
    constexpr bool kIsBool   = std::is_same_v<T, bool>;
    constexpr bool kIsInt32  = std::is_same_v<T, int32_t>;
    constexpr bool kIsInt64  = std::is_same_v<T, int64_t>;
    constexpr bool kIsFloat  = std::is_same_v<T, float>;
    constexpr bool kIsDouble = std::is_same_v<T, double>;
    constexpr bool kIsString = std::is_same_v<T, std::string>;

    // Depends on T, so it only fires when an unsupported T is instantiated.
    static_assert(kIsBool || kIsInt32 || kIsInt64 ||
                      kIsFloat || kIsDouble || kIsString,
                  "ColumnWriter::write: unsupported type");

    if constexpr (kIsBool) {
        write_bool(val);
    } else if constexpr (kIsInt32) {
        write_int32(val);
    } else if constexpr (kIsInt64) {
        write_int64(val);
    } else if constexpr (kIsFloat) {
        write_float(val);
    } else if constexpr (kIsDouble) {
        write_double(val);
    } else if constexpr (kIsString) {
        write_byte_array(val);
    }
}
230
231 // -- Batch write methods -------------------------------------------------
232
238 template <typename T>
239 void write_batch(const T* values, size_t count) {
240 for (size_t i = 0; i < count; ++i) {
241 write<T>(values[i]);
242 }
243 }
244
248 void write_batch(const std::string* values, size_t count) {
249 for (size_t i = 0; i < count; ++i) {
250 write_byte_array(values[i]);
251 }
252 }
253
254 // -- Access encoded data -------------------------------------------------
255
257 [[nodiscard]] const std::vector<uint8_t>& data() const { return buf_; }
258
260 [[nodiscard]] size_t encoded_size() const { return buf_.size(); }
261
263 [[nodiscard]] int64_t num_values() const { return num_values_; }
264
265 // -- Statistics -----------------------------------------------------------
266
268 [[nodiscard]] const ColumnStatistics& statistics() const { return stats_; }
269
270 // -- Reset ----------------------------------------------------------------
271
273 void reset() {
274 buf_.clear();
275 stats_.reset();
276 num_values_ = 0;
277 }
278
280 [[nodiscard]] PhysicalType type() const { return type_; }
281
282private:
283 PhysicalType type_;
284 int32_t type_length_;
285 std::vector<uint8_t> buf_;
286 ColumnStatistics stats_;
287 int64_t num_values_;
288};
289
290} // namespace signet::forge
Per-column-chunk statistics tracker.
void reset()
Reset all statistics to initial state.
void update(const T &value)
Update statistics with a non-null typed value.
PLAIN encoding writer for a single Parquet column.
const ColumnStatistics & statistics() const
Returns a const reference to the column statistics.
void write_int64(int64_t val)
Write a single INT64 value (8 bytes little-endian).
void write_batch(const std::string *values, size_t count)
Write a batch of string values (BYTE_ARRAY).
size_t encoded_size() const
Returns the total encoded data size in bytes.
void write_byte_array(const std::string &val)
Write a single BYTE_ARRAY value from a std::string.
void write_bool(bool val)
Write a single boolean value.
ColumnWriter(PhysicalType type, int32_t type_length=-1)
Construct a writer for the given Parquet physical type.
void reset()
Reset the writer for the next column chunk. Clears all data and statistics.
int64_t num_values() const
Returns the number of values written so far.
void write_batch(const T *values, size_t count)
Write a batch of typed values.
const std::vector< uint8_t > & data() const
Returns a const reference to the encoded byte buffer.
void write_float(float val)
Write a single FLOAT value (4 bytes little-endian, IEEE 754).
PhysicalType type() const
Returns the physical type this writer encodes.
void write_byte_array(const uint8_t *data, size_t len)
Write a single BYTE_ARRAY value from raw bytes.
void write_int32(int32_t val)
Write a single INT32 value (4 bytes little-endian).
void write_double(double val)
Write a single DOUBLE value (8 bytes little-endian, IEEE 754).
void write(const T &val)
Write a single value, dispatching to the correct typed write method.
void write_fixed_len_byte_array(const uint8_t *data, size_t len)
Write a single FIXED_LEN_BYTE_ARRAY value from raw bytes.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
void append_le64(std::vector< uint8_t > &buf, uint64_t val)
Append a uint64_t in little-endian byte order to a byte buffer.
void append_le32(std::vector< uint8_t > &buf, uint32_t val)
Append a uint32_t in little-endian byte order to a byte buffer.
Per-column-chunk statistics tracker and little-endian byte helpers.
Parquet format enumerations, type traits, and statistics structs.