Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
schema.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
5#include "signet/types.hpp"
6
7#include <optional>
8#include <stdexcept>
9#include <string>
10#include <type_traits>
11#include <unordered_set>
12#include <vector>
13
14// Windows <sal.h> defines OPTIONAL as a SAL annotation macro — undefine.
15#ifdef OPTIONAL
16#undef OPTIONAL
17#endif
18
19namespace signet::forge {
20
23
38template <typename T>
39struct Column {
40 std::string name;
42
45 Column(std::string n) : name(std::move(n)) {
46 apply_default_logical_type();
47 }
48
52 Column(std::string n, LogicalType lt) : name(std::move(n)), logical_type(lt) {
54 apply_default_logical_type();
55 }
56 }
57
60 [[nodiscard]] ColumnDescriptor to_descriptor() const {
62 cd.name = name;
63 cd.physical_type = parquet_type_of_v<T>;
65 return cd;
66 }
67
68private:
70 void apply_default_logical_type() {
71 if constexpr (std::is_same_v<T, std::string>) {
74 }
75 }
76 }
77};
78
79class Schema; // Forward declaration.
80
93public:
96 explicit SchemaBuilder(std::string name) : name_(std::move(name)) {}
97
106 template <typename T>
108 LogicalType logical_type = LogicalType::NONE) {
110 cd.name = std::move(col_name);
111 cd.physical_type = parquet_type_of_v<T>;
112 cd.logical_type = logical_type;
113
114 // Default std::string → STRING if no explicit logical type
115 if constexpr (std::is_same_v<T, std::string>) {
116 if (cd.logical_type == LogicalType::NONE) {
117 cd.logical_type = LogicalType::STRING;
118 }
119 }
120
121 columns_.push_back(std::move(cd));
122 return *this;
123 }
124
131 template <typename T>
133 LogicalType logical_type,
134 Repetition repetition) {
136 cd.name = std::move(col_name);
137 cd.physical_type = parquet_type_of_v<T>;
138 cd.logical_type = logical_type;
139 cd.repetition = repetition;
140
141 if constexpr (std::is_same_v<T, std::string>) {
142 if (cd.logical_type == LogicalType::NONE) {
143 cd.logical_type = LogicalType::STRING;
144 }
145 }
146
147 columns_.push_back(std::move(cd));
148 return *this;
149 }
150
156 template <typename T>
158 LogicalType logical_type = LogicalType::NONE) {
159 return column<T>(std::move(col_name), logical_type, Repetition::OPTIONAL);
160 }
161
169 columns_.push_back(std::move(cd));
170 return *this;
171 }
172
178 [[nodiscard]] Schema build();
179
180private:
181 std::string name_;
182 std::vector<ColumnDescriptor> columns_;
183};
184
192class Schema {
193public:
195 Schema() = default;
196
200 Schema(std::string name, std::vector<ColumnDescriptor> columns)
201 : name_(std::move(name)), columns_(std::move(columns)) {}
202
216 template <typename... Cols>
217 [[nodiscard]] static Schema build(std::string name, Cols&&... cols) {
218 std::vector<ColumnDescriptor> descs;
219 descs.reserve(sizeof...(Cols));
220 (descs.push_back(std::forward<Cols>(cols).to_descriptor()), ...);
221 return Schema(std::move(name), std::move(descs));
222 }
223
228 [[nodiscard]] static SchemaBuilder builder(std::string name) {
229 return SchemaBuilder(std::move(name));
230 }
231
232 // -- Accessors -------------------------------------------------------------
233
235 [[nodiscard]] const std::string& name() const { return name_; }
236
238 [[nodiscard]] size_t num_columns() const { return columns_.size(); }
239
244 [[nodiscard]] const ColumnDescriptor& column(size_t index) const {
245 if (index >= columns_.size()) {
246 throw std::out_of_range("Schema::column: index "
247 + std::to_string(index) + " out of range (num_columns="
248 + std::to_string(columns_.size()) + ")");
249 }
250 return columns_[index];
251 }
252
254 [[nodiscard]] const std::vector<ColumnDescriptor>& columns() const {
255 return columns_;
256 }
257
261 [[nodiscard]] std::optional<size_t> find_column(const std::string& col_name) const {
262 for (size_t i = 0; i < columns_.size(); ++i) {
263 if (columns_[i].name == col_name) {
264 return i;
265 }
266 }
267 return std::nullopt;
268 }
269
272 [[nodiscard]] bool operator==(const Schema& other) const {
273 if (name_ != other.name_ || columns_.size() != other.columns_.size()) {
274 return false;
275 }
276 for (size_t i = 0; i < columns_.size(); ++i) {
277 const auto& a = columns_[i];
278 const auto& b = other.columns_[i];
279 if (a.name != b.name ||
284 return false;
285 }
286 }
287 return true;
288 }
289
291 [[nodiscard]] bool operator!=(const Schema& other) const {
292 return !(*this == other);
293 }
294
295private:
296 std::string name_;
297 std::vector<ColumnDescriptor> columns_;
298};
299
300// ---------------------------------------------------------------------------
301// SchemaBuilder::build() — defined after Schema is complete
302// ---------------------------------------------------------------------------
304 // Detect duplicate column names at build time
305 std::unordered_set<std::string> seen;
306 seen.reserve(columns_.size());
307 for (const auto& cd : columns_) {
308 if (!seen.insert(cd.name).second) {
309 throw std::invalid_argument(
310 "Schema::build: duplicate column name '" + cd.name + "'");
311 }
312 }
313 return Schema(std::move(name_), std::move(columns_));
314}
315
316} // namespace signet::forge
Fluent builder for constructing a Schema one column at a time.
Definition schema.hpp:92
SchemaBuilder & column(std::string col_name, LogicalType logical_type=LogicalType::NONE)
Add a typed column, deducing PhysicalType from T.
Definition schema.hpp:107
SchemaBuilder(std::string name)
Construct a builder with the given root schema name.
Definition schema.hpp:96
Schema build()
Build the final Schema, consuming the builder.
Definition schema.hpp:303
SchemaBuilder & optional_column(std::string col_name, LogicalType logical_type=LogicalType::NONE)
Add an optional (nullable) column — shorthand for Repetition::OPTIONAL.
Definition schema.hpp:157
SchemaBuilder & column(std::string col_name, LogicalType logical_type, Repetition repetition)
Add a column with an explicit repetition level.
Definition schema.hpp:132
SchemaBuilder & raw_column(ColumnDescriptor cd)
Add a pre-built ColumnDescriptor directly.
Definition schema.hpp:168
Immutable schema description for a Parquet file.
Definition schema.hpp:192
const std::vector< ColumnDescriptor > & columns() const
All column descriptors (ordered).
Definition schema.hpp:254
static Schema build(std::string name, Cols &&... cols)
Build a Schema from typed Column<T> descriptors (variadic factory).
Definition schema.hpp:217
static SchemaBuilder builder(std::string name)
Create a SchemaBuilder for fluent column construction.
Definition schema.hpp:228
Schema(std::string name, std::vector< ColumnDescriptor > columns)
Construct a schema directly from a name and column list.
Definition schema.hpp:200
bool operator==(const Schema &other) const
Equality — schemas match if they have the same name and identical columns (name, physical_type,...
Definition schema.hpp:272
bool operator!=(const Schema &other) const
Inequality operator.
Definition schema.hpp:291
size_t num_columns() const
Number of columns in this schema.
Definition schema.hpp:238
Schema()=default
Default-construct an empty schema.
const std::string & name() const
Root schema name (e.g. "tick_data").
Definition schema.hpp:235
std::optional< size_t > find_column(const std::string &col_name) const
Find a column index by name.
Definition schema.hpp:261
const ColumnDescriptor & column(size_t index) const
Access a column descriptor by index.
Definition schema.hpp:244
LogicalType
Parquet logical types (from parquet.thrift LogicalType union).
Definition types.hpp:41
@ STRING
UTF-8 string (stored as BYTE_ARRAY).
@ NONE
No logical annotation — raw physical type.
Repetition
Parquet field repetition types (nullability / cardinality).
Definition types.hpp:140
@ OPTIONAL
Zero or one value per row (nullable).
Descriptor for a single column in a Parquet schema.
Definition types.hpp:152
int32_t type_length
Byte length for FIXED_LEN_BYTE_ARRAY columns (-1 = N/A).
Definition types.hpp:157
LogicalType logical_type
Semantic annotation (STRING, TIMESTAMP_NS, etc.).
Definition types.hpp:155
Repetition repetition
Nullability / cardinality.
Definition types.hpp:156
std::string name
Column name (unique within a schema).
Definition types.hpp:153
PhysicalType physical_type
On-disk storage type.
Definition types.hpp:154
Typed column descriptor for the Schema::build() variadic API.
Definition schema.hpp:39
ColumnDescriptor to_descriptor() const
Convert to a ColumnDescriptor for Schema construction.
Definition schema.hpp:60
std::string name
Column name.
Definition schema.hpp:40
Column(std::string n, LogicalType lt)
Construct a column with an explicit logical type.
Definition schema.hpp:52
Column(std::string n)
Construct a column with a name only (logical type auto-deduced for strings).
Definition schema.hpp:45
LogicalType logical_type
Optional logical annotation.
Definition schema.hpp:41
Parquet format enumerations, type traits, and statistics structs.