Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
types.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
5#include <cstdint>
6#include <string>
7#include <type_traits>
8#include <vector>
9
10namespace signet::forge {
11
14
/// Parquet physical (storage) types as defined in parquet.thrift.
///
/// The numeric values are the on-disk Thrift enum values and must not be
/// renumbered.
enum class PhysicalType : int32_t {
    BOOLEAN              = 0,  ///< 1-bit boolean, bit-packed in pages.
    INT32                = 1,  ///< 32-bit signed integer (little-endian).
    INT64                = 2,  ///< 64-bit signed integer (little-endian).
    INT96                = 3,  ///< 96-bit value (deprecated — legacy Impala timestamps).
    FLOAT                = 4,  ///< IEEE 754 single-precision float.
    DOUBLE               = 5,  ///< IEEE 754 double-precision float.
    BYTE_ARRAY           = 6,  ///< Variable-length byte sequence (strings, binary).
    FIXED_LEN_BYTE_ARRAY = 7   ///< Fixed-length byte array (UUID, vectors, decimals).
};
30
// <mmsystem.h> on Windows provides a TIME_MS macro; drop it here so that it
// cannot collide with the LogicalType::TIME_MS enumerator declared below.
#if defined(TIME_MS)
#  undef TIME_MS
#endif

/// Parquet logical types (from parquet.thrift LogicalType union).
enum class LogicalType : int32_t {
    NONE           = 0,    ///< No logical annotation — raw physical type.
    STRING         = 1,
    ENUM           = 2,    ///< Enum string (stored as BYTE_ARRAY).
    UUID           = 3,    ///< RFC 4122 UUID (stored as FIXED_LEN_BYTE_ARRAY(16)).
    DATE           = 4,    ///< Calendar date — INT32, days since 1970-01-01.
    TIME_MS        = 5,    ///< Time of day — INT32, milliseconds since midnight.
    TIME_US        = 6,    ///< Time of day — INT64, microseconds since midnight.
    TIME_NS        = 7,    ///< Time of day — INT64, nanoseconds since midnight.
    TIMESTAMP_MS   = 8,    ///< Timestamp — INT64, milliseconds since Unix epoch.
    TIMESTAMP_US   = 9,    ///< Timestamp — INT64, microseconds since Unix epoch.
    TIMESTAMP_NS   = 10,   ///< Timestamp — INT64, nanoseconds since Unix epoch.
    DECIMAL        = 11,   ///< Fixed-point decimal (INT32/INT64/FIXED_LEN_BYTE_ARRAY).
    JSON           = 12,
    BSON           = 13,   ///< BSON document (stored as BYTE_ARRAY).
    FLOAT16        = 14,
    FLOAT32_VECTOR = 100,  ///< ML embedding vector — FIXED_LEN_BYTE_ARRAY(dim*4).
};
61
/// Legacy Parquet converted types for backward compatibility with older readers.
///
/// The numeric values follow the ConvertedType enum in parquet.thrift; NONE
/// (-1) is the only value outside the Thrift range.
enum class ConvertedType : int32_t {
    NONE             = -1,
    UTF8             = 0,   ///< UTF-8 encoded string.
    MAP              = 1,   ///< Map (nested group).
    MAP_KEY_VALUE    = 2,   ///< Map key-value pair.
    LIST             = 3,   ///< List (nested group).
    ENUM             = 4,
    DECIMAL          = 5,
    DATE             = 6,
    TIME_MILLIS      = 7,   ///< Time in milliseconds.
    TIME_MICROS      = 8,   ///< Time in microseconds.
    TIMESTAMP_MILLIS = 9,   ///< Timestamp in milliseconds.
    TIMESTAMP_MICROS = 10,  ///< Timestamp in microseconds.
    UINT_8           = 11,  ///< Unsigned 8-bit integer.
    UINT_16          = 12,  ///< Unsigned 16-bit integer.
    UINT_32          = 13,  ///< Unsigned 32-bit integer.
    UINT_64          = 14,  ///< Unsigned 64-bit integer.
    INT_8            = 15,  ///< Signed 8-bit integer.
    INT_16           = 16,  ///< Signed 16-bit integer.
    INT_32           = 17,  ///< Signed 32-bit integer.
    INT_64           = 18,  ///< Signed 64-bit integer.
    JSON             = 19,
    BSON             = 20,
    INTERVAL         = 21
};
92
/// Parquet page encoding types.
///
/// The numeric values follow the Encoding enum in parquet.thrift (value 1 is
/// unassigned there, hence the jump from PLAIN to PLAIN_DICTIONARY).
enum class Encoding : int32_t {
    PLAIN                   = 0,  ///< Values stored back-to-back in their native binary layout.
    PLAIN_DICTIONARY        = 2,  ///< Legacy dictionary encoding (Parquet 1.0).
    RLE                     = 3,  ///< Run-length / bit-packed hybrid (used for booleans and def/rep levels).
    BIT_PACKED              = 4,  ///< Deprecated — superseded by RLE.
    DELTA_BINARY_PACKED     = 5,  ///< Delta encoding for INT32/INT64 (compact for sorted/sequential data).
    DELTA_LENGTH_BYTE_ARRAY = 6,  ///< Delta-encoded lengths + concatenated byte arrays.
    DELTA_BYTE_ARRAY        = 7,  ///< Incremental/prefix encoding for byte arrays.
    RLE_DICTIONARY          = 8,  ///< Modern dictionary encoding (Parquet 2.0) — dict page + RLE indices.
    BYTE_STREAM_SPLIT       = 9   ///< Byte-stream split for FLOAT/DOUBLE (transposes byte lanes for better compression).
};
109
/// Parquet compression codecs.
enum class Compression : int32_t {
    UNCOMPRESSED = 0,  ///< No compression.
    SNAPPY       = 1,  ///< Snappy compression (bundled, header-only).
    GZIP         = 2,  ///< Gzip/deflate compression (requires SIGNET_ENABLE_GZIP).
    LZO          = 3,  ///< LZO compression (not currently supported).
    BROTLI       = 4,  ///< Brotli compression (not currently supported).
    LZ4          = 5,  ///< LZ4 block compression (requires SIGNET_ENABLE_LZ4).
    ZSTD         = 6,  ///< Zstandard compression (requires SIGNET_ENABLE_ZSTD).
    LZ4_RAW      = 7   ///< LZ4 raw (unframed) block compression.
};
125
/// Parquet page types within a column chunk.
enum class PageType : int32_t {
    DATA_PAGE       = 0,  ///< Data page (Parquet 1.0 format).
    INDEX_PAGE      = 1,  ///< Index page (reserved, not used by Signet).
    DICTIONARY_PAGE = 2,  ///< Dictionary page — contains the value dictionary for RLE_DICTIONARY columns.
    DATA_PAGE_V2    = 3   ///< Data page v2 (Parquet 2.0 format with separate rep/def level sections).
};
133
// <sal.h> on Windows provides OPTIONAL as a SAL annotation macro; drop it here
// so that it cannot collide with the Repetition::OPTIONAL enumerator below.
#if defined(OPTIONAL)
#  undef OPTIONAL
#endif

/// Parquet field repetition types (nullability / cardinality).
enum class Repetition : int32_t {
    REQUIRED = 0,  ///< Exactly one value per row (non-nullable).
    OPTIONAL = 1,  ///< Zero or one value per row (nullable).
    REPEATED = 2   ///< Zero or more values per row (list).
};
145
161
169template <typename T> struct parquet_type_of;
170
171template <> struct parquet_type_of<bool> { static constexpr PhysicalType value = PhysicalType::BOOLEAN; };
172template <> struct parquet_type_of<int32_t> { static constexpr PhysicalType value = PhysicalType::INT32; };
173template <> struct parquet_type_of<int64_t> { static constexpr PhysicalType value = PhysicalType::INT64; };
174template <> struct parquet_type_of<float> { static constexpr PhysicalType value = PhysicalType::FLOAT; };
175template <> struct parquet_type_of<double> { static constexpr PhysicalType value = PhysicalType::DOUBLE; };
176template <> struct parquet_type_of<std::string> { static constexpr PhysicalType value = PhysicalType::BYTE_ARRAY; };
177
179template <typename T>
181
187template <PhysicalType PT> struct native_type_of;
188
189template <> struct native_type_of<PhysicalType::BOOLEAN> { using type = bool; };
190template <> struct native_type_of<PhysicalType::INT32> { using type = int32_t; };
191template <> struct native_type_of<PhysicalType::INT64> { using type = int64_t; };
192template <> struct native_type_of<PhysicalType::FLOAT> { using type = float; };
193template <> struct native_type_of<PhysicalType::DOUBLE> { using type = double; };
194template <> struct native_type_of<PhysicalType::BYTE_ARRAY> { using type = std::string; };
195
197template <PhysicalType PT>
199
/// Parquet format version written to the file footer.
inline constexpr int32_t PARQUET_VERSION{2};

/// Default "created_by" string embedded in every Parquet footer.
inline constexpr const char* SIGNET_CREATED_BY{"SignetStack signet-forge version 0.1.0"};

/// "PAR1" magic bytes (little-endian uint32) — marks a standard Parquet file.
inline constexpr uint32_t PARQUET_MAGIC{0x31524150};

/// "PARE" magic bytes (little-endian uint32) — marks a Parquet file with an
/// encrypted footer.
inline constexpr uint32_t PARQUET_MAGIC_ENCRYPTED{0x45524150};
208
221
228 int64_t file_size_bytes = 0;
229 int64_t total_rows = 0;
230 int64_t total_row_groups = 0;
233 double compression_ratio = 1.0;
234 double bytes_per_row = 0.0;
235
236 std::vector<ColumnWriteStats> columns;
237};
238
253
259struct FileStats {
260 int64_t file_size_bytes = 0;
261 int64_t total_rows = 0;
262 int64_t num_row_groups = 0;
263 int64_t num_columns = 0;
264 std::string created_by;
265 double compression_ratio = 1.0;
266 double bytes_per_row = 0.0;
267
268 std::vector<ColumnFileStats> columns;
269};
270
271} // namespace signet::forge
constexpr const char * SIGNET_CREATED_BY
Default "created_by" string embedded in every Parquet footer.
Definition types.hpp:203
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
@ INT96
96-bit value (deprecated — legacy Impala timestamps).
@ FIXED_LEN_BYTE_ARRAY
Fixed-length byte array (UUID, vectors, decimals).
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BOOLEAN
1-bit boolean, bit-packed in pages.
@ BYTE_ARRAY
Variable-length byte sequence (strings, binary).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
constexpr int32_t PARQUET_VERSION
Parquet format version written to the file footer.
Definition types.hpp:201
constexpr uint32_t PARQUET_MAGIC_ENCRYPTED
"PARE" magic bytes (little-endian uint32) — marks a Parquet file with an encrypted footer.
Definition types.hpp:207
@ STRING
Variable-length string.
@ FLOAT
32-bit IEEE float (float32)
@ DOUBLE
64-bit IEEE float (float64)
Compression
Parquet compression codecs.
Definition types.hpp:115
@ BROTLI
Brotli compression (not currently supported).
@ SNAPPY
Snappy compression (bundled, header-only).
@ LZ4_RAW
LZ4 raw (unframed) block compression.
@ LZO
LZO compression (not currently supported).
@ UNCOMPRESSED
No compression.
@ ZSTD
Zstandard compression (requires SIGNET_ENABLE_ZSTD).
@ LZ4
LZ4 block compression (requires SIGNET_ENABLE_LZ4).
@ GZIP
Gzip/deflate compression (requires SIGNET_ENABLE_GZIP).
@ JSON
Pretty-printed JSON object (default)
ConvertedType
Legacy Parquet converted types for backward compatibility with older readers.
Definition types.hpp:67
@ TIMESTAMP_MILLIS
Timestamp in milliseconds.
@ MAP_KEY_VALUE
Map key-value pair.
@ LIST
List (nested group).
@ INT_8
Signed 8-bit integer.
@ UINT_32
Unsigned 32-bit integer.
@ TIMESTAMP_MICROS
Timestamp in microseconds.
@ UINT_16
Unsigned 16-bit integer.
@ MAP
Map (nested group).
@ INT_16
Signed 16-bit integer.
@ UINT_8
Unsigned 8-bit integer.
@ TIME_MILLIS
Time in milliseconds.
@ UINT_64
Unsigned 64-bit integer.
@ INT_32
Signed 32-bit integer.
@ TIME_MICROS
Time in microseconds.
@ INT_64
Signed 64-bit integer.
@ UTF8
UTF-8 encoded string.
typename native_type_of< PT >::type native_type_of_t
Convenience alias: native_type_of_t<PhysicalType::INT64> == int64_t.
Definition types.hpp:198
LogicalType
Parquet logical types (from parquet.thrift LogicalType union).
Definition types.hpp:41
@ DECIMAL
Fixed-point decimal (INT32/INT64/FIXED_LEN_BYTE_ARRAY).
@ TIMESTAMP_NS
Timestamp — INT64, nanoseconds since Unix epoch.
@ UUID
RFC 4122 UUID (stored as FIXED_LEN_BYTE_ARRAY(16)).
@ DATE
Calendar date — INT32, days since 1970-01-01.
@ ENUM
Enum string (stored as BYTE_ARRAY).
@ TIME_MS
Time of day — INT32, milliseconds since midnight.
@ TIME_NS
Time of day — INT64, nanoseconds since midnight.
@ NONE
No logical annotation — raw physical type.
@ TIME_US
Time of day — INT64, microseconds since midnight.
@ BSON
BSON document (stored as BYTE_ARRAY).
@ TIMESTAMP_MS
Timestamp — INT64, milliseconds since Unix epoch.
@ FLOAT32_VECTOR
ML embedding vector — FIXED_LEN_BYTE_ARRAY(dim*4).
@ TIMESTAMP_US
Timestamp — INT64, microseconds since Unix epoch.
constexpr uint32_t PARQUET_MAGIC
"PAR1" magic bytes (little-endian uint32) — marks a standard Parquet file.
Definition types.hpp:205
constexpr PhysicalType parquet_type_of_v
Convenience variable template: parquet_type_of_v<double> == PhysicalType::DOUBLE.
Definition types.hpp:180
Encoding
Parquet page encoding types.
Definition types.hpp:98
@ DELTA_BINARY_PACKED
Delta encoding for INT32/INT64 (compact for sorted/sequential data).
@ RLE
Run-length / bit-packed hybrid (used for booleans and def/rep levels).
@ RLE_DICTIONARY
Modern dictionary encoding (Parquet 2.0) — dict page + RLE indices.
@ BIT_PACKED
Deprecated — superseded by RLE.
@ DELTA_BYTE_ARRAY
Incremental/prefix encoding for byte arrays.
@ DELTA_LENGTH_BYTE_ARRAY
Delta-encoded lengths + concatenated byte arrays.
@ PLAIN_DICTIONARY
Legacy dictionary encoding (Parquet 1.0).
@ PLAIN
Values stored back-to-back in their native binary layout.
@ BYTE_STREAM_SPLIT
Byte-stream split for FLOAT/DOUBLE (transposes byte lanes for better compression).
@ INT64
Signed 64-bit integer.
@ INT32
Signed 32-bit integer.
@ FLOAT16
IEEE 754 half-precision (2 bytes)
PageType
Parquet page types within a column chunk.
Definition types.hpp:127
@ DATA_PAGE_V2
Data page v2 (Parquet 2.0 format with separate rep/def level sections).
@ INDEX_PAGE
Index page (reserved, not used by Signet).
@ DICTIONARY_PAGE
Dictionary page — contains the value dictionary for RLE_DICTIONARY columns.
@ DATA_PAGE
Data page (Parquet 1.0 format).
Repetition
Parquet field repetition types (nullability / cardinality).
Definition types.hpp:140
@ REPEATED
Zero or more values per row (list).
@ OPTIONAL
Zero or one value per row (nullable).
@ REQUIRED
Exactly one value per row (non-nullable).
Descriptor for a single column in a Parquet schema.
Definition types.hpp:152
int32_t type_length
Byte length for FIXED_LEN_BYTE_ARRAY columns (-1 = N/A).
Definition types.hpp:157
LogicalType logical_type
Semantic annotation (STRING, TIMESTAMP_NS, etc.).
Definition types.hpp:155
Repetition repetition
Nullability / cardinality.
Definition types.hpp:156
std::string name
Column name (unique within a schema).
Definition types.hpp:153
int32_t scale
Decimal scale (-1 = N/A).
Definition types.hpp:159
PhysicalType physical_type
On-disk storage type.
Definition types.hpp:154
int32_t precision
Decimal precision (-1 = N/A).
Definition types.hpp:158
Per-column statistics from ParquetReader::file_stats().
Definition types.hpp:241
Compression compression
Compression codec.
Definition types.hpp:245
bool has_page_index
Whether column/offset index is present.
Definition types.hpp:251
std::string column_name
Column name.
Definition types.hpp:242
PhysicalType physical_type
Storage type.
Definition types.hpp:243
int64_t uncompressed_bytes
Total uncompressed size.
Definition types.hpp:246
int64_t num_values
Total value count.
Definition types.hpp:248
bool has_bloom_filter
Whether a bloom filter is present.
Definition types.hpp:250
int64_t null_count
Total null count.
Definition types.hpp:249
int64_t compressed_bytes
Total compressed size.
Definition types.hpp:247
LogicalType logical_type
Logical annotation.
Definition types.hpp:244
Per-column statistics produced by ParquetWriter::close().
Definition types.hpp:211
PhysicalType physical_type
Storage type used on disk.
Definition types.hpp:213
std::string column_name
Column name from the schema.
Definition types.hpp:212
int64_t null_count
Number of null values.
Definition types.hpp:219
int64_t uncompressed_bytes
Total uncompressed data size (bytes).
Definition types.hpp:216
int64_t compressed_bytes
Total compressed data size (bytes).
Definition types.hpp:217
Encoding encoding
Encoding applied to data pages.
Definition types.hpp:214
int64_t num_values
Number of values written.
Definition types.hpp:218
Compression compression
Compression codec applied.
Definition types.hpp:215
Aggregate file-level statistics returned by ParquetReader::file_stats().
Definition types.hpp:259
int64_t total_rows
Total rows in the file.
Definition types.hpp:261
double compression_ratio
Overall uncompressed / compressed ratio.
Definition types.hpp:265
std::string created_by
"created_by" string from the footer.
Definition types.hpp:264
std::vector< ColumnFileStats > columns
Per-column statistics.
Definition types.hpp:268
int64_t num_columns
Number of columns.
Definition types.hpp:263
int64_t num_row_groups
Number of row groups.
Definition types.hpp:262
double bytes_per_row
Average file bytes per row.
Definition types.hpp:266
int64_t file_size_bytes
Total file size on disk (bytes).
Definition types.hpp:260
File-level write statistics returned by ParquetWriter::close().
Definition types.hpp:227
int64_t file_size_bytes
Total on-disk file size (bytes).
Definition types.hpp:228
int64_t total_compressed_bytes
Sum of compressed page sizes.
Definition types.hpp:232
double bytes_per_row
Average file bytes per row.
Definition types.hpp:234
std::vector< ColumnWriteStats > columns
Per-column statistics.
Definition types.hpp:236
double compression_ratio
Ratio of uncompressed / compressed (>= 1.0).
Definition types.hpp:233
int64_t total_uncompressed_bytes
Sum of uncompressed page sizes.
Definition types.hpp:231
int64_t total_rows
Total rows written across all row groups.
Definition types.hpp:229
int64_t total_row_groups
Number of row groups in the file.
Definition types.hpp:230
Maps a Parquet PhysicalType back to its corresponding C++ native type.
Definition types.hpp:187
Maps a C++ type to its corresponding Parquet PhysicalType at compile time.
Definition types.hpp:169