Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
z_order.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
28
29#include "signet/types.hpp"
30
31#include <algorithm>
32#include <cstdint>
33#include <cstring>
34#include <numeric>
35#include <stdexcept>
36#include <string>
37#include <string_view>
38#include <vector>
39
42
43// ===========================================================================
44// Value normalization -- preserve sort order as unsigned integers
45// ===========================================================================
46
51inline uint32_t normalize_int32(int32_t v) {
52 return static_cast<uint32_t>(v) ^ 0x80000000u;
53}
54
58inline uint64_t normalize_int64(int64_t v) {
59 return static_cast<uint64_t>(v) ^ 0x8000000000000000ULL;
60}
61
70inline uint32_t normalize_float(float v) {
71 uint32_t bits;
72 std::memcpy(&bits, &v, 4);
73 uint32_t mask = -static_cast<int32_t>(bits >> 31) | 0x80000000u;
74 return bits ^ mask;
75}
76
83inline uint64_t normalize_double(double v) {
84 uint64_t bits;
85 std::memcpy(&bits, &v, 8);
86 uint64_t mask = -static_cast<int64_t>(bits >> 63) | 0x8000000000000000ULL;
87 return bits ^ mask;
88}
89
102inline uint32_t normalize_string(std::string_view v) {
103 uint32_t result = 0;
104 for (size_t i = 0; i < 4 && i < v.size(); ++i) {
105 result |= static_cast<uint32_t>(static_cast<uint8_t>(v[i])) << (24 - 8 * i);
106 }
107 return result;
108}
109
117inline uint32_t truncate_to_32(uint64_t v) {
118 return static_cast<uint32_t>(v >> 32);
119}
120
121// ===========================================================================
122// Morton code — bit interleaving
123// ===========================================================================
124
134inline uint64_t morton_2d(uint32_t x, uint32_t y) {
135 auto spread = [](uint32_t v) -> uint64_t {
136 uint64_t r = v;
137 r = (r | (r << 16)) & 0x0000FFFF0000FFFFULL;
138 r = (r | (r << 8)) & 0x00FF00FF00FF00FFULL;
139 r = (r | (r << 4)) & 0x0F0F0F0F0F0F0F0FULL;
140 r = (r | (r << 2)) & 0x3333333333333333ULL;
141 r = (r | (r << 1)) & 0x5555555555555555ULL;
142 return r;
143 };
144 return spread(x) | (spread(y) << 1);
145}
146
155inline void deinterleave_2d(uint64_t code, uint32_t& x, uint32_t& y) {
156 auto compact = [](uint64_t v) -> uint32_t {
157 v &= 0x5555555555555555ULL;
158 v = (v | (v >> 1)) & 0x3333333333333333ULL;
159 v = (v | (v >> 2)) & 0x0F0F0F0F0F0F0F0FULL;
160 v = (v | (v >> 4)) & 0x00FF00FF00FF00FFULL;
161 v = (v | (v >> 8)) & 0x0000FFFF0000FFFFULL;
162 v = (v | (v >> 16)) & 0x00000000FFFFFFFFULL;
163 return static_cast<uint32_t>(v);
164 };
165 x = compact(code);
166 y = compact(code >> 1);
167}
168
183inline std::vector<uint8_t> morton_nd(const std::vector<uint32_t>& normalized,
184 size_t bits_per_col = 32) {
185 size_t n = normalized.size();
186 if (n == 0) return {};
187
188 size_t total_bits = n * bits_per_col;
189 size_t total_bytes = (total_bits + 7) / 8;
190 std::vector<uint8_t> result(total_bytes, 0);
191
192 // Round-robin interleave: bit position `b` of column `c` maps to
193 // output bit position `b * n + c`, with MSB first.
194 size_t out_bit = 0;
195 for (size_t b = 0; b < bits_per_col; ++b) {
196 size_t src_bit = bits_per_col - 1 - b; // MSB first
197 for (size_t c = 0; c < n; ++c) {
198 if (normalized[c] & (1u << src_bit)) {
199 size_t byte_idx = out_bit / 8;
200 size_t bit_in_byte = 7 - (out_bit % 8);
201 result[byte_idx] |= static_cast<uint8_t>(1u << bit_in_byte);
202 }
203 ++out_bit;
204 }
205 }
206
207 return result;
208}
209
210// ===========================================================================
211// ZOrderColumn -- describes a column's data for Z-order sorting
212// ===========================================================================
213
222 const void* data;
223 size_t count;
224};
225
226// ===========================================================================
227// ZOrderSorter -- sorts row indices by Morton key
228// ===========================================================================
229
248 [[nodiscard]] static std::vector<size_t> sort(
249 size_t num_rows,
250 const std::vector<ZOrderColumn>& columns) {
251
252 if (num_rows == 0 || columns.empty()) return {};
253
254 size_t n_cols = columns.size();
255
256 // Validate that each column's count matches num_rows (CWE-787: OOB write)
257 for (size_t c = 0; c < n_cols; ++c) {
258 if (columns[c].count != num_rows) {
259 throw std::out_of_range(
260 "ZOrderSorter::sort: column " + std::to_string(c) +
261 " count (" + std::to_string(columns[c].count) +
262 ") != num_rows (" + std::to_string(num_rows) + ")");
263 }
264 }
265
266 // Normalize all values to uint32_t
267 std::vector<std::vector<uint32_t>> normalized(n_cols);
268 for (size_t c = 0; c < n_cols; ++c) {
269 normalized[c].resize(num_rows);
270 normalize_column(columns[c], normalized[c]);
271 }
272
273 // Fast path: 2 columns — use morton_2d with uint64_t sort keys
274 if (n_cols == 2) {
275 std::vector<uint64_t> keys(num_rows);
276 for (size_t r = 0; r < num_rows; ++r) {
277 keys[r] = morton_2d(normalized[0][r], normalized[1][r]);
278 }
279
280 std::vector<size_t> perm(num_rows);
281 std::iota(perm.begin(), perm.end(), 0);
282 std::sort(perm.begin(), perm.end(),
283 [&keys](size_t a, size_t b) { return keys[a] < keys[b]; });
284 return perm;
285 }
286
287 // General path: N columns — use morton_nd byte-array sort keys
288 std::vector<std::vector<uint8_t>> keys(num_rows);
289 std::vector<uint32_t> row_vals(n_cols);
290 for (size_t r = 0; r < num_rows; ++r) {
291 for (size_t c = 0; c < n_cols; ++c) {
292 row_vals[c] = normalized[c][r];
293 }
294 keys[r] = morton_nd(row_vals);
295 }
296
297 std::vector<size_t> perm(num_rows);
298 std::iota(perm.begin(), perm.end(), 0);
299 std::sort(perm.begin(), perm.end(),
300 [&keys](size_t a, size_t b) { return keys[a] < keys[b]; });
301 return perm;
302 }
303
304private:
314 static void normalize_column(const ZOrderColumn& col,
315 std::vector<uint32_t>& out) {
316 if (col.data == nullptr) {
317 throw std::invalid_argument(
318 "ZOrderSorter: column data pointer is null");
319 }
320 switch (col.type) {
321 case PhysicalType::INT32: {
322 // CWE-704, C++ [basic.align] — alignment check before pointer cast
323 if (reinterpret_cast<uintptr_t>(col.data) % alignof(int32_t) != 0) {
324 throw std::invalid_argument(
325 "ZOrderSorter: INT32 data pointer not aligned for int32_t");
326 }
327 const auto* vals = static_cast<const int32_t*>(col.data);
328 for (size_t i = 0; i < col.count; ++i) {
329 out[i] = normalize_int32(vals[i]);
330 }
331 break;
332 }
333 case PhysicalType::INT64: {
334 // CWE-704, C++ [basic.align] — alignment check before pointer cast
335 if (reinterpret_cast<uintptr_t>(col.data) % alignof(int64_t) != 0) {
336 throw std::invalid_argument(
337 "ZOrderSorter: INT64 data pointer not aligned for int64_t");
338 }
339 const auto* vals = static_cast<const int64_t*>(col.data);
340 for (size_t i = 0; i < col.count; ++i) {
341 out[i] = truncate_to_32(normalize_int64(vals[i]));
342 }
343 break;
344 }
345 case PhysicalType::FLOAT: {
346 // CWE-704, C++ [basic.align] — alignment check before pointer cast
347 if (reinterpret_cast<uintptr_t>(col.data) % alignof(float) != 0) {
348 throw std::invalid_argument(
349 "ZOrderSorter: FLOAT data pointer not aligned for float");
350 }
351 const auto* vals = static_cast<const float*>(col.data);
352 for (size_t i = 0; i < col.count; ++i) {
353 out[i] = normalize_float(vals[i]);
354 }
355 break;
356 }
358 // CWE-704, C++ [basic.align] — alignment check before pointer cast
359 if (reinterpret_cast<uintptr_t>(col.data) % alignof(double) != 0) {
360 throw std::invalid_argument(
361 "ZOrderSorter: DOUBLE data pointer not aligned for double");
362 }
363 const auto* vals = static_cast<const double*>(col.data);
364 for (size_t i = 0; i < col.count; ++i) {
365 out[i] = truncate_to_32(normalize_double(vals[i]));
366 }
367 break;
368 }
370 // Interpret data as array of std::string pointers
371 const auto* vals = static_cast<const std::string*>(col.data);
372 for (size_t i = 0; i < col.count; ++i) {
373 out[i] = normalize_string(vals[i]);
374 }
375 break;
376 }
377 default: {
378 // For unsupported types, use zero (all rows sort equally)
379 for (size_t i = 0; i < col.count; ++i) {
380 out[i] = 0;
381 }
382 break;
383 }
384 }
385 }
386};
387
388} // namespace signet::forge::z_order
Z-order curve (Morton code) utilities for spatial sort keys.
Definition z_order.hpp:41
uint64_t normalize_double(double v)
Normalize a 64-bit double to uint64_t preserving total order.
Definition z_order.hpp:83
uint32_t normalize_string(std::string_view v)
Normalize a string to uint32_t by taking the first 4 bytes in big-endian order.
Definition z_order.hpp:102
uint32_t normalize_int32(int32_t v)
Normalize a signed 32-bit integer to an unsigned 32-bit integer that preserves the original sort orde...
Definition z_order.hpp:51
uint64_t morton_2d(uint32_t x, uint32_t y)
Interleave bits of two uint32_t values into a single uint64_t Morton code (2D).
Definition z_order.hpp:134
uint32_t normalize_float(float v)
Normalize a 32-bit float to uint32_t preserving total order.
Definition z_order.hpp:70
uint64_t normalize_int64(int64_t v)
Normalize a signed 64-bit integer to uint64_t (flip sign bit).
Definition z_order.hpp:58
std::vector< uint8_t > morton_nd(const std::vector< uint32_t > &normalized, size_t bits_per_col=32)
Generalized N-column Morton code via round-robin bit interleaving (MSB-first).
Definition z_order.hpp:183
uint32_t truncate_to_32(uint64_t v)
Truncate a uint64_t to uint32_t by extracting the upper 32 bits.
Definition z_order.hpp:117
void deinterleave_2d(uint64_t code, uint32_t &x, uint32_t &y)
Deinterleave a 2D Morton code back into its two uint32_t components.
Definition z_order.hpp:155
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BYTE_ARRAY
Variable-length byte sequence (strings, binary).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
Descriptor for a single column of raw typed data used by ZOrderSorter.
Definition z_order.hpp:220
size_t count
Number of elements in the array (must equal num_rows).
Definition z_order.hpp:223
PhysicalType type
Parquet physical type of the column data.
Definition z_order.hpp:221
const void * data
Pointer to a contiguous typed array (not owned).
Definition z_order.hpp:222
Computes a permutation vector that reorders rows by Z-order (Morton) key.
Definition z_order.hpp:240
static std::vector< size_t > sort(size_t num_rows, const std::vector< ZOrderColumn > &columns)
Sort row indices [0..num_rows) by Z-order key computed from columns.
Definition z_order.hpp:248
Parquet format enumerations, type traits, and statistics structs.