52 return static_cast<uint32_t
>(v) ^ 0x80000000u;
59 return static_cast<uint64_t
>(v) ^ 0x8000000000000000ULL;
72 std::memcpy(&bits, &v, 4);
73 uint32_t mask = -
static_cast<int32_t
>(bits >> 31) | 0x80000000u;
85 std::memcpy(&bits, &v, 8);
86 uint64_t mask = -
static_cast<int64_t
>(bits >> 63) | 0x8000000000000000ULL;
104 for (
size_t i = 0; i < 4 && i < v.size(); ++i) {
105 result |=
static_cast<uint32_t
>(
static_cast<uint8_t
>(v[i])) << (24 - 8 * i);
118 return static_cast<uint32_t
>(v >> 32);
135 auto spread = [](uint32_t v) -> uint64_t {
137 r = (r | (r << 16)) & 0x0000FFFF0000FFFFULL;
138 r = (r | (r << 8)) & 0x00FF00FF00FF00FFULL;
139 r = (r | (r << 4)) & 0x0F0F0F0F0F0F0F0FULL;
140 r = (r | (r << 2)) & 0x3333333333333333ULL;
141 r = (r | (r << 1)) & 0x5555555555555555ULL;
144 return spread(x) | (spread(y) << 1);
156 auto compact = [](uint64_t v) -> uint32_t {
157 v &= 0x5555555555555555ULL;
158 v = (v | (v >> 1)) & 0x3333333333333333ULL;
159 v = (v | (v >> 2)) & 0x0F0F0F0F0F0F0F0FULL;
160 v = (v | (v >> 4)) & 0x00FF00FF00FF00FFULL;
161 v = (v | (v >> 8)) & 0x0000FFFF0000FFFFULL;
162 v = (v | (v >> 16)) & 0x00000000FFFFFFFFULL;
163 return static_cast<uint32_t
>(v);
166 y = compact(code >> 1);
183inline std::vector<uint8_t>
morton_nd(
const std::vector<uint32_t>& normalized,
184 size_t bits_per_col = 32) {
185 size_t n = normalized.size();
186 if (n == 0)
return {};
188 size_t total_bits = n * bits_per_col;
189 size_t total_bytes = (total_bits + 7) / 8;
190 std::vector<uint8_t> result(total_bytes, 0);
195 for (
size_t b = 0; b < bits_per_col; ++b) {
196 size_t src_bit = bits_per_col - 1 - b;
197 for (
size_t c = 0; c < n; ++c) {
198 if (normalized[c] & (1u << src_bit)) {
199 size_t byte_idx = out_bit / 8;
200 size_t bit_in_byte = 7 - (out_bit % 8);
201 result[byte_idx] |=
static_cast<uint8_t
>(1u << bit_in_byte);
248 [[nodiscard]]
static std::vector<size_t>
sort(
250 const std::vector<ZOrderColumn>& columns) {
252 if (num_rows == 0 || columns.empty())
return {};
254 size_t n_cols = columns.size();
257 for (
size_t c = 0; c < n_cols; ++c) {
258 if (columns[c].count != num_rows) {
259 throw std::out_of_range(
260 "ZOrderSorter::sort: column " + std::to_string(c) +
261 " count (" + std::to_string(columns[c].count) +
262 ") != num_rows (" + std::to_string(num_rows) +
")");
267 std::vector<std::vector<uint32_t>> normalized(n_cols);
268 for (
size_t c = 0; c < n_cols; ++c) {
269 normalized[c].resize(num_rows);
270 normalize_column(columns[c], normalized[c]);
275 std::vector<uint64_t> keys(num_rows);
276 for (
size_t r = 0; r < num_rows; ++r) {
277 keys[r] =
morton_2d(normalized[0][r], normalized[1][r]);
280 std::vector<size_t> perm(num_rows);
281 std::iota(perm.begin(), perm.end(), 0);
282 std::sort(perm.begin(), perm.end(),
283 [&keys](
size_t a,
size_t b) { return keys[a] < keys[b]; });
288 std::vector<std::vector<uint8_t>> keys(num_rows);
289 std::vector<uint32_t> row_vals(n_cols);
290 for (
size_t r = 0; r < num_rows; ++r) {
291 for (
size_t c = 0; c < n_cols; ++c) {
292 row_vals[c] = normalized[c][r];
297 std::vector<size_t> perm(num_rows);
298 std::iota(perm.begin(), perm.end(), 0);
299 std::sort(perm.begin(), perm.end(),
300 [&keys](
size_t a,
size_t b) { return keys[a] < keys[b]; });
315 std::vector<uint32_t>& out) {
316 if (col.
data ==
nullptr) {
317 throw std::invalid_argument(
318 "ZOrderSorter: column data pointer is null");
323 if (
reinterpret_cast<uintptr_t
>(col.
data) %
alignof(int32_t) != 0) {
324 throw std::invalid_argument(
325 "ZOrderSorter: INT32 data pointer not aligned for int32_t");
327 const auto* vals =
static_cast<const int32_t*
>(col.
data);
328 for (
size_t i = 0; i < col.
count; ++i) {
335 if (
reinterpret_cast<uintptr_t
>(col.
data) %
alignof(int64_t) != 0) {
336 throw std::invalid_argument(
337 "ZOrderSorter: INT64 data pointer not aligned for int64_t");
339 const auto* vals =
static_cast<const int64_t*
>(col.
data);
340 for (
size_t i = 0; i < col.
count; ++i) {
347 if (
reinterpret_cast<uintptr_t
>(col.
data) %
alignof(
float) != 0) {
348 throw std::invalid_argument(
349 "ZOrderSorter: FLOAT data pointer not aligned for float");
351 const auto* vals =
static_cast<const float*
>(col.
data);
352 for (
size_t i = 0; i < col.
count; ++i) {
359 if (
reinterpret_cast<uintptr_t
>(col.
data) %
alignof(
double) != 0) {
360 throw std::invalid_argument(
361 "ZOrderSorter: DOUBLE data pointer not aligned for double");
363 const auto* vals =
static_cast<const double*
>(col.
data);
364 for (
size_t i = 0; i < col.
count; ++i) {
371 const auto* vals =
static_cast<const std::string*
>(col.
data);
372 for (
size_t i = 0; i < col.
count; ++i) {
379 for (
size_t i = 0; i < col.
count; ++i) {
Z-order curve (Morton code) utilities for spatial sort keys.
uint64_t normalize_double(double v)
Normalize a 64-bit double to uint64_t preserving total order.
uint32_t normalize_string(std::string_view v)
Normalize a string to uint32_t by taking the first 4 bytes in big-endian order.
uint32_t normalize_int32(int32_t v)
Normalize a signed 32-bit integer to an unsigned 32-bit integer that preserves the original sort orde...
uint64_t morton_2d(uint32_t x, uint32_t y)
Interleave bits of two uint32_t values into a single uint64_t Morton code (2D).
uint32_t normalize_float(float v)
Normalize a 32-bit float to uint32_t preserving total order.
uint64_t normalize_int64(int64_t v)
Normalize a signed 64-bit integer to uint64_t (flip sign bit).
std::vector< uint8_t > morton_nd(const std::vector< uint32_t > &normalized, size_t bits_per_col=32)
Generalized N-column Morton code via round-robin bit interleaving (MSB-first).
uint32_t truncate_to_32(uint64_t v)
Truncate a uint64_t to uint32_t by extracting the upper 32 bits.
void deinterleave_2d(uint64_t code, uint32_t &x, uint32_t &y)
Deinterleave a 2D Morton code back into its two uint32_t components.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BYTE_ARRAY
Variable-length byte sequence (strings, binary).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
Descriptor for a single column of raw typed data used by ZOrderSorter.
size_t count
Number of elements in the array (must equal num_rows).
PhysicalType type
Parquet physical type of the column data.
const void * data
Pointer to a contiguous typed array (not owned).
Computes a permutation vector that reorders rows by Z-order (Morton) key.
static std::vector< size_t > sort(size_t num_rows, const std::vector< ZOrderColumn > &columns)
Sort row indices [0..num_rows) by Z-order key computed from columns.
Parquet format enumerations, type traits, and statistics structs.