Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
tensor_bridge.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3
9
10#pragma once
11
12// ---------------------------------------------------------------------------
13// tensor_bridge.hpp — Zero-Copy Tensor Bridge for SignetStack Signet Forge
14//
15// Maps Parquet column data directly into ML-framework-compatible tensor views
16// without copying. Provides:
17//
18// TensorDataType — enum mapping to common ML framework element types
19// TensorShape — N-dimensional shape descriptor
20// TensorView — non-owning, zero-copy view into contiguous memory
21// OwnedTensor — owning tensor with heap-allocated storage
22// ColumnToTensor — Parquet column data -> tensor conversion
23// BatchTensorBuilder — multi-column feature batch assembly
24//
25// Header-only. Part of the signet::forge AI module.
26// ---------------------------------------------------------------------------
27
28#include "signet/types.hpp"
29#include "signet/error.hpp"
30
31#include <algorithm>
32#include <cassert>
33#include <cstddef>
34#include <cstdlib>
35#include <stdexcept>
36#include <cstdint>
37#include <cstring>
38#include <limits>
39#include <memory>
40#include <new>
41#include <numeric>
42#include <string>
43#include <type_traits>
44#include <vector>
45
46#ifdef _WIN32
47#include <malloc.h>
48#endif
49
50namespace signet::forge {
51
52namespace detail {
53
// NOTE(review): Doxygen extraction artifact — the enclosing class
// declaration (original line 55) and the rebind member (lines 64-66) are
// hyperlinked lines lost from this listing; code below is kept verbatim.
//
// Minimal C++ Allocator for over-aligned storage: _aligned_malloc on
// Windows, posix_memalign elsewhere. Stateless — all instances compare
// equal (is_always_equal = true_type), so containers may swap allocators
// freely.
54template <typename T, std::size_t Alignment>
 56public:
 57 using value_type = T;
 58 using size_type = std::size_t;
 59 using difference_type = std::ptrdiff_t;
 61 using is_always_equal = std::true_type;
 62
 63 template <typename U>
 67
 68 AlignedAllocator() noexcept = default;
 69
// Converting constructor: any AlignedAllocator with the same Alignment is
// interchangeable (required by the Allocator named requirements).
 70 template <typename U>
 71 AlignedAllocator(const AlignedAllocator<U, Alignment>&) noexcept {}
 72
// Allocates storage for n objects of T aligned to Alignment bytes.
// Returns nullptr for n == 0; throws std::bad_alloc on overflow of the
// byte count or on allocation failure.
 73 [[nodiscard]] T* allocate(std::size_t n) {
 74 static_assert(Alignment >= alignof(void*), "alignment must satisfy allocator requirements");
 75 static_assert((Alignment & (Alignment - 1)) == 0, "alignment must be a power of two");
 76 if (n == 0) return nullptr;
// Parenthesized (max)() defeats any max() macro (e.g. from windows.h).
 77 if (n > (std::numeric_limits<std::size_t>::max)() / sizeof(T)) {
 78 throw std::bad_alloc();
 79 }
 80
 81 void* ptr = nullptr;
 82 const std::size_t bytes = n * sizeof(T);
 83#ifdef _WIN32
 84 ptr = _aligned_malloc(bytes, Alignment);
 85 if (!ptr) throw std::bad_alloc();
 86#else
 87 if (::posix_memalign(&ptr, Alignment, bytes) != 0) {
 88 throw std::bad_alloc();
 89 }
 90#endif
 91 return static_cast<T*>(ptr);
 92 }
 93
// Releases memory from allocate(); must pair _aligned_malloc with
// _aligned_free on Windows and posix_memalign with free elsewhere.
 94 void deallocate(T* ptr, std::size_t) noexcept {
 95#ifdef _WIN32
 96 _aligned_free(ptr);
 97#else
 98 std::free(ptr);
 99#endif
 100 }
 101
// Stateless allocators always compare equal.
 102 template <typename U>
 103 [[nodiscard]] bool operator==(const AlignedAllocator<U, Alignment>&) const noexcept {
 104 return true;
 105 }
 106
 107 template <typename U>
 108 [[nodiscard]] bool operator!=(const AlignedAllocator<U, Alignment>&) const noexcept {
 109 return false;
 110 }
 111};
112
/// True when `ptr` is non-null and its address is a multiple of alignof(T),
/// i.e. the pointer may legally be dereferenced as a T*.
template <typename T>
[[nodiscard]] inline bool is_pointer_aligned(const void* ptr) noexcept {
    const auto address = reinterpret_cast<std::uintptr_t>(ptr);
    return ptr != nullptr && address % alignof(T) == 0;
}
118
/// Reinterprets `ptr` as T* when it satisfies T's alignment requirement;
/// yields nullptr for a null or misaligned pointer (alignment check
/// inlined so this helper is self-contained).
template <typename T>
[[nodiscard]] inline T* aligned_ptr(void* ptr) noexcept {
    if (ptr == nullptr) return nullptr;
    const bool aligned = reinterpret_cast<std::uintptr_t>(ptr) % alignof(T) == 0;
    return aligned ? static_cast<T*>(ptr) : nullptr;
}
123
/// Const overload: reinterprets `ptr` as const T* when it satisfies T's
/// alignment requirement; nullptr for a null or misaligned pointer.
template <typename T>
[[nodiscard]] inline const T* aligned_ptr(const void* ptr) noexcept {
    if (ptr == nullptr) return nullptr;
    const bool aligned = reinterpret_cast<std::uintptr_t>(ptr) % alignof(T) == 0;
    return aligned ? static_cast<const T*>(ptr) : nullptr;
}
128
/// Computes `base + offset` (byte arithmetic) and returns the result as a
/// T* when the final address satisfies T's alignment; nullptr otherwise.
template <typename T>
[[nodiscard]] inline T* aligned_ptr_at(void* base, std::size_t offset) noexcept {
    auto* byte_ptr = static_cast<std::uint8_t*>(base) + offset;
    if (byte_ptr == nullptr) return nullptr;
    const bool aligned =
        reinterpret_cast<std::uintptr_t>(byte_ptr) % alignof(T) == 0;
    return aligned ? reinterpret_cast<T*>(byte_ptr) : nullptr;
}
134
/// Const overload: `base + offset` (byte arithmetic) as const T* when the
/// final address satisfies T's alignment; nullptr otherwise.
template <typename T>
[[nodiscard]] inline const T* aligned_ptr_at(const void* base, std::size_t offset) noexcept {
    auto* byte_ptr = static_cast<const std::uint8_t*>(base) + offset;
    if (byte_ptr == nullptr) return nullptr;
    const bool aligned =
        reinterpret_cast<std::uintptr_t>(byte_ptr) % alignof(T) == 0;
    return aligned ? reinterpret_cast<const T*>(byte_ptr) : nullptr;
}
140
141} // namespace detail
142
143// ===========================================================================
144// TensorDataType — element data types for tensor storage
145// ===========================================================================
146
// Element data types for tensor storage. Backed by int32_t with explicit,
// contiguous values (0..8) so the tags are stable across ABI boundaries.
148enum class TensorDataType : int32_t {
 149 FLOAT32 = 0,
 150 FLOAT64 = 1,
 151 INT32 = 2,
 152 INT64 = 3,
 153 INT8 = 4,
 154 UINT8 = 5,
 155 INT16 = 6,
 156 FLOAT16 = 7, // storage-only here: no native C++ half type in this header
 157 BOOL = 8
 158};
159
160// ===========================================================================
161// tensor_element_size — bytes per element for a given TensorDataType
162// ===========================================================================
163
165inline constexpr size_t tensor_element_size(TensorDataType dtype) noexcept {
166 switch (dtype) {
167 case TensorDataType::FLOAT32: return 4;
168 case TensorDataType::FLOAT64: return 8;
169 case TensorDataType::INT32: return 4;
170 case TensorDataType::INT64: return 8;
171 case TensorDataType::INT8: return 1;
172 case TensorDataType::UINT8: return 1;
173 case TensorDataType::INT16: return 2;
174 case TensorDataType::FLOAT16: return 2;
175 case TensorDataType::BOOL: return 1;
176 }
177 return 0; // unreachable
178}
179
181inline const char* tensor_dtype_name(TensorDataType dtype) noexcept {
182 switch (dtype) {
183 case TensorDataType::FLOAT32: return "float32";
184 case TensorDataType::FLOAT64: return "float64";
185 case TensorDataType::INT32: return "int32";
186 case TensorDataType::INT64: return "int64";
187 case TensorDataType::INT8: return "int8";
188 case TensorDataType::UINT8: return "uint8";
189 case TensorDataType::INT16: return "int16";
190 case TensorDataType::FLOAT16: return "float16";
191 case TensorDataType::BOOL: return "bool";
192 }
193 return "unknown";
194}
195
196// ===========================================================================
197// TensorShape — N-dimensional shape descriptor
198// ===========================================================================
199
// NOTE(review): the enclosing "struct TensorShape" declaration line was a
// hyperlink lost in extraction; members below are kept verbatim.
// Dimension extents, outermost first. Empty vector => rank-0 (scalar).
 208 std::vector<int64_t> dims;
 209
// Default: rank-0 shape (num_elements() treats it as a single element).
 211 TensorShape() = default;
 212
// Adopts an existing dims vector; explicit to block accidental implicit
// conversion from std::vector<int64_t>.
 215 explicit TensorShape(std::vector<int64_t> d) : dims(std::move(d)) {}
 216
// Brace-list construction, e.g. TensorShape{rows, cols}.
 219 TensorShape(std::initializer_list<int64_t> il) : dims(il) {}
220
223 [[nodiscard]] int64_t num_elements() const noexcept {
224 if (dims.empty()) return 1;
225 int64_t product = 1;
226 for (auto d : dims) {
227 if (d <= 0) return -1; // error sentinel: non-positive dimension
228 if (product > INT64_MAX / d) return -1; // overflow sentinel
229 product *= d;
230 }
231 return product;
232 }
233
// Number of dimensions (rank).
 235 [[nodiscard]] size_t ndim() const noexcept { return dims.size(); }
 236
// Scalar means rank 0, or rank 1 with a single element.
 238 [[nodiscard]] bool is_scalar() const noexcept {
 239 return dims.empty() || (dims.size() == 1 && dims[0] == 1);
 240 }
 241
// Rank-1 shape.
 243 [[nodiscard]] bool is_vector() const noexcept { return dims.size() == 1; }
 244
// Rank-2 shape.
 246 [[nodiscard]] bool is_matrix() const noexcept { return dims.size() == 2; }
 247
// Shapes compare by exact dimension-vector equality.
 249 [[nodiscard]] bool operator==(const TensorShape& other) const {
 250 return dims == other.dims;
 251 }
 252
 254 [[nodiscard]] bool operator!=(const TensorShape& other) const {
 255 return dims != other.dims;
 256 }
 257};
258
259// ===========================================================================
260// TensorView — non-owning, zero-copy view into contiguous tensor memory
261// ===========================================================================
262
275public:
277 TensorView() = default;
278
287 size_t byte_stride = 0) noexcept
288 : data_(data)
289 , shape_(std::move(shape))
290 , dtype_(dtype)
291 , byte_stride_(byte_stride) {}
292
296 size_t byte_stride = 0) noexcept
297 : data_(const_cast<void*>(data))
298 , shape_(std::move(shape))
299 , dtype_(dtype)
300 , byte_stride_(byte_stride) {}
301
 302 // -- Raw data access ------------------------------------------------------
 303
// Untyped pointers to the first element of the viewed memory.
 305 [[nodiscard]] void* data() noexcept { return data_; }
 307 [[nodiscard]] const void* data() const noexcept { return data_; }
 308
// Typed pointer to the viewed memory; yields nullptr when the stored
// pointer is null or does not satisfy alignof(T) (see detail::aligned_ptr).
 312 template <typename T>
 313 [[nodiscard]] T* typed_data() noexcept {
 314 return detail::aligned_ptr<T>(data_);
 315 }
 316
// Const counterpart of typed_data().
 319 template <typename T>
 320 [[nodiscard]] const T* typed_data() const noexcept {
 321 return detail::aligned_ptr<T>(data_);
 322 }
323
 324 // -- Shape and type info --------------------------------------------------
 325
// Shape descriptor and element dtype of this view.
 327 [[nodiscard]] const TensorShape& shape() const noexcept { return shape_; }
 329 [[nodiscard]] TensorDataType dtype() const noexcept { return dtype_; }
 330
// Bytes per element, derived from the dtype.
 332 [[nodiscard]] size_t element_size() const noexcept {
 333 return tensor_element_size(dtype_);
 334 }
 335
// Total element count; forwards the shape's -1 sentinel on invalid shapes.
 337 [[nodiscard]] int64_t num_elements() const noexcept {
 338 return shape_.num_elements();
 339 }
 340
// Dense payload size in bytes; 0 when the shape is invalid (n <= 0 covers
// the -1 sentinel from num_elements()).
 343 [[nodiscard]] size_t byte_size() const noexcept {
 344 const int64_t n = num_elements();
 345 if (n <= 0) return 0;
 346 return static_cast<size_t>(n) * element_size();
 347 }
348
351 [[nodiscard]] size_t effective_byte_stride() const noexcept {
352 if (byte_stride_ != 0) return byte_stride_;
353 // Dense stride: product of dims[1..] * element_size
354 if (shape_.ndim() <= 1) return element_size();
355 size_t inner_size = element_size();
356 for (size_t i = 1; i < shape_.ndim(); ++i) {
357 inner_size *= static_cast<size_t>(shape_.dims[i]);
358 }
359 return inner_size;
360 }
361
362 // -- Typed element accessors ----------------------------------------------
363
369 template <typename T>
370 [[nodiscard]] T& at(int64_t i) {
371 if (data_ == nullptr || i < 0 || i >= num_elements())
372 throw std::out_of_range("TensorView::at(i): index out of range");
373 if (byte_stride_ != 0) {
374 auto* elem = detail::aligned_ptr_at<T>(data_, static_cast<size_t>(i) * byte_stride_);
375 if (elem == nullptr)
376 throw std::runtime_error("TensorView::at(i): misaligned tensor access");
377 return *elem;
378 }
379 auto* ptr = typed_data<T>();
380 if (ptr == nullptr)
381 throw std::runtime_error("TensorView::at(i): misaligned tensor access");
382 return ptr[i];
383 }
384
390 template <typename T>
391 [[nodiscard]] const T& at(int64_t i) const {
392 if (data_ == nullptr || i < 0 || i >= num_elements())
393 throw std::out_of_range("TensorView::at(i): index out of range");
394 if (byte_stride_ != 0) {
395 const auto* elem = detail::aligned_ptr_at<T>(data_, static_cast<size_t>(i) * byte_stride_);
396 if (elem == nullptr)
397 throw std::runtime_error("TensorView::at(i): misaligned tensor access");
398 return *elem;
399 }
400 const auto* ptr = typed_data<T>();
401 if (ptr == nullptr)
402 throw std::runtime_error("TensorView::at(i): misaligned tensor access");
403 return ptr[i];
404 }
405
412 template <typename T>
413 [[nodiscard]] T& at(int64_t row, int64_t col) {
414 if (data_ == nullptr || shape_.ndim() != 2 ||
415 row < 0 || row >= shape_.dims[0] ||
416 col < 0 || col >= shape_.dims[1])
417 throw std::out_of_range("TensorView::at(row,col): index out of range");
418 const int64_t cols = shape_.dims[1];
419 if (byte_stride_ != 0) {
420 auto* row_ptr = detail::aligned_ptr_at<T>(data_, static_cast<size_t>(row) * byte_stride_);
421 if (row_ptr == nullptr)
422 throw std::runtime_error("TensorView::at(row,col): misaligned tensor access");
423 return row_ptr[col];
424 }
425 auto* ptr = typed_data<T>();
426 if (ptr == nullptr)
427 throw std::runtime_error("TensorView::at(row,col): misaligned tensor access");
428 return ptr[row * cols + col];
429 }
430
437 template <typename T>
438 [[nodiscard]] const T& at(int64_t row, int64_t col) const {
439 if (data_ == nullptr || shape_.ndim() != 2 ||
440 row < 0 || row >= shape_.dims[0] ||
441 col < 0 || col >= shape_.dims[1])
442 throw std::out_of_range("TensorView::at(row,col): index out of range");
443 const int64_t cols = shape_.dims[1];
444 if (byte_stride_ != 0) {
445 const auto* row_ptr = detail::aligned_ptr_at<T>(data_, static_cast<size_t>(row) * byte_stride_);
446 if (row_ptr == nullptr)
447 throw std::runtime_error("TensorView::at(row,col): misaligned tensor access");
448 return row_ptr[col];
449 }
450 const auto* ptr = typed_data<T>();
451 if (ptr == nullptr)
452 throw std::runtime_error("TensorView::at(row,col): misaligned tensor access");
453 return ptr[row * cols + col];
454 }
455
 456 // -- Predicates -----------------------------------------------------------
 457
// A stride of 0 is the "densely packed" encoding used throughout the class.
 459 [[nodiscard]] bool is_contiguous() const noexcept {
 460 return byte_stride_ == 0;
 461 }
 462
// A default-constructed (or failed-slice) view has a null data pointer.
 464 [[nodiscard]] bool is_valid() const noexcept {
 465 return data_ != nullptr;
 466 }
467
468 // -- Subview and reshape --------------------------------------------------
469
478 [[nodiscard]] TensorView slice(int64_t start, int64_t count) const {
479 if (data_ == nullptr || shape_.ndim() < 1 ||
480 start < 0 || count < 0 || start + count > shape_.dims[0])
481 return TensorView{}; // return invalid view
482
483 // Compute the byte offset to the start of the slice
484 const size_t stride = effective_byte_stride();
485 auto* base = static_cast<uint8_t*>(const_cast<void*>(data_));
486 void* slice_data = base + static_cast<size_t>(start) * stride;
487
488 // Build the new shape: replace dims[0] with count, keep the rest
489 TensorShape new_shape;
490 new_shape.dims = shape_.dims;
491 new_shape.dims[0] = count;
492
493 return TensorView(slice_data, std::move(new_shape), dtype_, byte_stride_);
494 }
495
501 [[nodiscard]] expected<TensorView> reshape(TensorShape new_shape) const {
502 if (!is_contiguous()) {
504 "cannot reshape a non-contiguous tensor view"};
505 }
506 if (new_shape.num_elements() != shape_.num_elements()) {
508 "reshape: total elements mismatch ("
509 + std::to_string(shape_.num_elements()) + " vs "
510 + std::to_string(new_shape.num_elements()) + ")"};
511 }
512 return TensorView(data_, std::move(new_shape), dtype_, 0);
513 }
514
515private:
516 void* data_ = nullptr;
517 TensorShape shape_;
519 size_t byte_stride_ = 0; // 0 = contiguous (densely packed)
520};
521
522// ===========================================================================
523// OwnedTensor — heap-allocated, owning tensor
524// ===========================================================================
525
532public:
534 OwnedTensor() = default;
535
538 : shape_(std::move(shape))
539 , dtype_(dtype) {
540 const auto num_elements = shape_.num_elements();
541 const auto element_size = tensor_element_size(dtype_);
542 if (num_elements <= 0 || static_cast<size_t>(num_elements) > SIZE_MAX / element_size) {
543 // Overflow or invalid shape — leave buffer empty (invalid tensor)
544 return;
545 }
546 const size_t sz = static_cast<size_t>(num_elements) * element_size;
547 buffer_.resize(sz, 0);
548 }
549
557 : shape_(std::move(shape))
558 , dtype_(dtype) {
559 const auto num_elements = shape_.num_elements();
560 const auto element_size = tensor_element_size(dtype_);
561 if (num_elements <= 0 || static_cast<size_t>(num_elements) > SIZE_MAX / element_size) {
562 return; // Overflow or invalid shape — leave buffer empty
563 }
564 const size_t sz = static_cast<size_t>(num_elements) * element_size;
565 buffer_.resize(sz);
566 if (data && sz > 0) {
567 std::memcpy(buffer_.data(), data, sz);
568 }
569 }
570
 571 // Move semantics
// noexcept moves let std::vector and friends relocate OwnedTensors by move.
 572 OwnedTensor(OwnedTensor&&) noexcept = default;
 573 OwnedTensor& operator=(OwnedTensor&&) noexcept = default;
 574
 575 // No implicit copy — use clone()
 576 OwnedTensor(const OwnedTensor&) = delete;
 577 OwnedTensor& operator=(const OwnedTensor&) = delete;
 578
// Explicit deep copy: duplicates the byte buffer, shape, and dtype.
 580 [[nodiscard]] OwnedTensor clone() const {
 581 OwnedTensor copy;
 582 copy.buffer_ = buffer_;
 583 copy.shape_ = shape_;
 584 copy.dtype_ = dtype_;
 585 return copy;
 586 }
587
 588 // -- View access ----------------------------------------------------------
 589
// Non-owning dense (stride 0) view over this tensor's buffer.
 591 [[nodiscard]] TensorView view() {
 592 return TensorView(buffer_.data(), shape_, dtype_, 0);
 593 }
 594
// Const view. The const_cast only feeds TensorView's internal void*
// storage; constness is re-imposed by TensorView's const accessors.
 596 [[nodiscard]] TensorView view() const {
 597 return TensorView(
 598 const_cast<uint8_t*>(buffer_.data()), shape_, dtype_, 0);
 599 }
 600
 601 // -- Data access (forwarded from view) ------------------------------------
 602
// Untyped pointers into the owned buffer.
 604 [[nodiscard]] void* data() noexcept { return buffer_.data(); }
 606 [[nodiscard]] const void* data() const noexcept { return buffer_.data(); }
 607
// Typed pointer; nullptr when the buffer start is misaligned for T
// (see detail::aligned_ptr).
 610 template <typename T>
 611 [[nodiscard]] T* typed_data() noexcept {
 612 return detail::aligned_ptr<T>(buffer_.data());
 613 }
 614
 617 template <typename T>
 618 [[nodiscard]] const T* typed_data() const noexcept {
 619 return detail::aligned_ptr<T>(buffer_.data());
 620 }
 621
// Shape and dtype metadata.
 623 [[nodiscard]] const TensorShape& shape() const noexcept { return shape_; }
 625 [[nodiscard]] TensorDataType dtype() const noexcept { return dtype_; }
 626
// Allocated buffer size in bytes.
 628 [[nodiscard]] size_t byte_size() const noexcept { return buffer_.size(); }
 629
// Element count from the shape (may be the shape's -1 sentinel).
 631 [[nodiscard]] int64_t num_elements() const noexcept {
 632 return shape_.num_elements();
 633 }
 634
// A tensor is valid iff its constructor successfully allocated storage.
 636 [[nodiscard]] bool is_valid() const noexcept { return !buffer_.empty(); }
 637
 638private:
// Byte buffer aligned to max_align_t so any element type can live in it.
 639 using Buffer = std::vector<uint8_t,
 640 detail::AlignedAllocator<uint8_t, alignof(std::max_align_t)>>;
 641
 642 Buffer buffer_;
 643 TensorShape shape_;
// NOTE(review): the dtype_ member declaration (original line 644) was a
// hyperlinked line lost in extraction.
 645};
646
647// ===========================================================================
648// ColumnToTensor — map Parquet column data to tensor representations
649// ===========================================================================
650
663public:
664 // -----------------------------------------------------------------------
665 // Zero-copy path: wrap existing column data as a TensorView
666 // -----------------------------------------------------------------------
667
682 const void* column_data,
683 int64_t num_values,
684 PhysicalType physical_type,
685 int32_t type_length = -1) {
686 if (!column_data || num_values <= 0) {
688 "wrap_column: null data or non-positive count"};
689 }
690
691 switch (physical_type) {
693 return TensorView(column_data,
694 TensorShape{num_values},
696
698 return TensorView(column_data,
699 TensorShape{num_values},
701
703 return TensorView(column_data,
704 TensorShape{num_values},
706
708 return TensorView(column_data,
709 TensorShape{num_values},
711
713 if (type_length <= 0) {
715 "wrap_column: FIXED_LEN_BYTE_ARRAY requires "
716 "positive type_length"};
717 }
718 // Expose as a 2D {num_values, type_length} uint8 view
719 return TensorView(column_data,
720 TensorShape{num_values,
721 static_cast<int64_t>(type_length)},
723 }
724
726 // Parquet booleans are bit-packed; cannot zero-copy as a
727 // byte-addressable tensor without unpacking.
729 "wrap_column: BOOLEAN columns require copy "
730 "(bit-packed, not byte-addressable)"};
731
734 "wrap_column: BYTE_ARRAY (variable-length) "
735 "cannot be zero-copy wrapped as a tensor"};
736
739 "wrap_column: INT96 is deprecated and "
740 "not supported for tensor wrapping"};
741 }
742
744 "wrap_column: unknown physical type"};
745 }
746
747 // -----------------------------------------------------------------------
748 // Zero-copy path: wrap vector column data
749 // -----------------------------------------------------------------------
750
761 const void* column_data,
762 int64_t num_vectors,
763 uint32_t dimension) {
764 if (!column_data || num_vectors <= 0) {
766 "wrap_vectors: null data or non-positive count"};
767 }
768 if (dimension == 0) {
770 "wrap_vectors: dimension must be > 0"};
771 }
772
773 return TensorView(column_data,
774 TensorShape{num_vectors,
775 static_cast<int64_t>(dimension)},
777 }
778
779 // -----------------------------------------------------------------------
780 // Copy path: read + convert column data into an OwnedTensor
781 // -----------------------------------------------------------------------
782
796 const void* column_data,
797 int64_t num_values,
798 PhysicalType physical_type,
799 TensorDataType target_dtype,
800 int32_t type_length = -1) {
801 if (!column_data || num_values <= 0) {
803 "copy_column: null data or non-positive count"};
804 }
805
806 // For BYTE_ARRAY we cannot produce a dense tensor
807 if (physical_type == PhysicalType::BYTE_ARRAY) {
809 "copy_column: BYTE_ARRAY (strings) cannot be "
810 "converted to a dense tensor"};
811 }
812 if (physical_type == PhysicalType::INT96) {
814 "copy_column: INT96 is deprecated and not supported"};
815 }
816
817 // First, try the zero-copy wrap to get a typed view of the source
818 // For FIXED_LEN_BYTE_ARRAY, we handle specially below
819 if (physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY) {
820 if (type_length <= 0) {
822 "copy_column: FIXED_LEN_BYTE_ARRAY requires "
823 "positive type_length"};
824 }
825 // Treat as flat bytes, then cast into the target dtype
826 TensorView src(column_data,
827 TensorShape{num_values * static_cast<int64_t>(type_length)},
829 // If target is UINT8, just copy directly
830 if (target_dtype == TensorDataType::UINT8) {
831 OwnedTensor out(TensorShape{num_values,
832 static_cast<int64_t>(type_length)},
833 target_dtype);
834 std::memcpy(out.data(), column_data,
835 static_cast<size_t>(num_values) *
836 static_cast<size_t>(type_length));
837 return out;
838 }
839 // Otherwise interpret as float32 vectors if type_length is a
840 // multiple of sizeof(float)
841 if (type_length % static_cast<int32_t>(sizeof(float)) == 0) {
842 int64_t dim = type_length / static_cast<int32_t>(sizeof(float));
843 TensorView float_src(column_data,
844 TensorShape{num_values, dim},
846 return cast(float_src, target_dtype);
847 }
849 "copy_column: FIXED_LEN_BYTE_ARRAY with type_length "
850 "not a multiple of 4 can only be copied as UINT8"};
851 }
852
853 // For standard numeric types, wrap then cast
854 auto src_dtype_result = parquet_to_tensor_dtype(physical_type);
855 if (!src_dtype_result) {
856 return Error{src_dtype_result.error().code,
857 src_dtype_result.error().message};
858 }
859
860 TensorDataType src_dtype = src_dtype_result.value();
861 TensorView src(column_data, TensorShape{num_values}, src_dtype);
862
863 // If source dtype matches target, just copy the bytes
864 if (src_dtype == target_dtype) {
865 return OwnedTensor(column_data,
866 TensorShape{num_values}, target_dtype);
867 }
868
869 return cast(src, target_dtype);
870 }
871
872 // -----------------------------------------------------------------------
873 // Type casting
874 // -----------------------------------------------------------------------
875
886 const TensorView& src,
887 TensorDataType target_dtype) {
888 if (!src.is_valid()) {
890 "cast: source tensor is null"};
891 }
892 if (!src.is_contiguous()) {
894 "cast: source tensor must be contiguous"};
895 }
896
897 // Same type — just copy
898 if (src.dtype() == target_dtype) {
899 return OwnedTensor(src.data(), src.shape(), target_dtype);
900 }
901
902 const int64_t n = src.num_elements();
903 OwnedTensor out(src.shape(), target_dtype);
904
905 // Dispatch on (src_dtype, target_dtype) using a helper
906 bool ok = dispatch_cast(src.data(), src.dtype(),
907 out.data(), target_dtype, n);
908 if (!ok) {
910 std::string("cast: unsupported conversion from ")
911 + tensor_dtype_name(src.dtype()) + " to "
912 + tensor_dtype_name(target_dtype)};
913 }
914
915 return out;
916 }
917
918 // -----------------------------------------------------------------------
919 // Parquet → Tensor dtype mapping
920 // -----------------------------------------------------------------------
921
924 PhysicalType pt) {
925 switch (pt) {
935 "BYTE_ARRAY has no fixed tensor type mapping"};
938 "INT96 has no tensor type mapping"};
939 }
941 "unknown PhysicalType"};
942 }
943
944private:
945 // -- Cast dispatch --------------------------------------------------------
946
/// Reads element `idx` from a raw buffer reinterpreted as a T array.
template <typename T>
static inline T read_element(const void* data, int64_t idx) {
    const T* typed = static_cast<const T*>(data);
    return typed[idx];
}
952
/// Writes `val` into slot `idx` of a raw buffer reinterpreted as T[].
template <typename T>
static inline void write_element(void* data, int64_t idx, T val) {
    T* typed = static_cast<T*>(data);
    typed[idx] = val;
}
958
/// Element-wise static_cast of n values from a Src buffer into a Dst
/// buffer; both pointers are raw and reinterpreted as typed arrays.
template <typename Src, typename Dst>
static inline void convert_loop(const void* src, void* dst, int64_t n) {
    const Src* in = static_cast<const Src*>(src);
    Dst* out = static_cast<Dst*>(dst);
    for (int64_t idx = 0; idx < n; ++idx) {
        out[idx] = static_cast<Dst>(in[idx]);
    }
}
968
// Second stage of the cast dispatch: Src is fixed, switch on the target
// dtype and run the matching convert_loop. Returns false for a target with
// no conversion here — note FLOAT16 is absent from the switch (no native
// half type in this header), so casts *to* float16 are reported unsupported.
 970 template <typename Src>
 971 static inline bool dispatch_target(const void* src, void* dst,
 972 TensorDataType target, int64_t n) {
 973 switch (target) {
 974 case TensorDataType::FLOAT32: convert_loop<Src, float>(src, dst, n); return true;
 975 case TensorDataType::FLOAT64: convert_loop<Src, double>(src, dst, n); return true;
 976 case TensorDataType::INT32: convert_loop<Src, int32_t>(src, dst, n); return true;
 977 case TensorDataType::INT64: convert_loop<Src, int64_t>(src, dst, n); return true;
 978 case TensorDataType::INT8: convert_loop<Src, int8_t>(src, dst, n); return true;
 979 case TensorDataType::UINT8: convert_loop<Src, uint8_t>(src, dst, n); return true;
 980 case TensorDataType::INT16: convert_loop<Src, int16_t>(src, dst, n); return true;
 981 case TensorDataType::BOOL: convert_loop<Src, bool>(src, dst, n); return true;
 982 default: return false;
 983 }
 984 }
985
// First stage of the cast dispatch: switch on the source dtype to pick the
// Src template argument, then delegate to dispatch_target for the target
// side. Returns false when either side is unsupported (FLOAT16 has no
// case here either, so float16 sources are reported unsupported).
 987 static inline bool dispatch_cast(const void* src, TensorDataType src_dtype,
 988 void* dst, TensorDataType target, int64_t n) {
 989 switch (src_dtype) {
 990 case TensorDataType::FLOAT32: return dispatch_target<float>(src, dst, target, n);
 991 case TensorDataType::FLOAT64: return dispatch_target<double>(src, dst, target, n);
 992 case TensorDataType::INT32: return dispatch_target<int32_t>(src, dst, target, n);
 993 case TensorDataType::INT64: return dispatch_target<int64_t>(src, dst, target, n);
 994 case TensorDataType::INT8: return dispatch_target<int8_t>(src, dst, target, n);
 995 case TensorDataType::UINT8: return dispatch_target<uint8_t>(src, dst, target, n);
 996 case TensorDataType::INT16: return dispatch_target<int16_t>(src, dst, target, n);
 997 case TensorDataType::BOOL: return dispatch_target<bool>(src, dst, target, n);
 998 default: return false;
 999 }
 1000 }
1001};
1002
1003// ===========================================================================
1004// BatchTensorBuilder — assemble multi-column feature batches for ML inference
1005// ===========================================================================
1006
1025public:
1028
1038 BatchTensorBuilder& add_column(const std::string& name,
1039 const TensorView& tensor) {
1040 columns_.push_back(ColumnEntry{name, tensor});
1041 return *this;
1042 }
1043
1048 [[nodiscard]] TensorShape expected_shape() const {
1049 if (columns_.empty()) return TensorShape{0, 0};
1050
1051 const int64_t rows = column_rows(columns_[0].tensor);
1052 int64_t total_cols = 0;
1053 for (const auto& entry : columns_) {
1054 total_cols += column_width(entry.tensor);
1055 }
1056 return TensorShape{rows, total_cols};
1057 }
1058
// Number of feature sources (columns) registered so far.
 1060 [[nodiscard]] size_t num_features() const noexcept {
 1061 return columns_.size();
 1062 }
1063
1076 TensorDataType output_dtype = TensorDataType::FLOAT32) {
1077 if (columns_.empty()) {
1079 "BatchTensorBuilder: no columns added"};
1080 }
1081
1082 // Determine row count from first column
1083 const int64_t rows = column_rows(columns_[0].tensor);
1084 if (rows <= 0) {
1086 "BatchTensorBuilder: first column has no rows"};
1087 }
1088
1089 // Validate all columns have the same row count
1090 for (size_t i = 1; i < columns_.size(); ++i) {
1091 const int64_t col_rows = column_rows(columns_[i].tensor);
1092 if (col_rows != rows) {
1094 "BatchTensorBuilder: column '"
1095 + columns_[i].name + "' has "
1096 + std::to_string(col_rows) + " rows, expected "
1097 + std::to_string(rows)};
1098 }
1099 }
1100
1101 // Compute total output columns
1102 int64_t total_cols = 0;
1103 for (const auto& entry : columns_) {
1104 total_cols += column_width(entry.tensor);
1105 }
1106
1107 // Allocate output tensor
1108 TensorShape out_shape{rows, total_cols};
1109 OwnedTensor output(out_shape, output_dtype);
1110 const size_t out_elem_size = tensor_element_size(output_dtype);
1111
1112 // Fill column by column
1113 int64_t col_offset = 0;
1114 for (const auto& entry : columns_) {
1115 const TensorView& src = entry.tensor;
1116 const int64_t width = column_width(src);
1117
1118 // Get or cast the source data to the output dtype
1119 // We need a contiguous float32 (or target type) source
1120 if (src.dtype() == output_dtype && src.is_contiguous()) {
1121 // Direct copy path
1122 copy_column_into(output, rows, total_cols, col_offset,
1123 width, src.data(), out_elem_size);
1124 } else {
1125 // Need to cast first
1126 auto cast_result = ColumnToTensor::cast(src, output_dtype);
1127 if (!cast_result) {
1128 return Error{cast_result.error().code,
1129 "BatchTensorBuilder: failed to cast column '"
1130 + entry.name + "': "
1131 + cast_result.error().message};
1132 }
1133 copy_column_into(output, rows, total_cols, col_offset,
1134 width, cast_result.value().data(),
1135 out_elem_size);
1136 }
1137
1138 col_offset += width;
1139 }
1140
1141 return output;
1142 }
1143
 1144private:
// A named, non-owning feature column awaiting assembly.
 1145 struct ColumnEntry {
 1146 std::string name;
 1147 TensorView tensor;
 1148 };
 1149
 1150 std::vector<ColumnEntry> columns_;
 1151
// Row count of a column: dims[0], with a rank-0 tensor counting as 1 row.
 1153 static int64_t column_rows(const TensorView& t) noexcept {
 1154 if (t.shape().ndim() == 0) return 1;
 1155 return t.shape().dims[0];
 1156 }
 1157
// Width (features per row) of a column: dims[1] for rank-2 tensors, 1 for
// rank 0/1 (each row is a single scalar feature).
 1160 static int64_t column_width(const TensorView& t) noexcept {
 1161 if (t.shape().ndim() <= 1) return 1;
 1162 return t.shape().dims[1];
 1163 }
1164
1171 static void copy_column_into(
1172 OwnedTensor& output,
1173 int64_t rows,
1174 int64_t total_cols,
1175 int64_t col_offset,
1176 int64_t width,
1177 const void* src_data,
1178 size_t elem_size) {
1179 auto* dst_base = static_cast<uint8_t*>(output.data());
1180 const auto* src_base = static_cast<const uint8_t*>(src_data);
1181
1182 const size_t row_byte_stride = static_cast<size_t>(total_cols) * elem_size;
1183 const size_t src_row_bytes = static_cast<size_t>(width) * elem_size;
1184 const size_t col_byte_offset = static_cast<size_t>(col_offset) * elem_size;
1185
1186 for (int64_t r = 0; r < rows; ++r) {
1187 const size_t dst_offset = static_cast<size_t>(r) * row_byte_stride
1188 + col_byte_offset;
1189 const size_t src_offset = static_cast<size_t>(r) * src_row_bytes;
1190 std::memcpy(dst_base + dst_offset,
1191 src_base + src_offset,
1192 src_row_bytes);
1193 }
1194 }
1195};
1196
1197} // namespace signet::forge
Builds a single contiguous 2D tensor from multiple column tensors, suitable for passing to an ML inference engine.
size_t num_features() const noexcept
Number of feature sources (columns) added.
BatchTensorBuilder()=default
Default constructor: creates an empty builder with no columns.
TensorShape expected_shape() const
Compute the expected output shape based on currently added columns.
BatchTensorBuilder & add_column(const std::string &name, const TensorView &tensor)
Add a column tensor as a feature source.
expected< OwnedTensor > build(TensorDataType output_dtype=TensorDataType::FLOAT32)
Build the final batch tensor.
Provides static methods to convert Parquet column data into tensor form.
static expected< OwnedTensor > cast(const TensorView &src, TensorDataType target_dtype)
Cast a tensor view to a different element type, producing an OwnedTensor.
static expected< TensorDataType > parquet_to_tensor_dtype(PhysicalType pt)
Map a Parquet physical type to the natural TensorDataType.
static expected< TensorView > wrap_column(const void *column_data, int64_t num_values, PhysicalType physical_type, int32_t type_length=-1)
Wrap a contiguous numeric Parquet column as a 1D TensorView.
static expected< OwnedTensor > copy_column(const void *column_data, int64_t num_values, PhysicalType physical_type, TensorDataType target_dtype, int32_t type_length=-1)
Read column data and produce an OwnedTensor with the requested type.
static expected< TensorView > wrap_vectors(const void *column_data, int64_t num_vectors, uint32_t dimension)
Wrap a contiguous FLOAT32_VECTOR column as a 2D TensorView.
An owning tensor that manages its own memory via a std::vector<uint8_t> buffer.
OwnedTensor(TensorShape shape, TensorDataType dtype)
Allocate an uninitialized tensor with the given shape and type.
OwnedTensor clone() const
Deep-copy this tensor.
const T * typed_data() const noexcept
Typed const pointer to the tensor buffer.
TensorView view()
Get a mutable non-owning view.
OwnedTensor(OwnedTensor &&) noexcept=default
TensorDataType dtype() const noexcept
The element data type.
OwnedTensor(const void *data, TensorShape shape, TensorDataType dtype)
Allocate and copy data into the tensor.
void * data() noexcept
Raw mutable pointer to the tensor buffer.
size_t byte_size() const noexcept
Total byte size of the tensor buffer.
const void * data() const noexcept
Raw const pointer to the tensor buffer.
bool is_valid() const noexcept
True if the tensor has been allocated (non-empty buffer).
const TensorShape & shape() const noexcept
The shape of this tensor.
T * typed_data() noexcept
Typed mutable pointer to the tensor buffer.
OwnedTensor()=default
Default constructor: creates an invalid (empty) tensor.
int64_t num_elements() const noexcept
Total number of elements.
TensorView view() const
Get a const non-owning view.
A lightweight, non-owning view into a contiguous block of typed memory, interpreted as a multi-dimensional tensor.
T & at(int64_t i)
Access a single element in a 1D tensor (mutable).
size_t effective_byte_stride() const noexcept
Effective stride in bytes along the first dimension.
const T & at(int64_t row, int64_t col) const
Access a single element in a 2D tensor by (row, col) (const).
bool is_valid() const noexcept
True if the view points to valid data.
const T * typed_data() const noexcept
Reinterpret the data pointer as a typed const pointer.
bool is_contiguous() const noexcept
True if the data is densely packed (no stride gaps).
int64_t num_elements() const noexcept
Total number of elements.
TensorView(const void *data, TensorShape shape, TensorDataType dtype, size_t byte_stride=0) noexcept
Construct a const view (stores as void* internally, constness enforced by the const overloads of data()).
TensorView(void *data, TensorShape shape, TensorDataType dtype, size_t byte_stride=0) noexcept
Construct a view over existing memory.
TensorView()=default
Default constructor: creates an invalid (null) view.
TensorView slice(int64_t start, int64_t count) const
Slice along the first dimension: returns a view over rows [start, start+count).
size_t byte_size() const noexcept
Total byte size of the tensor data (num_elements * element_size).
const T & at(int64_t i) const
Access a single element in a 1D tensor (const).
const TensorShape & shape() const noexcept
The shape of this tensor view.
T * typed_data() noexcept
Reinterpret the data pointer as a typed mutable pointer.
TensorDataType dtype() const noexcept
The element data type.
size_t element_size() const noexcept
Bytes per element.
const void * data() const noexcept
Raw const pointer to the underlying data buffer.
T & at(int64_t row, int64_t col)
Access a single element in a 2D tensor by (row, col) (mutable).
expected< TensorView > reshape(TensorShape new_shape) const
Reshape the view to a new shape with the same total number of elements.
void * data() noexcept
Raw mutable pointer to the underlying data buffer.
bool operator==(const AlignedAllocator< U, Alignment > &) const noexcept
void deallocate(T *ptr, std::size_t) noexcept
bool operator!=(const AlignedAllocator< U, Alignment > &) const noexcept
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:145
T * aligned_ptr(void *ptr) noexcept
bool is_pointer_aligned(const void *ptr) noexcept
T * aligned_ptr_at(void *base, std::size_t offset) noexcept
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
@ INT96
96-bit value (deprecated — legacy Impala timestamps).
@ FIXED_LEN_BYTE_ARRAY
Fixed-length byte array (UUID, vectors, decimals).
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BOOLEAN
1-bit boolean, bit-packed in pages.
@ BYTE_ARRAY
Variable-length byte sequence (strings, binary).
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
const char * tensor_dtype_name(TensorDataType dtype) noexcept
Returns a human-readable name for a TensorDataType.
@ UNSUPPORTED_TYPE
The file contains a Parquet physical or logical type that is not implemented.
@ SCHEMA_MISMATCH
The requested column name or type does not match the file schema.
@ INTERNAL_ERROR
An unexpected internal error that does not fit any other category.
TensorDataType
Element data type for tensor storage, mapping to ONNX/PyTorch/TF type enums.
@ FLOAT64
IEEE 754 double-precision (8 bytes)
@ INT64
Signed 64-bit integer.
@ INT16
Signed 16-bit integer.
@ INT32
Signed 32-bit integer.
@ FLOAT32
IEEE 754 single-precision (4 bytes)
@ FLOAT16
IEEE 754 half-precision (2 bytes)
@ UINT8
Unsigned 8-bit integer.
@ INT8
Signed 8-bit integer.
constexpr size_t tensor_element_size(TensorDataType dtype) noexcept
Returns the byte size of a single element of the given tensor data type.
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:101
ErrorCode code
The machine-readable error category.
Definition error.hpp:103
Describes the shape of a tensor as a vector of dimension sizes.
bool operator==(const TensorShape &other) const
Equality comparison (element-wise dimension match).
TensorShape()=default
Default constructor: scalar shape (empty dims).
int64_t num_elements() const noexcept
Total number of elements (product of all dimensions).
size_t ndim() const noexcept
Number of dimensions.
bool is_vector() const noexcept
True if this is a 1D vector.
bool is_scalar() const noexcept
True if this is a scalar (no dimensions, or a single dimension of 1).
bool operator!=(const TensorShape &other) const
Inequality comparison.
bool is_matrix() const noexcept
True if this is a 2D matrix.
TensorShape(std::vector< int64_t > d)
Construct from a vector of dimensions.
std::vector< int64_t > dims
Dimension sizes (e.g. {32, 768} for a 32x768 matrix)
TensorShape(std::initializer_list< int64_t > il)
Construct from an initializer list (e.g. {32, 768} for a 32x768 matrix).
Parquet format enumerations, type traits, and statistics structs.