Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
arrow_bridge.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
19
21#include "signet/types.hpp"
22#include "signet/error.hpp"
23
24#include <cstdint>
25#include <cstdlib>
26#include <cstring>
27#include <memory>
28#include <string>
29
35
36#ifndef SIGNET_ARROW_C_DATA_DEFINED
37#define SIGNET_ARROW_C_DATA_DEFINED
38
39extern "C" {
40
51 const char* format;
52 const char* name;
53 const char* metadata;
54 int64_t flags;
55 int64_t n_children;
58 void (*release)(ArrowSchema*);
60};
61
74struct ArrowArray {
75 int64_t length;
76 int64_t null_count;
77 int64_t offset;
78 int64_t n_buffers;
79 int64_t n_children;
80 const void** buffers;
83 void (*release)(ArrowArray*);
85};
86
87} // extern "C"
89
90#endif // SIGNET_ARROW_C_DATA_DEFINED
91
92namespace signet::forge {
93
96namespace detail {
97
104 std::string format_storage;
105 std::string name_storage;
106};
107
115 bool owns_data = false;
116 const void* buffer_ptrs[2] = {nullptr, nullptr};
117};
118
125inline void release_arrow_schema(ArrowSchema* schema) {
126 if (schema == nullptr) return;
127 if (schema->private_data != nullptr) {
128 delete static_cast<ArrowSchemaPrivate*>(schema->private_data);
129 schema->private_data = nullptr;
130 }
131 schema->format = nullptr;
132 schema->name = nullptr;
133 schema->release = nullptr;
134}
135
143inline void release_arrow_array(ArrowArray* array) {
144 if (array == nullptr) return;
145 if (array->private_data != nullptr) {
146 auto* ctx = static_cast<ArrowArrayPrivate*>(array->private_data);
147 if (ctx->owns_data && ctx->buffer_ptrs[1] != nullptr) {
148 std::free(const_cast<void*>(ctx->buffer_ptrs[1]));
149 }
150 delete ctx;
151 array->private_data = nullptr;
152 }
153 array->buffers = nullptr;
154 array->release = nullptr;
155}
156
157} // namespace detail
159
163
176inline const char* parquet_to_arrow_format(PhysicalType pt) {
177 switch (pt) {
178 case PhysicalType::BOOLEAN: return "b";
179 case PhysicalType::INT32: return "i";
180 case PhysicalType::INT64: return "l";
181 case PhysicalType::FLOAT: return "f";
182 case PhysicalType::DOUBLE: return "g";
183 default: return nullptr;
184 }
185}
186
194 switch (dtype) {
195 case TensorDataType::FLOAT32: return "f";
196 case TensorDataType::FLOAT64: return "g";
197 case TensorDataType::INT32: return "i";
198 case TensorDataType::INT64: return "l";
199 case TensorDataType::INT8: return "c";
200 case TensorDataType::UINT8: return "C";
201 case TensorDataType::INT16: return "s";
202 case TensorDataType::FLOAT16: return "e";
203 case TensorDataType::BOOL: return "b";
204 }
205 return nullptr; // unreachable
206}
207
219 if (format == nullptr || format[0] == '\0') {
221 "null or empty Arrow format string"};
222 }
223
224 // Only single-character format codes are handled for primitive types.
225 if (format[1] != '\0') {
227 std::string("unsupported Arrow format: ") + format};
228 }
229
230 switch (format[0]) {
231 case 'f': return TensorDataType::FLOAT32;
232 case 'g': return TensorDataType::FLOAT64;
233 case 'i': return TensorDataType::INT32;
234 case 'l': return TensorDataType::INT64;
235 case 'c': return TensorDataType::INT8;
236 case 'C': return TensorDataType::UINT8;
237 case 's': return TensorDataType::INT16;
238 case 'e': return TensorDataType::FLOAT16;
239 case 'b': return TensorDataType::BOOL;
240 default:
242 std::string("unsupported Arrow format character: '")
243 + format[0] + "'"};
244 }
245}
246
253 switch (pt) {
259 default:
261 "PhysicalType has no TensorDataType mapping"};
262 }
263}
264
273 switch (pt) {
274 case PhysicalType::BOOLEAN: return 1; // Arrow uses 1 bit, but data buffer is byte-aligned
275 case PhysicalType::INT32: return 4;
276 case PhysicalType::INT64: return 8;
277 case PhysicalType::FLOAT: return 4;
278 case PhysicalType::DOUBLE: return 8;
279 default: return 0;
280 }
281}
282
284
308public:
326 const TensorView& tensor,
327 const std::string& name,
328 ArrowArray* out_array,
329 ArrowSchema* out_schema)
330 {
331 // CWE-457: Use of Uninitialized Variable — zero-init prevents double-free on partial init failure
332 // M27: Zero-initialize outputs so callers see release=nullptr on early error
333 std::memset(out_schema, 0, sizeof(ArrowSchema));
334 std::memset(out_array, 0, sizeof(ArrowArray));
335
336 if (!tensor.is_valid()) {
338 "cannot export invalid tensor to Arrow"};
339 }
340
341 if (!tensor.is_contiguous()) {
343 "non-contiguous tensor cannot be exported to Arrow"};
344 }
345
346 const char* fmt = tensor_dtype_to_arrow_format(tensor.dtype());
347 if (fmt == nullptr) {
349 "tensor dtype has no Arrow format mapping"};
350 }
351
352 // -- Fill schema (RAII: unique_ptr until setup complete) --
353 auto schema_owner = std::make_unique<detail::ArrowSchemaPrivate>();
354 schema_owner->format_storage = fmt;
355 schema_owner->name_storage = name;
356
357 out_schema->format = schema_owner->format_storage.c_str();
358 out_schema->name = schema_owner->name_storage.c_str();
359 out_schema->metadata = nullptr;
360 out_schema->flags = 0; // non-nullable
361 out_schema->n_children = 0;
362 out_schema->children = nullptr;
363 out_schema->dictionary = nullptr;
365 out_schema->private_data = schema_owner.release(); // transfer ownership
366
367 // -- Fill array (RAII: unique_ptr until setup complete) --
368 auto array_owner = std::make_unique<detail::ArrowArrayPrivate>();
369 array_owner->owns_data = false; // zero-copy
370 array_owner->buffer_ptrs[0] = nullptr; // no validity bitmap (non-nullable)
371 array_owner->buffer_ptrs[1] = tensor.data();
372
373 const int64_t num_elements = static_cast<int64_t>(tensor.num_elements());
374
375 out_array->length = num_elements;
376 out_array->null_count = 0;
377 out_array->offset = 0;
378 out_array->n_buffers = 2;
379 out_array->n_children = 0;
380 out_array->buffers = array_owner->buffer_ptrs;
381 out_array->children = nullptr;
382 out_array->dictionary = nullptr;
384 out_array->private_data = array_owner.release(); // transfer ownership
385
386 return expected<void>{};
387 }
388
407 const void* data,
408 int64_t num_values,
409 PhysicalType physical_type,
410 const std::string& name,
411 ArrowArray* out_array,
412 ArrowSchema* out_schema)
413 {
414 // CWE-457: Use of Uninitialized Variable — zero-init prevents double-free on partial init failure
415 // M27: Zero-initialize outputs so callers see release=nullptr on early error
416 std::memset(out_schema, 0, sizeof(ArrowSchema));
417 std::memset(out_array, 0, sizeof(ArrowArray));
418
419 if (data == nullptr || num_values <= 0) {
421 "cannot export null/empty column to Arrow"};
422 }
423
424 const char* fmt = parquet_to_arrow_format(physical_type);
425 if (fmt == nullptr) {
427 "PhysicalType has no direct Arrow format mapping"};
428 }
429
430 // -- Fill schema (RAII: unique_ptr until setup complete) --
431 auto schema_owner = std::make_unique<detail::ArrowSchemaPrivate>();
432 schema_owner->format_storage = fmt;
433 schema_owner->name_storage = name;
434
435 out_schema->format = schema_owner->format_storage.c_str();
436 out_schema->name = schema_owner->name_storage.c_str();
437 out_schema->metadata = nullptr;
438 out_schema->flags = 0;
439 out_schema->n_children = 0;
440 out_schema->children = nullptr;
441 out_schema->dictionary = nullptr;
443 out_schema->private_data = schema_owner.release(); // transfer ownership
444
445 // -- Fill array (RAII: unique_ptr until setup complete) --
446 auto array_owner = std::make_unique<detail::ArrowArrayPrivate>();
447 array_owner->owns_data = false;
448 array_owner->buffer_ptrs[0] = nullptr; // no validity bitmap
449 array_owner->buffer_ptrs[1] = data;
450
451 out_array->length = num_values;
452 out_array->null_count = 0;
453 out_array->offset = 0;
454 out_array->n_buffers = 2;
455 out_array->n_children = 0;
456 out_array->buffers = array_owner->buffer_ptrs;
457 out_array->children = nullptr;
458 out_array->dictionary = nullptr;
460 out_array->private_data = array_owner.release(); // transfer ownership
461
462 return expected<void>{};
463 }
464};
465
483public:
509 const ArrowArray* array,
510 const ArrowSchema* schema)
511 {
512 if (array == nullptr || schema == nullptr) {
514 "null ArrowArray or ArrowSchema pointer"};
515 }
516
517 if (array->release == nullptr) {
519 "ArrowArray has already been released"};
520 }
521
522 if (array->null_count != 0) {
524 "ArrowArray with nulls cannot be imported as a dense tensor"};
525 }
526
527 auto dtype_result = arrow_format_to_tensor_dtype(schema->format);
528 if (!dtype_result) {
529 return dtype_result.error();
530 }
531
532 TensorDataType dtype = *dtype_result;
533 const int64_t length = array->length;
534
535 if (length <= 0) {
537 "ArrowArray has zero or negative length"};
538 }
539
540 // Reject unreasonably large arrays to prevent OOB reads in copy paths.
541 // Arrow C Data Interface doesn't carry buffer sizes, so this is a
542 // defense-in-depth cap. 1 billion elements ≈ 8 GB for float64.
543 static constexpr int64_t MAX_IMPORT_ELEMENTS = 1'000'000'000;
544 static constexpr int64_t MAX_ARROW_OFFSET = 1'000'000'000LL;
545 static constexpr int64_t MAX_ARROW_LENGTH = 1'000'000'000LL;
546
547 if (array->offset > MAX_ARROW_OFFSET) {
548 return Error{ErrorCode::INVALID_ARGUMENT, "ArrowArray offset exceeds 1B cap (CWE-190)"};
549 }
550 if (array->length > MAX_ARROW_LENGTH) {
551 return Error{ErrorCode::INVALID_ARGUMENT, "ArrowArray length exceeds 1B cap (CWE-190)"};
552 }
553
554 if (length > MAX_IMPORT_ELEMENTS) {
556 "ArrowArray length exceeds import limit (1 billion elements)"};
557 }
558
559 if (array->offset < 0) {
561 "ArrowArray has negative offset"};
562 }
563
564 if (array->n_buffers < 2 || array->buffers == nullptr) {
566 "ArrowArray does not have expected buffer layout"};
567 }
568
569 const void* data_buf = array->buffers[1];
570 if (data_buf == nullptr) {
572 "ArrowArray data buffer (buffers[1]) is null"};
573 }
574
575 if (static_cast<int64_t>(length) > INT64_MAX - array->offset) {
577 "ArrowArray: offset + length overflows int64"};
578 }
579
580 // Account for offset: data starts at offset * element_size bytes
581 const size_t elem_size = tensor_element_size(dtype);
582 const size_t offset_val = static_cast<size_t>(array->offset);
583 if (offset_val > static_cast<size_t>(MAX_IMPORT_ELEMENTS)) {
585 "ArrowArray offset exceeds import limit"};
586 }
587
588 // Validate (offset + length) * elem_size doesn't overflow size_t.
589 // The Arrow C Data Interface doesn't carry buffer sizes, so we
590 // cannot verify the buffer is large enough. This overflow guard
591 // prevents wild pointer arithmetic from crafted metadata.
592 const size_t total_elems = offset_val + static_cast<size_t>(length);
593 if (total_elems > static_cast<size_t>(MAX_IMPORT_ELEMENTS)) {
595 "ArrowArray: offset + length exceeds import limit"};
596 }
597 if (elem_size > 0 && total_elems > SIZE_MAX / elem_size) {
599 "ArrowArray: (offset + length) * elem_size overflows size_t"};
600 }
601 const uint8_t* base = static_cast<const uint8_t*>(data_buf)
602 + offset_val * elem_size;
603
604 TensorShape shape;
605 shape.dims = {length};
606
607 return TensorView(const_cast<void*>(static_cast<const void*>(base)),
608 shape, dtype);
609 }
610
626 const ArrowArray* array,
627 const ArrowSchema* schema)
628 {
629 auto view_result = import_array(array, schema);
630 if (!view_result) {
631 return view_result.error();
632 }
633
634 // Create an owned copy from the view
635 const auto& v = *view_result;
636 return OwnedTensor(v.data(), v.shape(), v.dtype());
637 }
638};
639
640} // namespace signet::forge
Exports Signet Forge tensors and columns as Arrow C Data Interface structs.
static expected< void > export_column(const void *data, int64_t num_values, PhysicalType physical_type, const std::string &name, ArrowArray *out_array, ArrowSchema *out_schema)
Export a 1D column of a primitive Parquet type as ArrowArray + ArrowSchema (zero-copy).
static expected< void > export_tensor(const TensorView &tensor, const std::string &name, ArrowArray *out_array, ArrowSchema *out_schema)
Export a TensorView as an ArrowArray + ArrowSchema pair (zero-copy).
Imports Arrow C Data Interface arrays into Signet TensorView or OwnedTensor.
static expected< TensorView > import_array(const ArrowArray *array, const ArrowSchema *schema)
Import an ArrowArray as a TensorView (zero-copy).
static expected< OwnedTensor > import_array_copy(const ArrowArray *array, const ArrowSchema *schema)
Import an ArrowArray as an OwnedTensor (deep copy).
An owning tensor that manages its own memory via a std::vector<uint8_t> buffer.
A lightweight, non-owning view into a contiguous block of typed memory, interpreted as a multi-dimens...
bool is_valid() const noexcept
True if the view points to valid data.
bool is_contiguous() const noexcept
True if the data is densely packed (no stride gaps).
int64_t num_elements() const noexcept
Total number of elements.
TensorDataType dtype() const noexcept
The element data type.
void * data() noexcept
Raw mutable pointer to the underlying data buffer.
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:145
void release_arrow_array(ArrowArray *array)
Release callback for ArrowArray.
void release_arrow_schema(ArrowSchema *schema)
Release callback for ArrowSchema.
const char * tensor_dtype_to_arrow_format(TensorDataType dtype)
Map a TensorDataType to an Arrow format string.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
Definition types.hpp:20
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BOOLEAN
1-bit boolean, bit-packed in pages.
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
size_t physical_type_byte_size(PhysicalType pt)
Return the byte size for a PhysicalType (primitive types only).
expected< TensorDataType > arrow_format_to_tensor_dtype(const char *format)
Map an Arrow format string to a TensorDataType.
const char * parquet_to_arrow_format(PhysicalType pt)
Map a Parquet PhysicalType to an Arrow format string.
@ IO_ERROR
A file-system or stream I/O operation failed (open, read, write, rename).
@ UNSUPPORTED_TYPE
The file contains a Parquet physical or logical type that is not implemented.
@ INTERNAL_ERROR
An unexpected internal error that does not fit any other category.
@ INVALID_ARGUMENT
A caller-supplied argument is outside the valid range or violates a precondition.
expected< TensorDataType > physical_to_tensor_dtype(PhysicalType pt)
Map a PhysicalType to a TensorDataType (for column export).
TensorDataType
Element data type for tensor storage, mapping to ONNX/PyTorch/TF type enums.
@ FLOAT64
IEEE 754 double-precision (8 bytes)
@ INT64
Signed 64-bit integer.
@ INT16
Signed 16-bit integer.
@ INT32
Signed 32-bit integer.
@ FLOAT32
IEEE 754 single-precision (4 bytes)
@ FLOAT16
IEEE 754 half-precision (2 bytes)
@ UINT8
Unsigned 8-bit integer.
@ INT8
Signed 8-bit integer.
constexpr size_t tensor_element_size(TensorDataType dtype) noexcept
Returns the byte size of a single element of the given tensor data type.
Data payload for a single Arrow array.
void * private_data
Opaque data for the release callback.
int64_t n_children
Number of child arrays (0 for primitives)
void(* release)(ArrowArray *)
Release callback (null = already released)
int64_t null_count
Number of null elements (0 if non-nullable)
ArrowArray * dictionary
Dictionary array (null if not dict-encoded)
int64_t offset
Logical offset into buffers.
const void ** buffers
Buffer pointers (buffers[0]=validity, buffers[1]=data)
ArrowArray ** children
Child array pointers (null if n_children == 0)
int64_t n_buffers
Number of buffers (typically 2 for primitives)
int64_t length
Number of logical elements.
Schema description for a single Arrow array/column.
ArrowSchema ** children
Child schema pointers (null if n_children == 0)
int64_t flags
Bitfield: 1 = dictionary-ordered, 2 = nullable, 4 = map keys sorted.
const char * metadata
Arrow key-value metadata (may be null)
int64_t n_children
Number of child schemas (0 for primitives)
ArrowSchema * dictionary
Dictionary schema (null if not dict-encoded)
const char * name
Column/field name (may be null)
void(* release)(ArrowSchema *)
Release callback (null = already released)
void * private_data
Opaque data for the release callback.
const char * format
Arrow format string (e.g. "f" = float32)
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:101
Describes the shape of a tensor as a vector of dimension sizes.
std::vector< int64_t > dims
Dimension sizes (e.g. {32, 768} for a 32x768 matrix)
Heap-allocated context attached to ArrowArray.private_data.
const void * buffer_ptrs[2]
[0]=validity, [1]=data
bool owns_data
If true, free buffers[1] on release.
Heap-allocated context attached to ArrowSchema.private_data.
std::string format_storage
Backing storage for ArrowSchema.format.
std::string name_storage
Backing storage for ArrowSchema.name.
Zero-copy tensor bridge: maps Parquet column data directly into ML-framework-compatible tensor views ...
Parquet format enumerations, type traits, and statistics structs.