36#ifndef SIGNET_ARROW_C_DATA_DEFINED
37#define SIGNET_ARROW_C_DATA_DEFINED
126 if (schema ==
nullptr)
return;
132 schema->
name =
nullptr;
144 if (array ==
nullptr)
return;
147 if (ctx->owns_data && ctx->buffer_ptrs[1] !=
nullptr) {
148 std::free(
const_cast<void*
>(ctx->buffer_ptrs[1]));
183 default:
return nullptr;
219 if (format ==
nullptr || format[0] ==
'\0') {
221 "null or empty Arrow format string"};
225 if (format[1] !=
'\0') {
227 std::string(
"unsupported Arrow format: ") + format};
242 std::string(
"unsupported Arrow format character: '")
261 "PhysicalType has no TensorDataType mapping"};
327 const std::string& name,
334 std::memset(out_array, 0,
sizeof(
ArrowArray));
338 "cannot export invalid tensor to Arrow"};
343 "non-contiguous tensor cannot be exported to Arrow"};
347 if (fmt ==
nullptr) {
349 "tensor dtype has no Arrow format mapping"};
353 auto schema_owner = std::make_unique<detail::ArrowSchemaPrivate>();
354 schema_owner->format_storage = fmt;
355 schema_owner->name_storage = name;
357 out_schema->
format = schema_owner->format_storage.c_str();
358 out_schema->
name = schema_owner->name_storage.c_str();
360 out_schema->
flags = 0;
368 auto array_owner = std::make_unique<detail::ArrowArrayPrivate>();
369 array_owner->owns_data =
false;
370 array_owner->buffer_ptrs[0] =
nullptr;
371 array_owner->buffer_ptrs[1] = tensor.
data();
373 const int64_t num_elements =
static_cast<int64_t
>(tensor.
num_elements());
375 out_array->
length = num_elements;
380 out_array->
buffers = array_owner->buffer_ptrs;
410 const std::string& name,
417 std::memset(out_array, 0,
sizeof(
ArrowArray));
419 if (data ==
nullptr || num_values <= 0) {
421 "cannot export null/empty column to Arrow"};
425 if (fmt ==
nullptr) {
427 "PhysicalType has no direct Arrow format mapping"};
431 auto schema_owner = std::make_unique<detail::ArrowSchemaPrivate>();
432 schema_owner->format_storage = fmt;
433 schema_owner->name_storage = name;
435 out_schema->
format = schema_owner->format_storage.c_str();
436 out_schema->
name = schema_owner->name_storage.c_str();
438 out_schema->
flags = 0;
446 auto array_owner = std::make_unique<detail::ArrowArrayPrivate>();
447 array_owner->owns_data =
false;
448 array_owner->buffer_ptrs[0] =
nullptr;
449 array_owner->buffer_ptrs[1] = data;
451 out_array->
length = num_values;
456 out_array->
buffers = array_owner->buffer_ptrs;
512 if (array ==
nullptr || schema ==
nullptr) {
514 "null ArrowArray or ArrowSchema pointer"};
517 if (array->
release ==
nullptr) {
519 "ArrowArray has already been released"};
524 "ArrowArray with nulls cannot be imported as a dense tensor"};
529 return dtype_result.error();
533 const int64_t length = array->
length;
537 "ArrowArray has zero or negative length"};
543 static constexpr int64_t MAX_IMPORT_ELEMENTS = 1'000'000'000;
544 static constexpr int64_t MAX_ARROW_OFFSET = 1'000'000'000LL;
545 static constexpr int64_t MAX_ARROW_LENGTH = 1'000'000'000LL;
547 if (array->
offset > MAX_ARROW_OFFSET) {
550 if (array->
length > MAX_ARROW_LENGTH) {
554 if (length > MAX_IMPORT_ELEMENTS) {
556 "ArrowArray length exceeds import limit (1 billion elements)"};
561 "ArrowArray has negative offset"};
566 "ArrowArray does not have expected buffer layout"};
569 const void* data_buf = array->
buffers[1];
570 if (data_buf ==
nullptr) {
572 "ArrowArray data buffer (buffers[1]) is null"};
575 if (
static_cast<int64_t
>(length) > INT64_MAX - array->
offset) {
577 "ArrowArray: offset + length overflows int64"};
582 const size_t offset_val =
static_cast<size_t>(array->
offset);
583 if (offset_val >
static_cast<size_t>(MAX_IMPORT_ELEMENTS)) {
585 "ArrowArray offset exceeds import limit"};
592 const size_t total_elems = offset_val +
static_cast<size_t>(length);
593 if (total_elems >
static_cast<size_t>(MAX_IMPORT_ELEMENTS)) {
595 "ArrowArray: offset + length exceeds import limit"};
597 if (elem_size > 0 && total_elems > SIZE_MAX / elem_size) {
599 "ArrowArray: (offset + length) * elem_size overflows size_t"};
601 const uint8_t* base =
static_cast<const uint8_t*
>(data_buf)
602 + offset_val * elem_size;
605 shape.
dims = {length};
607 return TensorView(
const_cast<void*
>(
static_cast<const void*
>(base)),
631 return view_result.error();
635 const auto& v = *view_result;
636 return OwnedTensor(v.data(), v.shape(), v.dtype());
Exports Signet Forge tensors and columns as Arrow C Data Interface structs.
static expected< void > export_column(const void *data, int64_t num_values, PhysicalType physical_type, const std::string &name, ArrowArray *out_array, ArrowSchema *out_schema)
Export a 1D column of a primitive Parquet type as ArrowArray + ArrowSchema (zero-copy).
static expected< void > export_tensor(const TensorView &tensor, const std::string &name, ArrowArray *out_array, ArrowSchema *out_schema)
Export a TensorView as an ArrowArray + ArrowSchema pair (zero-copy).
Imports Arrow C Data Interface arrays into Signet TensorView or OwnedTensor.
static expected< TensorView > import_array(const ArrowArray *array, const ArrowSchema *schema)
Import an ArrowArray as a TensorView (zero-copy).
static expected< OwnedTensor > import_array_copy(const ArrowArray *array, const ArrowSchema *schema)
Import an ArrowArray as an OwnedTensor (deep copy).
An owning tensor that manages its own memory via a std::vector<uint8_t> buffer.
A lightweight, non-owning view into a contiguous block of typed memory, interpreted as a multi-dimensional tensor.
bool is_valid() const noexcept
True if the view points to valid data.
bool is_contiguous() const noexcept
True if the data is densely packed (no stride gaps).
int64_t num_elements() const noexcept
Total number of elements.
TensorDataType dtype() const noexcept
The element data type.
void * data() noexcept
Raw mutable pointer to the underlying data buffer.
A lightweight result type that holds either a success value of type T or an Error.
void release_arrow_array(ArrowArray *array)
Release callback for ArrowArray.
void release_arrow_schema(ArrowSchema *schema)
Release callback for ArrowSchema.
const char * tensor_dtype_to_arrow_format(TensorDataType dtype)
Map a TensorDataType to an Arrow format string.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
@ INT64
64-bit signed integer (little-endian).
@ INT32
32-bit signed integer (little-endian).
@ BOOLEAN
1-bit boolean, bit-packed in pages.
@ FLOAT
IEEE 754 single-precision float.
@ DOUBLE
IEEE 754 double-precision float.
size_t physical_type_byte_size(PhysicalType pt)
Return the byte size for a PhysicalType (primitive types only).
expected< TensorDataType > arrow_format_to_tensor_dtype(const char *format)
Map an Arrow format string to a TensorDataType.
const char * parquet_to_arrow_format(PhysicalType pt)
Map a Parquet PhysicalType to an Arrow format string.
@ IO_ERROR
A file-system or stream I/O operation failed (open, read, write, rename).
@ UNSUPPORTED_TYPE
The file contains a Parquet physical or logical type that is not implemented.
@ INTERNAL_ERROR
An unexpected internal error that does not fit any other category.
@ INVALID_ARGUMENT
A caller-supplied argument is outside the valid range or violates a precondition.
expected< TensorDataType > physical_to_tensor_dtype(PhysicalType pt)
Map a PhysicalType to a TensorDataType (for column export).
TensorDataType
Element data type for tensor storage, mapping to ONNX/PyTorch/TF type enums.
@ FLOAT64
IEEE 754 double-precision (8 bytes)
@ INT64
Signed 64-bit integer.
@ INT16
Signed 16-bit integer.
@ INT32
Signed 32-bit integer.
@ FLOAT32
IEEE 754 single-precision (4 bytes)
@ FLOAT16
IEEE 754 half-precision (2 bytes)
@ UINT8
Unsigned 8-bit integer.
@ INT8
Signed 8-bit integer.
constexpr size_t tensor_element_size(TensorDataType dtype) noexcept
Returns the byte size of a single element of the given tensor data type.
Data payload for a single Arrow array.
void * private_data
Opaque data for the release callback.
int64_t n_children
Number of child arrays (0 for primitives)
void(* release)(ArrowArray *)
Release callback (null = already released)
int64_t null_count
Number of null elements (0 if non-nullable; the Arrow C Data Interface also permits -1 meaning "not yet computed")
ArrowArray * dictionary
Dictionary array (null if not dict-encoded)
int64_t offset
Logical offset into buffers.
const void ** buffers
Buffer pointers (buffers[0]=validity, buffers[1]=data)
ArrowArray ** children
Child array pointers (null if n_children == 0)
int64_t n_buffers
Number of buffers (typically 2 for primitives)
int64_t length
Number of logical elements.
Schema description for a single Arrow array/column.
ArrowSchema ** children
Child schema pointers (null if n_children == 0)
int64_t flags
Bitfield of Arrow flags: 1 = dictionary-ordered, 2 = nullable, 4 = map keys sorted.
const char * metadata
Arrow key-value metadata (may be null)
int64_t n_children
Number of child schemas (0 for primitives)
ArrowSchema * dictionary
Dictionary schema (null if not dict-encoded)
const char * name
Column/field name (may be null)
void(* release)(ArrowSchema *)
Release callback (null = already released)
void * private_data
Opaque data for the release callback.
const char * format
Arrow format string (e.g. "f" = float32)
Lightweight error value carrying an ErrorCode and a human-readable message.
Describes the shape of a tensor as a vector of dimension sizes.
std::vector< int64_t > dims
Dimension sizes (e.g. {32, 768} for a 32x768 matrix)
Heap-allocated context attached to ArrowArray.private_data.
const void * buffer_ptrs[2]
[0]=validity, [1]=data
bool owns_data
If true, free buffers[1] on release.
Heap-allocated context attached to ArrowSchema.private_data.
std::string format_storage
Backing storage for ArrowSchema.format.
std::string name_storage
Backing storage for ArrowSchema.name.
Zero-copy tensor bridge: maps Parquet column data directly into ML-framework-compatible tensor views ...
Parquet format enumerations, type traits, and statistics structs.