33 #include <immintrin.h>
34#elif defined(__SSE4_2__) || defined(__SSE2__)
35 #include <immintrin.h>
36#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
/// Convert a 16-bit IEEE 754 half-precision value to a 32-bit float.
/// Handles signed zero, subnormals, Inf and NaN explicitly.
/// @param h  raw binary16 bits
/// @return   the exactly-representable float value (binary16 -> binary32 is lossless)
inline float f16_to_f32(uint16_t h) noexcept {
  const uint32_t sign = static_cast<uint32_t>(h >> 15) & 0x1u;
  const uint32_t exponent = static_cast<uint32_t>(h >> 10) & 0x1Fu;
  const uint32_t mantissa = static_cast<uint32_t>(h) & 0x3FFu;

  uint32_t f32_bits = 0;
  if (exponent == 0) {
    if (mantissa == 0) {
      // Signed zero: only the sign bit survives.
      f32_bits = sign << 31;
    } else {
      // Subnormal half (value = mantissa * 2^-24): normalize into 1.xxx form.
      uint32_t m = mantissa;
      int32_t e = -1;
      while ((m & 0x400u) == 0) {
        m <<= 1;
        --e;
      }
      m &= 0x3FFu;  // drop the now-implicit leading bit
      // Re-bias: binary32 exponent = 127 - 14 + e + 1 (e counts the shifts).
      uint32_t f32_exp = static_cast<uint32_t>(127 - 14 + e + 1);
      f32_bits = (sign << 31) | (f32_exp << 23) | (m << 13);
    }
  } else if (exponent == 0x1Fu) {
    // Inf / NaN: all-ones exponent; widen the mantissa into the float field
    // (a nonzero half mantissa stays nonzero, so NaN-ness is preserved).
    f32_bits = (sign << 31) | (0xFFu << 23) | (mantissa << 13);
  } else {
    // Normal number: re-bias exponent from 15 to 127, widen mantissa 10 -> 23 bits.
    uint32_t f32_exp = exponent - 15u + 127u;
    f32_bits = (sign << 31) | (f32_exp << 23) | (mantissa << 13);
  }

  float result;
  std::memcpy(&result, &f32_bits, sizeof(result));  // bit-exact, no aliasing UB
  return result;
}
/// Convert a 32-bit float to a 16-bit IEEE 754 half-precision value using
/// round-to-nearest-even.
/// Fixes vs. the listed fragment: a NaN whose top 10 mantissa bits are zero no
/// longer collapses to Inf, and the subnormal shift is computed in [14, 23] so
/// `1u << shift` can never be an undefined 32-bit shift.
/// @param val  input float
/// @return     raw binary16 bits (overflow -> Inf; |val| < 2^-25 -> signed zero)
inline uint16_t f32_to_f16(float val) noexcept {
  uint32_t f32_bits;
  std::memcpy(&f32_bits, &val, sizeof(f32_bits));

  const uint32_t sign = (f32_bits >> 31) & 0x1u;
  const uint32_t exponent = (f32_bits >> 23) & 0xFFu;
  const uint32_t mantissa = f32_bits & 0x7FFFFFu;

  uint16_t h_sign = static_cast<uint16_t>(sign << 15);

  if (exponent == 0xFF) {
    if (mantissa == 0) {
      // Infinity.
      return h_sign | 0x7C00u;
    }
    // NaN: keep the top payload bits, but force at least one mantissa bit so
    // the result cannot degrade to Inf when the payload truncates to zero.
    uint16_t payload = static_cast<uint16_t>(mantissa >> 13);
    if (payload == 0) payload = 1;
    return h_sign | 0x7C00u | payload;
  }

  int32_t unbiased_exp = static_cast<int32_t>(exponent) - 127;

  if (unbiased_exp > 15) {
    // Too large for binary16: overflow to infinity.
    return h_sign | 0x7C00u;
  }
  if (unbiased_exp < -24) {
    // Below the smallest subnormal: flush to signed zero.
    return h_sign;
  }
  if (unbiased_exp < -14) {
    // Subnormal half: shift the 24-bit significand (implicit bit restored)
    // down so the result is the raw subnormal mantissa. shift in [14, 23].
    uint32_t full_mantissa = mantissa | 0x800000u;
    int32_t shift = -1 - unbiased_exp;
    uint32_t rounded = full_mantissa >> shift;
    uint32_t remainder = full_mantissa & ((1u << shift) - 1u);
    uint32_t midpoint = 1u << (shift - 1);
    if (remainder > midpoint || (remainder == midpoint && (rounded & 1u))) {
      // Round to nearest, ties to even. A carry out of the 10-bit mantissa
      // lands on the smallest normal (2^-14), which is still the correct bits.
      ++rounded;
    }
    return h_sign | static_cast<uint16_t>(rounded);
  }

  // Normal half: re-bias exponent 127 -> 15, truncate mantissa 23 -> 10 bits
  // with round-to-nearest-even on the 13 discarded bits.
  uint16_t h_exp = static_cast<uint16_t>((unbiased_exp + 15) << 10);
  uint16_t h_man = static_cast<uint16_t>(mantissa >> 13);
  uint32_t remainder = mantissa & 0x1FFFu;
  if (remainder > 0x1000u || (remainder == 0x1000u && (h_man & 1u))) {
    ++h_man;
    if (h_man > 0x3FFu) {
      // Mantissa carry: bump the exponent; may round up to infinity.
      h_man = 0;
      h_exp = static_cast<uint16_t>(h_exp + (1u << 10));
      if (h_exp >= 0x7C00u) {
        return h_sign | 0x7C00u;
      }
    }
  }
  return h_sign | h_exp | h_man;
}
// VectorWriter::add — append a single vector (dimension floats) to the page buffer.
// NOTE(review): source-listing fragment. The dispatch on spec_.element_type and
// the definitions of `bpv` (bytes per vector) and `dim` (dimension) are not
// visible in this view — confirm against the original header.
228 inline void add(
const float* data) {
// FLOAT32 path (presumably): the input floats are appended as raw bytes in
// host (little-endian) layout, bpv = dim * 4.
234 const auto* raw =
reinterpret_cast<const uint8_t*
>(data);
235 buf_.insert(buf_.end(), raw, raw + bpv);
// FLOAT64 path (presumably): widen each float to double and memcpy it into
// the freshly grown tail of the buffer, 8 bytes per element.
239 size_t offset = buf_.size();
240 buf_.resize(offset + bpv);
242 for (
size_t i = 0; i < dim; ++i) {
243 double d =
static_cast<double>(data[i]);
244 std::memcpy(buf_.data() + offset + i *
sizeof(
double), &d,
sizeof(d));
// FLOAT16 path (presumably): each element is stored as 2 bytes; the
// conversion producing `h` (likely f32_to_f16(data[i])) is not visible here.
249 size_t offset = buf_.size();
250 buf_.resize(offset + bpv);
252 for (
size_t i = 0; i < dim; ++i) {
254 std::memcpy(buf_.data() + offset + i * 2, &h,
sizeof(h));
// Flush the buffered vectors and return the encoded page bytes by moving the
// internal buffer out. NOTE(review): fragment — the lines that clear/reset
// buf_ and num_vectors_ and the `return out;` are not visible in this view.
281 [[nodiscard]]
inline std::vector<uint8_t>
flush() {
282 std::vector<uint8_t> out = std::move(buf_);
289 [[nodiscard]]
size_t num_vectors() const noexcept {
return num_vectors_; }
301 const std::string& name,
313 std::vector<uint8_t> buf_;
314 size_t num_vectors_ = 0;
// Decode a PLAIN-encoded page of fixed-size vectors into float32 vectors.
// NOTE(review): source-listing fragment. The switch/case selecting between the
// FLOAT32 / FLOAT64 / FLOAT16 branches, the declarations of `bpv`/`dim`/`d`/`h`,
// and the final `return result;` are not visible in this view.
342 [[nodiscard]]
inline std::vector<std::vector<float>>
343 read_page(
const uint8_t* data,
size_t data_size)
const {
// Guard against a zero-byte vector spec before the division below.
347 if (bpv == 0)
return {};
// Number of complete vectors contained in the page (trailing partial bytes ignored).
349 const size_t count = data_size / bpv;
350 std::vector<std::vector<float>> result;
351 result.reserve(count);
353 for (
size_t i = 0; i < count; ++i) {
354 const uint8_t* src = data + i * bpv;
355 std::vector<float> vec(dim);
// FLOAT32 (presumably): one bulk copy of dim floats.
359 std::memcpy(vec.data(), src, dim *
sizeof(
float));
// FLOAT64 (presumably): read each 8-byte double and narrow to float.
364 for (
size_t j = 0; j < dim; ++j) {
366 std::memcpy(&d, src + j *
sizeof(
double),
sizeof(d));
367 vec[j] =
static_cast<float>(d);
// FLOAT16 (presumably): read each 2-byte half and widen, likely via
// f16_to_f32 — the conversion call itself is not visible here.
373 for (
size_t j = 0; j < dim; ++j) {
375 std::memcpy(&h, src + j * 2,
sizeof(h));
381 result.push_back(std::move(vec));
// NOTE(review): fragment of read_page_zero_copy (returns expected<ZeroCopyResult>
// per the declaration index) — only the validation error strings and the final
// pointer cast are visible; the error-construction/return plumbing is not.
"zero-copy read requires FLOAT32 element type"};
// Reject misaligned pages: the reinterpret_cast to const float* below requires
// at least alignof(float) alignment of the page bytes.
411 if (
reinterpret_cast<uintptr_t
>(data) %
alignof(
float) != 0) {
"page data is not aligned for float access"};
419 "vector dimension is zero"};
// The page must hold a whole number of vectors.
421 if (data_size % bpv != 0) {
423 "page size is not a multiple of bytes_per_vector"};
// All checks passed: hand out a non-owning view directly into the page bytes.
427 result.
data =
reinterpret_cast<const float*
>(data);
// Read a single vector at `index` from a page, decoding per element type.
// NOTE(review): source-listing fragment — the out-of-range/early-return body,
// the element-type switch, declarations of `bpv`/`dim`/`d`/`h`, and the final
// return are not visible in this view.
438 [[nodiscard]]
inline std::vector<float>
439 read_vector(
const uint8_t* page_data,
size_t page_size,
size_t index)
const {
// Bounds check; NOTE(review): (index + 1) * bpv could wrap for a hostile
// index near SIZE_MAX — confirm callers constrain it.
443 if (bpv == 0 || (index + 1) * bpv > page_size) {
447 const uint8_t* src = page_data + index * bpv;
448 std::vector<float> vec(dim);
// FLOAT32 (presumably): one bulk copy of dim floats.
452 std::memcpy(vec.data(), src, dim *
sizeof(
float));
// FLOAT64 (presumably): read each 8-byte double, narrow to float.
457 for (
size_t j = 0; j < dim; ++j) {
459 std::memcpy(&d, src + j *
sizeof(
double),
sizeof(d));
460 vec[j] =
static_cast<float>(d);
// FLOAT16 (presumably): read each 2-byte half; the widening call is not visible.
466 for (
size_t j = 0; j < dim; ++j) {
468 std::memcpy(&h, src + j * 2,
sizeof(h));
/// Compute the dot product (inner product) of two float vectors.
/// Vectorized with AVX2+FMA, SSE2, or NEON when available; a scalar loop
/// handles the tail (and the whole range when no SIMD path is compiled in).
/// @param a  first operand, n floats (unaligned loads are used)
/// @param b  second operand, n floats
/// @param n  element count
/// @return   sum over i of a[i] * b[i] (0.0f when n == 0)
inline float dot_product(const float* a, const float* b, size_t n) noexcept {
  float sum = 0.0f;
  size_t i = 0;
#if defined(__AVX2__)
  __m256 acc = _mm256_setzero_ps();
  for (; i + 8 <= n; i += 8) {
    __m256 va = _mm256_loadu_ps(a + i);
    __m256 vb = _mm256_loadu_ps(b + i);
    acc = _mm256_fmadd_ps(va, vb, acc);  // AVX2 targets here also have FMA
  }
  // Horizontal sum of the 8 accumulator lanes.
  __m128 lo = _mm256_castps256_ps128(acc);
  __m128 hi = _mm256_extractf128_ps(acc, 1);
  __m128 s4 = _mm_add_ps(lo, hi);
  __m128 s2 = _mm_add_ps(s4, _mm_movehl_ps(s4, s4));
  __m128 s1 = _mm_add_ss(s2, _mm_movehdup_ps(s2));
  sum = _mm_cvtss_f32(s1);
#elif defined(__SSE4_2__) || defined(__SSE2__)
  __m128 acc = _mm_setzero_ps();
  for (; i + 4 <= n; i += 4) {
    __m128 va = _mm_loadu_ps(a + i);
    __m128 vb = _mm_loadu_ps(b + i);
    acc = _mm_add_ps(acc, _mm_mul_ps(va, vb));
  }
  // Horizontal sum using SSE2-only shuffles.
  __m128 shuf = _mm_movehl_ps(acc, acc);
  __m128 sums = _mm_add_ps(acc, shuf);
  shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 1));
  sums = _mm_add_ss(sums, shuf);
  sum = _mm_cvtss_f32(sums);
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
  float32x4_t acc = vdupq_n_f32(0.0f);
  for (; i + 4 <= n; i += 4) {
    float32x4_t va = vld1q_f32(a + i);
    float32x4_t vb = vld1q_f32(b + i);
    acc = vmlaq_f32(acc, va, vb);
  }
  // Pairwise horizontal sum of the 4 lanes.
  float32x2_t pair = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
  pair = vpadd_f32(pair, pair);
  sum = vget_lane_f32(pair, 0);
#endif
  // Scalar tail.
  for (; i < n; ++i) {
    sum += a[i] * b[i];
  }
  return sum;
}
/// Compute the squared L2 (Euclidean) distance between two float vectors.
/// Vectorized with AVX2+FMA, SSE2, or NEON when available; scalar tail otherwise.
/// @param a  first vector, n floats (unaligned loads are used)
/// @param b  second vector, n floats
/// @param n  element count
/// @return   sum over i of (a[i] - b[i])^2 (0.0f when n == 0)
inline float l2_distance_sq(const float* a, const float* b, size_t n) noexcept {
  float sum = 0.0f;
  size_t i = 0;
#if defined(__AVX2__)
  __m256 acc = _mm256_setzero_ps();
  for (; i + 8 <= n; i += 8) {
    __m256 va = _mm256_loadu_ps(a + i);
    __m256 vb = _mm256_loadu_ps(b + i);
    __m256 diff = _mm256_sub_ps(va, vb);
    acc = _mm256_fmadd_ps(diff, diff, acc);  // acc += diff * diff
  }
  // Horizontal sum of the 8 accumulator lanes.
  __m128 lo = _mm256_castps256_ps128(acc);
  __m128 hi = _mm256_extractf128_ps(acc, 1);
  __m128 s4 = _mm_add_ps(lo, hi);
  __m128 s2 = _mm_add_ps(s4, _mm_movehl_ps(s4, s4));
  __m128 s1 = _mm_add_ss(s2, _mm_movehdup_ps(s2));
  sum = _mm_cvtss_f32(s1);
#elif defined(__SSE4_2__) || defined(__SSE2__)
  __m128 acc = _mm_setzero_ps();
  for (; i + 4 <= n; i += 4) {
    __m128 va = _mm_loadu_ps(a + i);
    __m128 vb = _mm_loadu_ps(b + i);
    __m128 diff = _mm_sub_ps(va, vb);
    acc = _mm_add_ps(acc, _mm_mul_ps(diff, diff));
  }
  // Horizontal sum using SSE2-only shuffles.
  __m128 shuf = _mm_movehl_ps(acc, acc);
  __m128 sums = _mm_add_ps(acc, shuf);
  shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 1));
  sums = _mm_add_ss(sums, shuf);
  sum = _mm_cvtss_f32(sums);
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
  float32x4_t acc = vdupq_n_f32(0.0f);
  for (; i + 4 <= n; i += 4) {
    float32x4_t va = vld1q_f32(a + i);
    float32x4_t vb = vld1q_f32(b + i);
    float32x4_t diff = vsubq_f32(va, vb);
    acc = vmlaq_f32(acc, diff, diff);
  }
  // Pairwise horizontal sum of the 4 lanes.
  float32x2_t pair = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
  pair = vpadd_f32(pair, pair);
  sum = vget_lane_f32(pair, 0);
#endif
  // Scalar tail.
  for (; i < n; ++i) {
    float d = a[i] - b[i];
    sum += d * d;
  }
  return sum;
}
/// Compute the sum of squares of a float vector (squared L2 norm).
/// Vectorized with AVX2+FMA, SSE2, or NEON when available; scalar tail otherwise.
/// @param data  input vector, n floats (unaligned loads are used)
/// @param n     element count
/// @return      sum over i of data[i]^2 (0.0f when n == 0)
inline float sum_of_squares(const float* data, size_t n) noexcept {
  float sum = 0.0f;
  size_t i = 0;
#if defined(__AVX2__)
  __m256 acc = _mm256_setzero_ps();
  for (; i + 8 <= n; i += 8) {
    __m256 v = _mm256_loadu_ps(data + i);
    acc = _mm256_fmadd_ps(v, v, acc);  // acc += v * v
  }
  // Horizontal sum of the 8 accumulator lanes.
  __m128 lo = _mm256_castps256_ps128(acc);
  __m128 hi = _mm256_extractf128_ps(acc, 1);
  __m128 s4 = _mm_add_ps(lo, hi);
  __m128 s2 = _mm_add_ps(s4, _mm_movehl_ps(s4, s4));
  __m128 s1 = _mm_add_ss(s2, _mm_movehdup_ps(s2));
  sum = _mm_cvtss_f32(s1);
#elif defined(__SSE4_2__) || defined(__SSE2__)
  __m128 acc = _mm_setzero_ps();
  for (; i + 4 <= n; i += 4) {
    __m128 v = _mm_loadu_ps(data + i);
    acc = _mm_add_ps(acc, _mm_mul_ps(v, v));
  }
  // Horizontal sum using SSE2-only shuffles.
  __m128 shuf = _mm_movehl_ps(acc, acc);
  __m128 sums = _mm_add_ps(acc, shuf);
  shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 0, 0, 1));
  sums = _mm_add_ss(sums, shuf);
  sum = _mm_cvtss_f32(sums);
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
  float32x4_t acc = vdupq_n_f32(0.0f);
  for (; i + 4 <= n; i += 4) {
    float32x4_t v = vld1q_f32(data + i);
    acc = vmlaq_f32(acc, v, v);
  }
  // Pairwise horizontal sum of the 4 lanes.
  float32x2_t pair = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
  pair = vpadd_f32(pair, pair);
  sum = vget_lane_f32(pair, 0);
#endif
  // Scalar tail.
  for (; i < n; ++i) {
    sum += data[i] * data[i];
  }
  return sum;
}
// L2-normalize a float vector in place: multiply every element by
// 1 / sqrt(sum of squares). NOTE(review): source-listing fragment — the line
// computing norm_sq (presumably sum_of_squares(data, n)), the declaration of
// `i`, the leading #if, the scalar tail, and the closing brace are not visible.
// Zero vectors are left untouched to avoid dividing by zero.
692 if (norm_sq == 0.0f)
return;
// One division + multiplies is cheaper than dividing each element.
694 const float inv_norm = 1.0f / std::sqrt(norm_sq);
// AVX2 path (presumably guarded by #if defined(__AVX2__) above): scale 8 lanes per step.
698 __m256 scale = _mm256_set1_ps(inv_norm);
699 for (; i + 8 <= n; i += 8) {
700 __m256 v = _mm256_loadu_ps(data + i);
701 _mm256_storeu_ps(data + i, _mm256_mul_ps(v, scale));
// SSE path: scale 4 lanes per step.
704#elif defined(__SSE4_2__) || defined(__SSE2__)
705 __m128 scale = _mm_set1_ps(inv_norm);
706 for (; i + 4 <= n; i += 4) {
707 __m128 v = _mm_loadu_ps(data + i);
708 _mm_storeu_ps(data + i, _mm_mul_ps(v, scale));
// NEON path: scale 4 lanes per step.
711#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
712 float32x4_t scale = vdupq_n_f32(inv_norm);
713 for (; i + 4 <= n; i += 4) {
714 float32x4_t v = vld1q_f32(data + i);
715 vst1q_f32(data + i, vmulq_f32(v, scale));
/// Fast copy of n floats from src to dst (ranges must not overlap).
/// SIMD paths copy the bulk; the remaining tail (or everything, when no SIMD
/// path is compiled in) is handled by a single memcpy.
/// @param dst  destination, room for n floats
/// @param src  source, n floats
/// @param n    element count (n == 0 is a no-op)
inline void copy_floats(float* dst, const float* src, size_t n) noexcept {
  size_t i = 0;
#if defined(__AVX2__)
  for (; i + 8 <= n; i += 8) {
    __m256 v = _mm256_loadu_ps(src + i);
    _mm256_storeu_ps(dst + i, v);
  }
#elif defined(__SSE4_2__) || defined(__SSE2__)
  for (; i + 4 <= n; i += 4) {
    __m128 v = _mm_loadu_ps(src + i);
    _mm_storeu_ps(dst + i, v);
  }
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
  for (; i + 4 <= n; i += 4) {
    float32x4_t v = vld1q_f32(src + i);
    vst1q_f32(dst + i, v);
  }
#endif
  // Tail (a zero-length memcpy is well-defined here since both pointers
  // remain valid).
  std::memcpy(dst + i, src + i, (n - i) * sizeof(float));
}
790 const std::string& name,
Fluent builder for constructing a Schema one column at a time.
SchemaBuilder & raw_column(ColumnDescriptor cd)
Add a pre-built ColumnDescriptor directly.
Reads FIXED_LEN_BYTE_ARRAY page data back into float vectors.
std::vector< float > read_vector(const uint8_t *page_data, size_t page_size, size_t index) const
Read a single vector at the given index from a page.
VectorReader(VectorColumnSpec spec)
Construct a VectorReader for the given column specification.
const VectorColumnSpec & spec() const noexcept
The column spec this reader was constructed with.
std::vector< std::vector< float > > read_page(const uint8_t *data, size_t data_size) const
Decode a PLAIN-encoded page of FIXED_LEN_BYTE_ARRAY vectors into float32 vectors.
expected< ZeroCopyResult > read_page_zero_copy(const uint8_t *data, size_t data_size) const
Attempt a zero-copy read of a FLOAT32 page.
Buffers float vectors and encodes them as FIXED_LEN_BYTE_ARRAY PLAIN data.
void add(const float *data)
Add a single vector from a float32 pointer (must point to dimension floats).
std::vector< uint8_t > flush()
Flush the buffered vectors and return the encoded page bytes.
const VectorColumnSpec & spec() const noexcept
The column spec this writer was constructed with.
VectorWriter(VectorColumnSpec spec)
Construct a VectorWriter for the given column specification.
static ColumnDescriptor make_descriptor(const std::string &name, const VectorColumnSpec &spec)
Create a ColumnDescriptor for a vector column with the given name and spec.
size_t num_vectors() const noexcept
Number of vectors currently buffered (since last flush).
bool add_batch(const float *data, size_t num_vectors)
Add a batch of vectors (num_vectors vectors, each dimension elements, row-major).
A lightweight result type that holds either a success value of type T or an Error.
void copy_floats(float *dst, const float *src, size_t n) noexcept
Fast copy of n floats from src to dst.
float sum_of_squares(const float *data, size_t n) noexcept
Compute the sum of squares of a float vector.
void l2_normalize(float *data, size_t n) noexcept
L2-normalize a float vector in-place (divide each element by the L2 norm).
float dot_product(const float *a, const float *b, size_t n) noexcept
Compute the dot product (inner product) of two float vectors.
float l2_distance_sq(const float *a, const float *b, size_t n) noexcept
Compute the squared L2 (Euclidean) distance between two float vectors.
VectorElementType
Specifies the numerical precision of each element within a vector column.
@ FLOAT64
IEEE 754 double-precision (8 bytes per element)
@ FLOAT32
IEEE 754 single-precision (4 bytes per element)
@ FLOAT16
IEEE 754 half-precision (2 bytes per element)
@ FIXED_LEN_BYTE_ARRAY
Fixed-length byte array (UUID, vectors, decimals).
float f16_to_f32(uint16_t h) noexcept
Convert a 16-bit IEEE 754 half-precision value to a 32-bit float.
SchemaBuilder & add_vector_column(SchemaBuilder &builder, const std::string &name, uint32_t dimension, VectorElementType elem=VectorElementType::FLOAT32)
Add a vector column to a SchemaBuilder.
@ FLOAT32_VECTOR
ML embedding vector — FIXED_LEN_BYTE_ARRAY(dim*4).
uint16_t f32_to_f16(float val) noexcept
Convert a 32-bit float to a 16-bit IEEE 754 half-precision value.
@ UNSUPPORTED_TYPE
The file contains a Parquet physical or logical type that is not implemented.
@ SCHEMA_MISMATCH
The requested column name or type does not match the file schema.
@ INTERNAL_ERROR
An unexpected internal error that does not fit any other category.
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated, or exceeds size limits).
@ FLOAT64
IEEE 754 double-precision (8 bytes)
@ FLOAT32
IEEE 754 single-precision (4 bytes)
@ FLOAT16
IEEE 754 half-precision (2 bytes)
Schema definition types: Column<T>, SchemaBuilder, and Schema.
Descriptor for a single column in a Parquet schema.
int32_t type_length
Byte length for FIXED_LEN_BYTE_ARRAY columns (-1 = N/A).
LogicalType logical_type
Semantic annotation (STRING, TIMESTAMP_NS, etc.).
std::string name
Column name (unique within a schema).
PhysicalType physical_type
On-disk storage type.
Lightweight error value carrying an ErrorCode and a human-readable message.
Configuration for a vector column: dimensionality and element precision.
uint32_t dimension
Number of elements per vector.
constexpr size_t element_size() const noexcept
Returns the byte size of one element (2, 4, or 8).
VectorElementType element_type
Element precision.
constexpr size_t bytes_per_vector() const noexcept
Returns the total byte size of one vector (dimension * element_size).
Zero-copy read result: a pointer to the float data and the vector count.
const float * data
Pointer to the first float of the first vector.
size_t num_vectors
Number of complete vectors in the page.
Parquet format enumerations, type traits, and statistics structs.