Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
split_block.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
19
21
22#include <algorithm>
23#include <cassert>
24#include <cmath>
25#include <cstdint>
26#include <cstring>
27#include <stdexcept>
28#include <string>
29#include <vector>
30
31namespace signet::forge {
32
58public:
60 static constexpr size_t kBytesPerBlock = 32;
61
63 static constexpr size_t kWordsPerBlock = 8;
64
66 static constexpr size_t kMinBytes = kBytesPerBlock;
67
69 static constexpr size_t kMaxBytes = 128ULL * 1024 * 1024;
70
71 // -----------------------------------------------------------------------
72 // Construction
73 // -----------------------------------------------------------------------
74
87 explicit SplitBlockBloomFilter(size_t num_distinct_values,
88 double fpr = 0.01)
89 {
90 if (fpr <= 0.0 || fpr >= 1.0) {
91 throw std::invalid_argument(
92 "SplitBlockBloomFilter: fpr must be in (0, 1)");
93 }
94 if (num_distinct_values == 0) {
95 num_distinct_values = 1; // at least one block
96 }
97
98 // Optimal byte count: -8 * ndv / ln(1 - fpr^(1/8))
99 double exponent = 1.0 / 8.0;
100 double fpr_root = std::pow(fpr, exponent);
101 double denom = std::log(1.0 - fpr_root);
102 double raw_bytes = -8.0 * static_cast<double>(num_distinct_values) / denom;
103
104 // Round up to multiple of 32, clamp to [kMinBytes, kMaxBytes]
105 auto num_bytes = static_cast<size_t>(std::ceil(raw_bytes));
106 num_bytes = round_up_to_block(num_bytes);
107 num_bytes = (std::max)(num_bytes, kMinBytes);
108 num_bytes = (std::min)(num_bytes, kMaxBytes);
109
110 data_.resize(num_bytes, 0);
111 }
112
119 static SplitBlockBloomFilter with_size(size_t num_bytes) {
120 if (num_bytes == 0 || (num_bytes % kBytesPerBlock) != 0) {
121 throw std::invalid_argument(
122 "SplitBlockBloomFilter::with_size: "
123 "num_bytes must be a positive multiple of 32");
124 }
125 num_bytes = (std::min)(num_bytes, kMaxBytes);
127 f.data_.resize(num_bytes, 0);
128 return f;
129 }
130
131 // -----------------------------------------------------------------------
132 // Core operations
133 // -----------------------------------------------------------------------
134
141 void insert(uint64_t hash) {
142 const size_t nblocks = num_blocks();
143 // Block index: use upper 32 bits to select the block
144 // Invariant: nblocks > 0 (enforced by constructor min 1 block)
145 const auto block_idx = static_cast<size_t>(
146 (static_cast<uint64_t>(hash >> 32) * nblocks) >> 32);
147 assert(block_idx < nblocks);
148
149 const auto key = static_cast<uint32_t>(hash);
150
151 for (size_t i = 0; i < kWordsPerBlock; ++i) {
152 uint32_t bit_pos = (key * kSalt[i]) >> 27; // top 5 bits
153 uint32_t word = block_read(block_idx, i);
154 word |= (UINT32_C(1) << bit_pos);
155 block_write(block_idx, i, word);
156 }
157 }
158
164 [[nodiscard]] bool might_contain(uint64_t hash) const {
165 const size_t nblocks = num_blocks();
166 const auto block_idx = static_cast<size_t>(
167 (static_cast<uint64_t>(hash >> 32) * nblocks) >> 32);
168 assert(block_idx < nblocks);
169
170 const auto key = static_cast<uint32_t>(hash);
171
172 for (size_t i = 0; i < kWordsPerBlock; ++i) {
173 uint32_t bit_pos = (key * kSalt[i]) >> 27;
174 uint32_t word = block_read(block_idx, i);
175 if ((word & (UINT32_C(1) << bit_pos)) == 0) {
176 return false;
177 }
178 }
179 return true;
180 }
181
182 // -----------------------------------------------------------------------
183 // Convenience: insert/check typed values using xxHash64
184 // -----------------------------------------------------------------------
185
188 void insert_value(const std::string& val) {
190 }
193 void insert_value(int32_t val) {
195 }
198 void insert_value(int64_t val) {
200 }
203 void insert_value(float val) {
205 }
208 void insert_value(double val) {
210 }
211
215 [[nodiscard]] bool might_contain_value(const std::string& val) const {
216 return might_contain(xxhash::hash64(val));
217 }
221 [[nodiscard]] bool might_contain_value(int32_t val) const {
223 }
227 [[nodiscard]] bool might_contain_value(int64_t val) const {
229 }
233 [[nodiscard]] bool might_contain_value(float val) const {
235 }
239 [[nodiscard]] bool might_contain_value(double val) const {
241 }
242
243 // -----------------------------------------------------------------------
244 // Serialization / deserialization
245 // -----------------------------------------------------------------------
246
249 [[nodiscard]] const std::vector<uint8_t>& data() const { return data_; }
250
252 [[nodiscard]] size_t size_bytes() const { return data_.size(); }
253
255 [[nodiscard]] size_t num_blocks() const {
256 return data_.size() / kBytesPerBlock;
257 }
258
265 static SplitBlockBloomFilter from_data(const uint8_t* src, size_t size) {
266 if (size == 0 || (size % kBytesPerBlock) != 0) {
267 throw std::invalid_argument(
268 "SplitBlockBloomFilter::from_data: "
269 "size must be a positive multiple of 32");
270 }
271 // CWE-400: Uncontrolled Resource Consumption — kMaxBytes (128 MiB) cap
272 if (size > kMaxBytes) {
273 throw std::invalid_argument(
274 "SplitBlockBloomFilter::from_data: "
275 "data exceeds maximum filter size");
276 }
278 f.data_.assign(src, src + size);
279 return f;
280 }
281
282 // -----------------------------------------------------------------------
283 // Utilities
284 // -----------------------------------------------------------------------
285
293 void merge(const SplitBlockBloomFilter& other) {
294 if (data_.size() != other.data_.size()) {
295 throw std::invalid_argument(
296 "SplitBlockBloomFilter::merge: filter sizes must match");
297 }
298 for (size_t i = 0; i < data_.size(); ++i) {
299 data_[i] |= other.data_[i];
300 }
301 }
302
304 void reset() {
305 std::fill(data_.begin(), data_.end(), uint8_t{0});
306 }
307
318 [[nodiscard]] double estimated_fpr(size_t num_insertions) const {
319 if (num_insertions == 0) return 0.0;
320
321 const double k = 8.0;
322 const double m = static_cast<double>(num_blocks()) * 256.0; // total bits
323 const double n = static_cast<double>(num_insertions);
324
325 // Probability that a single bit is set after n insertions
326 double p = 1.0 - std::exp(-k * n / m);
327 // FPR = probability all k bits match
328 return std::pow(p, k);
329 }
330
331private:
333 SplitBlockBloomFilter() = default;
334
337 static constexpr uint64_t kBloomSeed = 0;
338 static_assert(kBloomSeed == 0, "Parquet spec requires bloom filter seed = 0");
339
344 static constexpr uint32_t kSalt[kWordsPerBlock] = {
345 0x47b6137bU,
346 0x44974d91U,
347 0x8824ad5bU,
348 0xa2b7289dU,
349 0x705495c7U,
350 0x2df1424bU,
351 0x9efc4947U,
352 0x5c6bfb31U
353 };
354
355 // -----------------------------------------------------------------------
356 // Internal helpers
357 // -----------------------------------------------------------------------
358
362 static constexpr size_t round_up_to_block(size_t bytes) {
363 return ((bytes + kBytesPerBlock - 1) / kBytesPerBlock) * kBytesPerBlock;
364 }
365
369 uint32_t block_read(size_t block_idx, size_t word_idx) const {
370 uint32_t val;
371 std::memcpy(&val, data_.data() + block_idx * kBytesPerBlock + word_idx * sizeof(uint32_t), sizeof(uint32_t));
372 return val;
373 }
374
378 void block_write(size_t block_idx, size_t word_idx, uint32_t val) {
379 std::memcpy(data_.data() + block_idx * kBytesPerBlock + word_idx * sizeof(uint32_t), &val, sizeof(uint32_t));
380 }
381
383 [[deprecated("use block_read/block_write")]]
384 uint32_t* block_ptr(size_t idx) {
385 return reinterpret_cast<uint32_t*>(data_.data() + idx * kBytesPerBlock);
386 }
387
389 [[deprecated("use block_read/block_write")]]
390 const uint32_t* block_ptr(size_t idx) const {
391 return reinterpret_cast<const uint32_t*>(
392 data_.data() + idx * kBytesPerBlock);
393 }
394
396 std::vector<uint8_t> data_;
397};
398
399} // namespace signet::forge
Parquet-spec Split Block Bloom Filter for probabilistic set membership.
void insert_value(float val)
Insert a 32-bit float value (hashed with xxHash64).
size_t size_bytes() const
Total size of the filter in bytes (always a multiple of 32).
static constexpr size_t kWordsPerBlock
Number of uint32_t words per block (8).
static constexpr size_t kMinBytes
Minimum filter size: one block (32 bytes).
static constexpr size_t kBytesPerBlock
Block size in bytes (32 bytes = 256 bits = 8 x uint32_t words).
bool might_contain_value(int32_t val) const
Check if a 32-bit integer value might be present in the filter.
void insert(uint64_t hash)
Insert a pre-computed xxHash64 value into the filter.
void merge(const SplitBlockBloomFilter &other)
Merge another filter into this one via bitwise OR.
bool might_contain(uint64_t hash) const
Check if a pre-computed hash value might be present in the filter.
const std::vector< uint8_t > & data() const
Access the raw filter data for serialization into a Parquet file.
void insert_value(int64_t val)
Insert a 64-bit integer value (hashed with xxHash64).
bool might_contain_value(const std::string &val) const
Check if a string value might be present in the filter.
size_t num_blocks() const
Number of 32-byte blocks in the filter.
void insert_value(int32_t val)
Insert a 32-bit integer value (hashed with xxHash64).
double estimated_fpr(size_t num_insertions) const
Estimate the false-positive rate after a given number of insertions.
static SplitBlockBloomFilter with_size(size_t num_bytes)
Create a bloom filter with an explicit byte size.
static constexpr size_t kMaxBytes
Maximum filter size: 128 MiB (Parquet spec recommendation).
bool might_contain_value(float val) const
Check if a 32-bit float value might be present in the filter.
bool might_contain_value(int64_t val) const
Check if a 64-bit integer value might be present in the filter.
bool might_contain_value(double val) const
Check if a 64-bit double value might be present in the filter.
void insert_value(double val)
Insert a 64-bit double value (hashed with xxHash64).
void reset()
Reset the filter (clear all bits to zero).
static SplitBlockBloomFilter from_data(const uint8_t *src, size_t size)
Reconstruct a filter from previously serialized bytes.
SplitBlockBloomFilter(size_t num_distinct_values, double fpr=0.01)
Create a bloom filter sized for the expected number of distinct values.
void insert_value(const std::string &val)
Insert a string value (hashed with xxHash64).
uint64_t hash64(const void *data, size_t length, uint64_t seed=0)
Compute xxHash64 of an arbitrary byte buffer.
Definition xxhash.hpp:159
uint64_t hash64_value(const T &val, uint64_t seed=0)
Convenience overload: hash a trivially-copyable typed value.
Definition xxhash.hpp:263
Header-only xxHash64 implementation for Signet Forge.