69 static constexpr size_t kMaxBytes = 128ULL * 1024 * 1024;
90 if (fpr <= 0.0 || fpr >= 1.0) {
91 throw std::invalid_argument(
92 "SplitBlockBloomFilter: fpr must be in (0, 1)");
94 if (num_distinct_values == 0) {
95 num_distinct_values = 1;
99 double exponent = 1.0 / 8.0;
100 double fpr_root = std::pow(fpr, exponent);
101 double denom = std::log(1.0 - fpr_root);
102 double raw_bytes = -8.0 *
static_cast<double>(num_distinct_values) / denom;
105 auto num_bytes =
static_cast<size_t>(std::ceil(raw_bytes));
106 num_bytes = round_up_to_block(num_bytes);
107 num_bytes = (std::max)(num_bytes,
kMinBytes);
108 num_bytes = (std::min)(num_bytes,
kMaxBytes);
110 data_.resize(num_bytes, 0);
121 throw std::invalid_argument(
122 "SplitBlockBloomFilter::with_size: "
123 "num_bytes must be a positive multiple of 32");
125 num_bytes = (std::min)(num_bytes,
kMaxBytes);
127 f.data_.resize(num_bytes, 0);
145 const auto block_idx =
static_cast<size_t>(
146 (
static_cast<uint64_t
>(hash >> 32) * nblocks) >> 32);
147 assert(block_idx < nblocks);
149 const auto key =
static_cast<uint32_t
>(hash);
152 uint32_t bit_pos = (key * kSalt[i]) >> 27;
153 uint32_t word = block_read(block_idx, i);
154 word |= (UINT32_C(1) << bit_pos);
155 block_write(block_idx, i, word);
166 const auto block_idx =
static_cast<size_t>(
167 (
static_cast<uint64_t
>(hash >> 32) * nblocks) >> 32);
168 assert(block_idx < nblocks);
170 const auto key =
static_cast<uint32_t
>(hash);
173 uint32_t bit_pos = (key * kSalt[i]) >> 27;
174 uint32_t word = block_read(block_idx, i);
175 if ((word & (UINT32_C(1) << bit_pos)) == 0) {
249 [[nodiscard]]
const std::vector<uint8_t>&
data()
const {
return data_; }
252 [[nodiscard]]
size_t size_bytes()
const {
return data_.size(); }
267 throw std::invalid_argument(
268 "SplitBlockBloomFilter::from_data: "
269 "size must be a positive multiple of 32");
273 throw std::invalid_argument(
274 "SplitBlockBloomFilter::from_data: "
275 "data exceeds maximum filter size");
278 f.data_.assign(src, src + size);
294 if (data_.size() != other.data_.size()) {
295 throw std::invalid_argument(
296 "SplitBlockBloomFilter::merge: filter sizes must match");
298 for (
size_t i = 0; i < data_.size(); ++i) {
299 data_[i] |= other.data_[i];
305 std::fill(data_.begin(), data_.end(), uint8_t{0});
319 if (num_insertions == 0)
return 0.0;
321 const double k = 8.0;
322 const double m =
static_cast<double>(
num_blocks()) * 256.0;
323 const double n =
static_cast<double>(num_insertions);
326 double p = 1.0 - std::exp(-k * n / m);
328 return std::pow(p, k);
337 static constexpr uint64_t kBloomSeed = 0;
338 static_assert(kBloomSeed == 0,
"Parquet spec requires bloom filter seed = 0");
362 static constexpr size_t round_up_to_block(
size_t bytes) {
369 uint32_t block_read(
size_t block_idx,
size_t word_idx)
const {
371 std::memcpy(&val, data_.data() + block_idx *
kBytesPerBlock + word_idx *
sizeof(uint32_t),
sizeof(uint32_t));
378 void block_write(
size_t block_idx,
size_t word_idx, uint32_t val) {
379 std::memcpy(data_.data() + block_idx *
kBytesPerBlock + word_idx *
sizeof(uint32_t), &val,
sizeof(uint32_t));
383 [[deprecated(
"use block_read/block_write")]]
384 uint32_t* block_ptr(
size_t idx) {
385 return reinterpret_cast<uint32_t*
>(data_.data() + idx *
kBytesPerBlock);
389 [[deprecated(
"use block_read/block_write")]]
390 const uint32_t* block_ptr(
size_t idx)
const {
391 return reinterpret_cast<const uint32_t*
>(
396 std::vector<uint8_t> data_;
Parquet-spec Split Block Bloom Filter for probabilistic set membership.
void insert_value(float val)
Insert a 32-bit float value (hashed with xxHash64).
size_t size_bytes() const
Total size of the filter in bytes (always a multiple of 32).
static constexpr size_t kWordsPerBlock
Number of uint32_t words per block (8).
static constexpr size_t kMinBytes
Minimum filter size: one block (32 bytes).
static constexpr size_t kBytesPerBlock
Block size in bytes (32 bytes = 256 bits = 8 x uint32_t words).
bool might_contain_value(int32_t val) const
Check if a 32-bit integer value might be present in the filter.
void insert(uint64_t hash)
Insert a pre-computed xxHash64 value into the filter.
void merge(const SplitBlockBloomFilter &other)
Merge another filter into this one via bitwise OR.
bool might_contain(uint64_t hash) const
Check if a pre-computed hash value might be present in the filter.
const std::vector< uint8_t > & data() const
Access the raw filter data for serialization into a Parquet file.
void insert_value(int64_t val)
Insert a 64-bit integer value (hashed with xxHash64).
bool might_contain_value(const std::string &val) const
Check if a string value might be present in the filter.
size_t num_blocks() const
Number of 32-byte blocks in the filter.
void insert_value(int32_t val)
Insert a 32-bit integer value (hashed with xxHash64).
double estimated_fpr(size_t num_insertions) const
Estimate the false-positive rate after a given number of insertions.
static SplitBlockBloomFilter with_size(size_t num_bytes)
Create a bloom filter with an explicit byte size.
static constexpr size_t kMaxBytes
Maximum filter size: 128 MiB (Parquet spec recommendation).
bool might_contain_value(float val) const
Check if a 32-bit float value might be present in the filter.
bool might_contain_value(int64_t val) const
Check if a 64-bit integer value might be present in the filter.
bool might_contain_value(double val) const
Check if a 64-bit double value might be present in the filter.
void insert_value(double val)
Insert a 64-bit double value (hashed with xxHash64).
void reset()
Reset the filter (clear all bits to zero).
static SplitBlockBloomFilter from_data(const uint8_t *src, size_t size)
Reconstruct a filter from previously serialized bytes.
SplitBlockBloomFilter(size_t num_distinct_values, double fpr=0.01)
Create a bloom filter sized for the expected number of distinct values.
void insert_value(const std::string &val)
Insert a string value (hashed with xxHash64).
uint64_t hash64(const void *data, size_t length, uint64_t seed=0)
Compute xxHash64 of an arbitrary byte buffer.
uint64_t hash64_value(const T &val, uint64_t seed=0)
Convenience overload: hash a trivially-copyable typed value.
Header-only xxHash64 implementation for Signet Forge.