๐Ÿฆ€ Functional Rust

963: Bloom Filter

Difficulty: Intermediate Category: Probabilistic Data Structures Concept: Space-efficient probabilistic set โ€” `add` and `might_contain` operations with no false negatives but possible false positives Key Insight: A Bloom filter stores items as bits set by multiple hash functions; OCaml uses `bool array` (simple), Rust uses `Vec<u64>` with bit manipulation โ€” Rust's `wrapping_mul/add` handles hash overflow explicitly where OCaml's arithmetic silently wraps
// 963: Bloom Filter
// Probabilistic membership test: no false negatives, possible false positives
// Uses 3 independent hash functions + bit array (u64 words)

// Approach 1: Three hash functions (djb2, sdbm, fnv-like)
fn hash1(s: &str) -> usize {
    s.bytes().fold(5381usize, |h, b| h.wrapping_mul(31).wrapping_add(b as usize))
}

fn hash2(s: &str) -> usize {
    s.bytes().fold(0usize, |h, b| {
        (b as usize)
            .wrapping_add(h.wrapping_shl(6))
            .wrapping_add(h.wrapping_shl(16))
            .wrapping_sub(h)
    })
}

fn hash3(s: &str) -> usize {
    s.bytes().fold(0usize, |h, b| h.wrapping_mul(33) ^ (b as usize))
}

// Approach 2: Bloom filter with u64 bit array
pub struct BloomFilter {
    bits: Vec<u64>,
    num_bits: usize,
}

impl BloomFilter {
    pub fn new(num_bits: usize) -> Self {
        let words = (num_bits + 63) / 64;
        BloomFilter {
            bits: vec![0u64; words],
            num_bits,
        }
    }

    fn set_bit(&mut self, i: usize) {
        let idx = i % self.num_bits;
        let word = idx / 64;
        let bit = idx % 64;
        self.bits[word] |= 1u64 << bit;
    }

    fn get_bit(&self, i: usize) -> bool {
        let idx = i % self.num_bits;
        let word = idx / 64;
        let bit = idx % 64;
        (self.bits[word] >> bit) & 1 == 1
    }

    pub fn add(&mut self, s: &str) {
        self.set_bit(hash1(s));
        self.set_bit(hash2(s));
        self.set_bit(hash3(s));
    }

    pub fn might_contain(&self, s: &str) -> bool {
        self.get_bit(hash1(s)) && self.get_bit(hash2(s)) && self.get_bit(hash3(s))
    }

    pub fn count_set_bits(&self) -> u32 {
        self.bits.iter().map(|w| w.count_ones()).sum()
    }

    /// Estimated false positive rate given n items inserted
    pub fn false_positive_rate(&self, n: usize) -> f64 {
        let k = 3.0_f64; // number of hash functions
        let m = self.num_bits as f64;
        (1.0 - (-k * n as f64 / m).exp()).powf(k)
    }
}

fn main() {
    let mut bf = BloomFilter::new(1024);
    bf.add("apple");
    bf.add("banana");
    bf.add("cherry");

    println!("apple:  {}", bf.might_contain("apple"));
    println!("banana: {}", bf.might_contain("banana"));
    println!("cherry: {}", bf.might_contain("cherry"));
    println!("dragon: {} (might be false positive)", bf.might_contain("dragon"));
    println!("bits set: {}", bf.count_set_bits());
    println!("false positive rate (3 items): {:.4}", bf.false_positive_rate(3));
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_inserted_items_found() {
        let mut bf = BloomFilter::new(1024);
        let words = ["apple", "banana", "cherry", "dog", "elephant"];
        for w in &words {
            bf.add(w);
        }
        for w in &words {
            assert!(bf.might_contain(w), "must contain '{}'", w);
        }
    }

    #[test]
    fn test_false_positive_rate_reasonable() {
        let mut bf = BloomFilter::new(10_000);
        for i in 0..100 {
            bf.add(&format!("item_{}", i));
        }
        // Check 1000 non-inserted items, count false positives
        let fp: usize = (0..1000)
            .filter(|i| bf.might_contain(&format!("not_inserted_{}", i)))
            .count();
        // With 10000 bits, 3 hashes, 100 items โ†’ FP rate ~ 0.001
        // Allow up to 5% false positives in test
        assert!(fp < 50, "too many false positives: {}/1000", fp);
    }

    #[test]
    fn test_bit_operations() {
        let mut bf = BloomFilter::new(128);
        bf.set_bit(0);
        bf.set_bit(63);
        bf.set_bit(64);
        bf.set_bit(127);
        assert!(bf.get_bit(0));
        assert!(bf.get_bit(63));
        assert!(bf.get_bit(64));
        assert!(bf.get_bit(127));
        assert!(!bf.get_bit(1));
        assert!(!bf.get_bit(100));
    }

    #[test]
    fn test_count_set_bits() {
        let mut bf = BloomFilter::new(64);
        assert_eq!(bf.count_set_bits(), 0);
        bf.add("test");
        assert!(bf.count_set_bits() > 0);
        assert!(bf.count_set_bits() <= 3);
    }
}
(* 963: Bloom Filter *)
(* Probabilistic set membership: may have false positives, no false negatives *)

(* Simple hash functions using djb2 and sdbm *)

let hash1 s =
  String.fold_left (fun h c -> h * 31 + Char.code c) 5381 s

let hash2 s =
  String.fold_left (fun h c -> (Char.code c) + (h lsl 6) + (h lsl 16) - h) 0 s

let hash3 s =
  String.fold_left (fun h c -> h * 33 lxor Char.code c) 0 s

(* Approach 1: Bloom filter with fixed bit array *)

type bloom_filter = {
  bits: bool array;
  size: int;
}

let create size = { bits = Array.make size false; size }

let add bf s =
  let h1 = abs (hash1 s) mod bf.size in
  let h2 = abs (hash2 s) mod bf.size in
  let h3 = abs (hash3 s) mod bf.size in
  bf.bits.(h1) <- true;
  bf.bits.(h2) <- true;
  bf.bits.(h3) <- true

let might_contain bf s =
  let h1 = abs (hash1 s) mod bf.size in
  let h2 = abs (hash2 s) mod bf.size in
  let h3 = abs (hash3 s) mod bf.size in
  bf.bits.(h1) && bf.bits.(h2) && bf.bits.(h3)

(* Approach 2: Compact bit representation using int array *)

let create_compact num_bits =
  let words = (num_bits + 62) / 63 in
  Array.make words 0

let set_bit bits i =
  let word = i / 63 in
  let bit = i mod 63 in
  bits.(word) <- bits.(word) lor (1 lsl bit)

let get_bit bits i =
  let word = i / 63 in
  let bit = i mod 63 in
  (bits.(word) lsr bit) land 1 = 1

let () =
  let bf = create 1024 in

  add bf "apple";
  add bf "banana";
  add bf "cherry";

  (* All added items must be found *)
  assert (might_contain bf "apple");
  assert (might_contain bf "banana");
  assert (might_contain bf "cherry");

  (* These might or might not be found (false positives possible) *)
  (* but very unlikely with 1024 bits and 3 hashes for these strings *)
  let likely_absent = not (might_contain bf "dragon") in
  let _ = likely_absent in (* suppressing: could be false positive *)

  (* Compact bit array test *)
  let bits = create_compact 128 in
  set_bit bits 0;
  set_bit bits 63;
  set_bit bits 127;
  assert (get_bit bits 0);
  assert (get_bit bits 63);
  assert (get_bit bits 127);
  assert (not (get_bit bits 1));
  assert (not (get_bit bits 64));

  Printf.printf "โœ“ All tests passed\n"

๐Ÿ“Š Detailed Comparison

Bloom Filter โ€” Comparison

Core Insight

A Bloom filter is a bit array + k hash functions. `add(x)` sets k bits; `might_contain(x)` checks all k bits are set. Both languages use the same math. The difference: OCaml uses a `bool array` (simple, 1 byte per bit) while Rust uses `Vec<u64>` (8x more space-efficient via bitwise operations). Rust also requires explicit `wrapping_*` arithmetic to avoid overflow panics.

OCaml Approach

  • `bool array` โ€” one bool per bit slot (simple but memory-wasteful)
  • `String.fold_left` for hash accumulation
  • `abs h mod bf.size` for index calculation
  • `bf.bits.(i) <- true` for bit setting
  • Integer arrays for compact bit representation (manual bit ops)
  • `lsl`, `lor`, `lxor`, `lsr` โ€” OCaml bitwise operators

Rust Approach

  • `Vec<u64>` โ€” 64 bits per word, 8x more compact
  • `wrapping_mul`, `wrapping_add`, `wrapping_shl`, `wrapping_sub` โ€” explicit overflow handling
  • `bits[word] |= 1u64 << bit` โ€” set a bit in a u64 word
  • `(bits[word] >> bit) & 1 == 1` โ€” test a bit
  • `count_ones()` on u64 โ€” popcount (hardware-accelerated)
  • FP rate formula: `(1 - e^(-k*n/m))^k`

Comparison Table

AspectOCamlRust
Bit storage`bool array` (1 byte/bit)`Vec<u64>` (1 bit/bit)
Hash fold`String.fold_left (fun h c -> h31 + code c)``s.bytes().fold(h, \h, b\h.wrapping_mul(31).wrapping_add(b))`
OverflowSilent (OCaml int wraps)Explicit `wrapping_` methods
Bit set`arr.(i) <- true``arr[word] \= 1u64 << bit`
Bit test`arr.(i)``(arr[word] >> bit) & 1 == 1`
PopcountManual count`u64::count_ones()` (hardware)
Bitwise ops`lsl`, `lor`, `lxor``<<`, `\`, `^`, `>>`