arthurprs · arthurprs · Jul 22, 2024 · Jul 21, 2024 · Jul 21, 2024 · Jul 22, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -20,6 +20,7 @@ jobs:
           toolchain: stable
           override: true
       - run: cargo test
+      - run: cargo test --all-features
 
   fuzz-tests:
     name: Fuzz tests

diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "qfilter"
-version = "0.1.6"
+version = "0.2.0"
 description = "Efficient bloom filter like datastructure, based on the Rank Select Quotient Filter (RSQF)"
 repository = "https://github.com/arthurprs/qfilter"
 authors = ["Arthur Silva <arthurprs@gmail.com>"]
@@ -30,3 +30,10 @@ serde_cbor = "0.11"
 [profile.bench]
 opt-level = 3
 debug = true
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(fuzzing)'] }
+
+[package.metadata.docs.rs]
+features = ["stats"]
+rustdoc-args = ["--cfg", "docsrs"]
diff --git a/README.md b/README.md
@@ -1,9 +1,9 @@
 # Qfilter
 
-Efficient bloom filter like datastructure, based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
+Efficient bloom filter like data structure, based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
 
 This is a small and flexible general-purpose [AMQ-Filter](https://en.wikipedia.org/wiki/Approximate_Membership_Query_Filter).
-It not only supports approximate membership testing like a bloom filter but also deletions, merging (not implemented),
+It not only supports approximate membership testing like a bloom filter but also deletions, merging,
 resizing and [serde](https://crates.io/crates/serde) serialization.
 
 * High performance
@@ -12,6 +12,16 @@ resizing and [serde](https://crates.io/crates/serde) serialization.
 * Can be created with a initial small capacity and grow as needed
 * (De)Serializable with [serde](https://crates.io/crates/serde)
 * Portable Rust implementation
+* Only verifiable uses of `unsafe`
+
+This data structure is similar to a hash table that stores fingerprints in a very compact way.
+Fingerprints are similar to hash values, but are possibly truncated.
+The reason for false positives is that multiple items can map to the same fingerprint.
+For more information, see the [quotient filter Wikipedia page](https://en.wikipedia.org/wiki/Quotient_filter),
+which describes a similar but less optimized version of the data structure.
+The actual implementation is based on the [Rank Select Quotient Filter (RSQF)](https://dl.acm.org/doi/pdf/10.1145/3035918.3035963).
+
+The public API also exposes a fingerprint API, which can be used to succinctly store u64 hash values.
 
 ### Example
 
@@ -33,45 +43,30 @@ The hashing algorithm used is [xxhash3](https://crates.io/crates/xxhash-rust) wh
 
 For a given capacity and error probability the RSQF may require significantly less space than the equivalent bloom filter or other AMQ-Filters.
 
-| Bits per item | Error probability when full |
-|--------|----------|
-| 3.125  | 0.362    |
-| 4.125  | 0.201    |
-| 5.125  | 0.106    |
-| 6.125  | 0.0547   |
-| 7.125  | 0.0277   |
-| 8.125  | 0.014    |
-| 9.125  | 0.00701  |
-| 10.125 | 0.00351  |
-| 11.125 | 0.00176  |
-| 12.125 | 0.000879 |
-| 13.125 | 0.000439 |
-| 14.125 | 0.00022  |
-| 15.125 | 0.00011  |
-| 16.125 | 5.49e-05 |
-| 17.125 | 2.75e-05 |
-| 18.125 | 1.37e-05 |
-| 19.125 | 6.87e-06 |
-| 20.125 | 3.43e-06 |
-| 21.125 | 1.72e-06 |
-| 22.125 | 8.58e-07 |
-| 23.125 | 4.29e-07 |
-| 24.125 | 2.15e-07 |
-| 25.125 | 1.07e-07 |
-| 26.125 | 5.36e-08 |
-| 27.125 | 2.68e-08 |
-| 28.125 | 1.34e-08 |
-| 29.125 | 6.71e-09 |
-| 30.125 | 3.35e-09 |
-| 31.125 | 1.68e-09 |
-| 32.125 | 8.38e-10 |
+| Bits per item | Error probability when full | Bits per item (cont.) | Error probability when full (cont.) |
+|:---:|:---:|:---:|:---:|
+| 3.125 | 0.362 | 19.125 | 6.87e-06 |
+| 4.125 | 0.201 | 20.125 | 3.43e-06 |
+| 5.125 | 0.106 | 21.125 | 1.72e-06 |
+| 6.125 | 0.0547 | 22.125 | 8.58e-07 |
+| 7.125 | 0.0277 | 23.125 | 4.29e-07 |
+| 8.125 | 0.014 | 24.125 | 2.15e-07 |
+| 9.125 | 0.00701 | 25.125 | 1.07e-07 |
+| 10.125 | 0.00351 | 26.125 | 5.36e-08 |
+| 11.125 | 0.00176 | 27.125 | 2.68e-08 |
+| 12.125 | 0.000879 | 28.125 | 1.34e-08 |
+| 13.125 | 0.000439 | 29.125 | 6.71e-09 |
+| 14.125 | 0.00022 | 30.125 | 3.35e-09 |
+| 15.125 | 0.00011 | 31.125 | 1.68e-09 |
+| 16.125 | 5.49e-05 | 32.125 | 8.38e-10 |
+| 17.125 | 2.75e-05 | .. | .. |
+| 18.125 | 1.37e-05 | .. | .. |
 
 ### Not implemented
 
-- [ ] Merging
-- [ ] Shrink to fit
-- [ ] Counting
-- [ ] Smoother resizing by chaining exponentially larger and more precise filters
+- [ ] Fingerprint attached values
+- [ ] Counting with fingerprint values, not fingerprint duplication
+- [ ] More advanced growth strategies (InfiniFilter)
 
 ### Legacy x86_64 CPUs support
 

diff --git a/benches/benches.rs b/benches/benches.rs
@@ -6,11 +6,11 @@ use test::Bencher;
 
 #[bench]
 fn bench_new(b: &mut Bencher) {
-    b.iter(|| Filter::new(1000, 0.005));
+    b.iter(|| Filter::new(1000, 0.005).unwrap());
 }
 #[bench]
 fn bench_get_ok_medium(b: &mut Bencher) {
-    let mut f = Filter::new(100000, 0.01);
+    let mut f = Filter::new(100000, 0.01).unwrap();
     for i in 0..f.capacity() {
         f.insert_duplicated(&i).unwrap();
     }
@@ -23,7 +23,7 @@ fn bench_get_ok_medium(b: &mut Bencher) {
 
 #[bench]
 fn bench_get_nok_medium(b: &mut Bencher) {
-    let mut f = Filter::new(100000, 0.01);
+    let mut f = Filter::new(100000, 0.01).unwrap();
     for i in 0..f.capacity() {
         f.insert_duplicated(&i).unwrap();
     }
@@ -37,7 +37,7 @@ fn bench_get_nok_medium(b: &mut Bencher) {
 #[bench]
 fn bench_grow(b: &mut Bencher) {
     b.iter(|| {
-        let mut f = Filter::new(10000, 0.01);
+        let mut f = Filter::new(10000, 0.01).unwrap();
         for i in 0..f.capacity() {
             f.insert_duplicated(i).unwrap();
         }
@@ -47,7 +47,7 @@ fn bench_grow(b: &mut Bencher) {
 
 #[bench]
 fn bench_grow_from_90pct(b: &mut Bencher) {
-    let mut f = Filter::new(10000, 0.01);
+    let mut f = Filter::new(10000, 0.01).unwrap();
     for i in 0..f.capacity() / 10 * 9 {
         f.insert_duplicated(i).unwrap();
     }
@@ -63,7 +63,7 @@ fn bench_grow_from_90pct(b: &mut Bencher) {
 #[bench]
 fn bench_grow_resizeable(b: &mut Bencher) {
     b.iter(|| {
-        let mut f = Filter::new_resizeable(0, 10000, 0.01);
+        let mut f = Filter::new_resizeable(0, 10000, 0.01).unwrap();
         for i in 0u64.. {
             if f.insert_duplicated(i).is_err() {
                 break;
@@ -76,7 +76,7 @@ fn bench_grow_resizeable(b: &mut Bencher) {
 
 #[bench]
 fn bench_shrink(b: &mut Bencher) {
-    let mut f = Filter::new(10000, 0.01);
+    let mut f = Filter::new(10000, 0.01).unwrap();
     for i in 0..f.capacity() {
         let _ = f.insert(i);
     }

diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -23,3 +23,9 @@ name = "fuzz_qfilter"
 path = "fuzz_targets/fuzz_qfilter.rs"
 test = false
 doc = false
+
+[[bin]]
+name = "fuzz_fingerprint"
+path = "fuzz_targets/fuzz_fingerprint.rs"
+test = false
+doc = false
diff --git a/fuzz/fuzz_targets/fuzz_fingerprint.rs b/fuzz/fuzz_targets/fuzz_fingerprint.rs
@@ -0,0 +1,87 @@
+#![no_main]
+use libfuzzer_sys::arbitrary;
+use libfuzzer_sys::arbitrary::Arbitrary;
+use libfuzzer_sys::fuzz_target;
+
+const FUZZ_REMOVES: bool = true;
+const CHECK_EVERY: usize = 8;
+const CHECK_SHRUNK: bool = true;
+
+#[derive(Debug, Arbitrary)]
+struct Input {
+    cap: u16,
+    fp_size: u8,
+    ops: Vec<(bool, u16)>,
+}
+
+fuzz_target!(|input: Input| {
+    let Input { cap, ops, fp_size } = input;
+    // The "Model", tracks the count for each item
+    let mut counts = [0u64; (u16::MAX as usize) + 1];
+    let Ok(mut f) = qfilter::Filter::with_fingerprint_size(cap as u64, fp_size.clamp(7, 64)) else {
+        return;
+    };
+    for i in 0..ops.len() {
+        // print_sample(&counts);
+        // dbg!(ops[i]);
+
+        let (add, item) = ops[i];
+        let item = item as u64;
+        if !FUZZ_REMOVES || add {
+            if f.insert_fingerprint(true, item).is_err() {
+                continue;
+            }
+            counts[item as usize] += 1;
+        } else if counts[item as usize] != 0 && f.remove_fingerprint(item) {
+            counts[item as usize] -= 1;
+        } else {
+            continue;
+        }
+
+        if i % CHECK_EVERY == 0 {
+            for &(_add, e) in &ops[..=i] {
+                let min = counts[e as usize];
+                // Since we can only check for >= due to collisions skip min = 0
+                if min != 0 {
+                    let est = f.count_fingerprint(e as u64);
+                    assert!(est >= min, "{e}: est {est} < min {min}");
+                }
+            }
+        }
+    }
+
+    for shrunk in [false, true] {
+        for &(_add, e) in &ops {
+            let min = counts[e as usize];
+            let est = f.count_fingerprint(e as u64);
+            assert!(est >= min, "{e}: est {est} < min {min} shrunk {shrunk:?}");
+        }
+        let prints = f.fingerprints().collect::<Vec<_>>();
+        let mut expected_prints = counts
+            .iter()
+            .enumerate()
+            .flat_map(|(i, n)| {
+                let t = (i as u64) << (64 - f.fingerprint_size()) >> (64 - f.fingerprint_size());
+                std::iter::repeat(t).take(*n as usize)
+            })
+            .collect::<Vec<_>>();
+        expected_prints.sort_unstable();
+        assert_eq!(prints.len(), f.len() as usize);
+        assert_eq!(prints, expected_prints);
+        if !CHECK_SHRUNK {
+            break;
+        }
+        f.shrink_to_fit();
+    }
+});
+
+#[allow(dead_code)]
+fn print_sample(counts: &[u64]) {
+    print!("[");
+    for (i, c) in counts.iter().copied().enumerate() {
+        if c != 0 {
+            print!("({i}u16, {c}), ");
+        }
+    }
+    println!("]");
+}
diff --git a/fuzz/fuzz_targets/fuzz_qfilter.rs b/fuzz/fuzz_targets/fuzz_qfilter.rs
@@ -1,60 +1,71 @@
 #![no_main]
+use libfuzzer_sys::arbitrary;
+use libfuzzer_sys::arbitrary::Arbitrary;
 use libfuzzer_sys::fuzz_target;
 
 const FUZZ_REMOVES: bool = true;
 const CHECK_EVERY: usize = 8;
+const CHECK_SHRUNK: bool = true;
 
-fuzz_target!(|data: Vec<i16>| {
-    if data.len() < 2 {
-        return;
-    }
-    let cap = (data[0] as u64).min(data.len() as u64 / 2);
-    let fp = (0.1f64).powi(data[1].leading_ones() as i32).min(0.1);
-    let ops = data
-        .into_iter()
-        .map(|i| {
-            if i < 0 && FUZZ_REMOVES {
-                (false, i.checked_neg().unwrap_or(0) as u16)
-            } else {
-                (true, i as u16)
-            }
-        })
-        .collect::<Vec<(bool, u16)>>();
+#[derive(Debug, Arbitrary)]
+struct Input {
+    cap: u16,
+    max_cap: u16,
+    fp_exp: u16,
+    ops: Vec<(bool, u16)>,
+}
+
+fuzz_target!(|input: Input| {
+    let Input {
+        cap,
+        max_cap,
+        fp_exp,
+        ops,
+    } = input;
+    let max_cap = max_cap.max(cap) as u64;
+    let cap = cap as u64;
+    let fp = 2f64.powi(-(fp_exp.leading_ones() as i32));
     // The "Model", tracks the count for each item
     let mut counts = [0u64; (u16::MAX as usize) + 1];
-    let mut f = qfilter::Filter::new(cap, fp);
+    let mut f = qfilter::Filter::new_resizeable(cap, max_cap, fp).unwrap();
     for i in 0..ops.len() {
         // print_sample(&counts);
         // dbg!(ops[i]);
 
         let (add, item) = ops[i];
-        if add {
+        if !FUZZ_REMOVES || add {
             if f.insert_duplicated(item).is_err() {
                 continue;
             }
             counts[item as usize] += 1;
+        } else if counts[item as usize] != 0 && f.remove(item) {
+            counts[item as usize] -= 1;
         } else {
-            if counts[item as usize] != 0 && f.remove(item) {
-                counts[item as usize] -= 1;
-            } else {
-                continue;
-            }
+            continue;
         }
+
         if i % CHECK_EVERY == 0 {
             for &(_add, e) in &ops[..=i] {
                 let min = counts[e as usize];
                 // Since we can only check for >= due to collisions skip min = 0
                 if min != 0 {
                     let est = f.count(e);
-                    assert!(est >= min, "{}: est {} min {}", e, est, min);
+                    assert!(est >= min, "{e}: est {est} < min {min}");
                 }
             }
         }
     }
-    for &(_add, e) in &ops {
-        let min = counts[e as usize];
-        let est = f.count(e);
-        assert!(est >= min, "{}: est {} min {}", e, est, min);
+
+    for shrunk in [false, true] {
+        for &(_add, e) in &ops {
+            let min = counts[e as usize];
+            let est = f.count(e);
+            assert!(est >= min, "{e}: est {est} < min {min} shrunk {shrunk:?}");
+        }
+        if !CHECK_SHRUNK {
+            break;
+        }
+        f.shrink_to_fit();
     }
 });