Bump github/codeql-action from 4.36.1 to 4.36.2

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.36.1 to 4.36.2. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](87557b9c84...8aad20d150) --- updated-dependencies: - dependency-name: github/codeql-action dependency-version: 4.36.2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>
Bump actions/checkout from 6.0.2 to 6.0.3 (#2949 )
2026-06-09 20:10:42 +00:00 · 2026-06-08 20:04:11 +00:00 · 2026-06-08 10:55:54 +02:00 · 2026-06-08 10:49:04 +02:00 · 2026-06-08 10:48:48 +02:00 · 2026-06-08 10:48:32 +02:00
17 changed files with 752 additions and 149 deletions
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -20,7 +20,7 @@ jobs:
      contents: read

    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
      - name: Install Rust
        run: rustup toolchain install nightly-2025-12-01 --profile minimal --component llvm-tools-preview
      - uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
--- a/.github/workflows/long_running.yml
+++ b/.github/workflows/long_running.yml
@@ -25,7 +25,7 @@ jobs:
      contents: read

    steps:
-    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+    - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
    - name: Install stable
      uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
      with:
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -22,7 +22,7 @@ jobs:

    steps:
      - name: 'Checkout code'
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
        with:
          persist-credentials: false

@@ -36,7 +36,7 @@ jobs:

      # Upload the results as artifacts.
      - name: 'Upload artifact'
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: SARIF file
          path: results.sarif
@@ -44,6 +44,6 @@ jobs:

      # Upload the results to GitHub's code scanning dashboard.
      - name: 'Upload to code-scanning'
-        uses: github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
+        uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
        with:
          sarif_file: results.sarif
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -27,7 +27,7 @@ jobs:
      checks: write

    steps:
-    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+    - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3

    - name: Install nightly
      uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
@@ -77,7 +77,7 @@ jobs:
    name: test-${{ matrix.features.label}}

    steps:
-    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+    - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3

    - name: Install stable
      uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2021"
-rust-version = "1.92"
+rust-version = "1.86"
 exclude = ["benches/*.json", "benches/*.txt"]

 [dependencies]
--- a/bitpacker/Cargo.toml
+++ b/bitpacker/Cargo.toml
@@ -18,5 +18,10 @@ homepage = "https://github.com/quickwit-oss/tantivy"
 bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker1x"] }

 [dev-dependencies]
+binggan = "0.17.0"
 rand = "0.9"
 proptest = "1"
+
+[[bench]]
+name = "bench"
+harness = false
--- a/bitpacker/benches/bench.rs
+++ b/bitpacker/benches/bench.rs
@@ -1,65 +1,110 @@
-#![feature(test)]
+use std::cell::RefCell;

-extern crate test;
+use binggan::{BenchRunner, black_box};
+use rand::rng;
+use rand::seq::IteratorRandom;
+use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};

-#[cfg(test)]
-mod tests {
-    use rand::rng;
-    use rand::seq::IteratorRandom;
-    use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};
-    use test::Bencher;
+fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
+    let mut bitpacker = BitPacker::new();
+    let mut buffer = Vec::new();
+    for _ in 0..num_els {
+        bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
+        bitpacker.flush(&mut buffer).unwrap();
+    }
+    buffer
+}

-    #[inline(never)]
-    fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
-        let mut bitpacker = BitPacker::new();
-        let mut buffer = Vec::new();
-        for _ in 0..num_els {
-            // the values do not matter.
-            bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
-            bitpacker.flush(&mut buffer).unwrap();
+const N: usize = 100_000;
+const MAX_VAL: u64 = 1_000;
+const BIT_WIDTH: u8 = 10; // 2^10 = 1024 > MAX_VAL
+
+fn create_packed_data() -> (BitUnpacker, Vec<u8>) {
+    let mut bitpacker = BitPacker::new();
+    let mut data = Vec::new();
+    for i in 0..N as u64 {
+        let val = i * MAX_VAL / N as u64;
+        bitpacker.write(val, BIT_WIDTH, &mut data).unwrap();
+    }
+    bitpacker.close(&mut data).unwrap();
+    (BitUnpacker::new(BIT_WIDTH), data)
+}
+
+fn bench_bitpacking() {
+    let mut runner = BenchRunner::new();
+    let bit_width = 3;
+    let num_els = 1_000_000u32;
+    let bit_unpacker = BitUnpacker::new(bit_width);
+    let data = create_bitpacked_data(bit_width, num_els);
+    let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut rng(), 100_000);
+    runner.bench_function("bitpacking_read", move |_| {
+        let mut out = 0u64;
+        for &idx in &idxs {
+            out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
        }
-        buffer
-    }
+        black_box(out);
+    });
+}

-    #[bench]
-    fn bench_bitpacking_read(b: &mut Bencher) {
-        let bit_width = 3;
-        let num_els = 1_000_000u32;
-        let bit_unpacker = BitUnpacker::new(bit_width);
-        let data = create_bitpacked_data(bit_width, num_els);
-        let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut rng(), 100_000);
-        b.iter(|| {
-            let mut out = 0u64;
-            for &idx in &idxs {
-                out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
-            }
-            out
-        });
+fn bench_blocked_bitpacker() {
+    let mut runner = BenchRunner::new();
+    let mut blocked_bitpacker = BlockedBitpacker::new();
+    for val in 0..=21500 {
+        blocked_bitpacker.add(val * val);
    }
-
-    #[bench]
-    fn bench_blockedbitp_read(b: &mut Bencher) {
+    runner.bench_function("blockedbitp_read", move |_| {
+        let mut out = 0u64;
+        for val in 0..=21500 {
+            out = out.wrapping_add(blocked_bitpacker.get(val));
+        }
+        black_box(out);
+    });
+    runner.bench_function("blockedbitp_create", |_| {
        let mut blocked_bitpacker = BlockedBitpacker::new();
        for val in 0..=21500 {
            blocked_bitpacker.add(val * val);
        }
-        b.iter(|| {
-            let mut out = 0u64;
-            for val in 0..=21500 {
-                out = out.wrapping_add(blocked_bitpacker.get(val));
-            }
-            out
-        });
-    }
-
-    #[bench]
-    fn bench_blockedbitp_create(b: &mut Bencher) {
-        b.iter(|| {
-            let mut blocked_bitpacker = BlockedBitpacker::new();
-            for val in 0..=21500 {
-                blocked_bitpacker.add(val * val);
-            }
-            blocked_bitpacker
-        });
-    }
+        black_box(blocked_bitpacker);
+    });
+}
+
+fn bench_filter_vec() {
+    let mut runner = BenchRunner::new();
+
+    let (unpacker, data) = create_packed_data();
+    let positions = RefCell::new(Vec::with_capacity(N));
+    runner.bench_function("filter_vec_dense", move |_| {
+        unpacker.get_ids_for_value_range(
+            250..=750,
+            0..N as u32,
+            &data,
+            &mut positions.borrow_mut(),
+        );
+        black_box(positions.borrow().len());
+    });
+
+    let (unpacker, data) = create_packed_data();
+    let positions = RefCell::new(Vec::with_capacity(N));
+    runner.bench_function("filter_vec_sparse", move |_| {
+        unpacker.get_ids_for_value_range(0..=50, 0..N as u32, &data, &mut positions.borrow_mut());
+        black_box(positions.borrow().len());
+    });
+
+    let (unpacker, data) = create_packed_data();
+    let positions = RefCell::new(Vec::with_capacity(N));
+    runner.bench_function("filter_vec_full", move |_| {
+        unpacker.get_ids_for_value_range(
+            0..=MAX_VAL,
+            0..N as u32,
+            &data,
+            &mut positions.borrow_mut(),
+        );
+        black_box(positions.borrow().len());
+    });
+}
+
+fn main() {
+    bench_bitpacking();
+    bench_blocked_bitpacker();
+    bench_filter_vec();
 }
--- a/bitpacker/src/filter_vec/mod.rs
+++ b/bitpacker/src/filter_vec/mod.rs
@@ -1,8 +1,17 @@
+#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+use std::arch::is_aarch64_feature_detected;
 use std::ops::RangeInclusive;

 #[cfg(target_arch = "x86_64")]
 mod avx2;

+#[cfg(target_arch = "aarch64")]
+mod neon;
+
+// SVE intrinsics are not exposed on aarch64-apple-darwin.
+#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+mod sve;
+
 mod scalar;

 #[derive(Clone, Copy, Eq, PartialEq, Debug)]
@@ -10,6 +19,10 @@ mod scalar;
 enum FilterImplPerInstructionSet {
    #[cfg(target_arch = "x86_64")]
    AVX2 = 0u8,
+    #[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+    SVE = 3u8,
+    #[cfg(target_arch = "aarch64")]
+    Neon = 2u8,
    Scalar = 1u8,
 }

@@ -19,29 +32,57 @@ impl FilterImplPerInstructionSet {
        match *self {
            #[cfg(target_arch = "x86_64")]
            FilterImplPerInstructionSet::AVX2 => is_x86_feature_detected!("avx2"),
+            #[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+            FilterImplPerInstructionSet::SVE => is_aarch64_feature_detected!("sve"),
+            // TIL Neon is required on aarch 64.
+            #[cfg(target_arch = "aarch64")]
+            FilterImplPerInstructionSet::Neon => true,
            FilterImplPerInstructionSet::Scalar => true,
        }
    }
 }

-// List of available implementation in preferred order.
+// List of available implementations in preferred order.
 #[cfg(target_arch = "x86_64")]
 const IMPLS: [FilterImplPerInstructionSet; 2] = [
    FilterImplPerInstructionSet::AVX2,
    FilterImplPerInstructionSet::Scalar,
 ];

-#[cfg(not(target_arch = "x86_64"))]
+// Non-Apple aarch64: try SVE, NEON, Scalar.
+#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+const IMPLS: [FilterImplPerInstructionSet; 3] = [
+    FilterImplPerInstructionSet::SVE,
+    FilterImplPerInstructionSet::Neon,
+    FilterImplPerInstructionSet::Scalar,
+];
+
+// Apple aarch64 (M-series): SVE not available; use NEON or Scalar.
+#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
+const IMPLS: [FilterImplPerInstructionSet; 2] = [
+    FilterImplPerInstructionSet::Neon,
+    FilterImplPerInstructionSet::Scalar,
+];
+
+#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
 const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];

 impl FilterImplPerInstructionSet {
    #[inline]
-    #[allow(unused_variables)] // on non-x86_64, code is unused.
+    #[allow(unused_variables)]
    fn from(code: u8) -> FilterImplPerInstructionSet {
        #[cfg(target_arch = "x86_64")]
        if code == FilterImplPerInstructionSet::AVX2 as u8 {
            return FilterImplPerInstructionSet::AVX2;
        }
+        #[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+        if code == FilterImplPerInstructionSet::SVE as u8 {
+            return FilterImplPerInstructionSet::SVE;
+        }
+        #[cfg(target_arch = "aarch64")]
+        if code == FilterImplPerInstructionSet::Neon as u8 {
+            return FilterImplPerInstructionSet::Neon;
+        }
        FilterImplPerInstructionSet::Scalar
    }

@@ -50,6 +91,13 @@ impl FilterImplPerInstructionSet {
        match self {
            #[cfg(target_arch = "x86_64")]
            FilterImplPerInstructionSet::AVX2 => avx2::filter_vec_in_place(range, offset, output),
+            #[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+            // SAFETY: SVE availability was verified by is_available() before selecting this impl.
+            FilterImplPerInstructionSet::SVE => unsafe {
+                sve::filter_vec_in_place(range, offset, output)
+            },
+            #[cfg(target_arch = "aarch64")]
+            FilterImplPerInstructionSet::Neon => neon::filter_vec_in_place(range, offset, output),
            FilterImplPerInstructionSet::Scalar => {
                scalar::filter_vec_in_place(range, offset, output)
            }
@@ -57,6 +105,12 @@ impl FilterImplPerInstructionSet {
    }
 }

+fn available_impls() -> impl Iterator<Item = FilterImplPerInstructionSet> {
+    IMPLS
+        .into_iter()
+        .filter(FilterImplPerInstructionSet::is_available)
+}
+
 #[inline]
 fn get_best_available_instruction_set() -> FilterImplPerInstructionSet {
    use std::sync::atomic::{AtomicU8, Ordering};
@@ -64,10 +118,7 @@ fn get_best_available_instruction_set() -> FilterImplPerInstructionSet {
    let instruction_set_byte: u8 = INSTRUCTION_SET_BYTE.load(Ordering::Relaxed);
    if instruction_set_byte == u8::MAX {
        // Let's initialize the instruction set and cache it.
-        let instruction_set = IMPLS
-            .into_iter()
-            .find(FilterImplPerInstructionSet::is_available)
-            .unwrap();
+        let instruction_set = available_impls().next().unwrap();
        INSTRUCTION_SET_BYTE.store(instruction_set as u8, Ordering::Relaxed);
        return instruction_set;
    }
@@ -80,12 +131,12 @@ pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut

 #[cfg(test)]
 mod tests {
+    use proptest::strategy::Strategy;
+
    use super::*;

    #[test]
    fn test_get_best_available_instruction_set() {
-        // This does not test much unfortunately.
-        // We just make sure the function returns without crashing and returns the same result.
        let instruction_set = get_best_available_instruction_set();
        assert_eq!(get_best_available_instruction_set(), instruction_set);
    }
@@ -102,6 +153,31 @@ mod tests {
        }
    }

+    #[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+    #[test]
+    fn test_instruction_set_to_code_from_code() {
+        for instruction_set in [
+            FilterImplPerInstructionSet::SVE,
+            FilterImplPerInstructionSet::Neon,
+            FilterImplPerInstructionSet::Scalar,
+        ] {
+            let code = instruction_set as u8;
+            assert_eq!(instruction_set, FilterImplPerInstructionSet::from(code));
+        }
+    }
+
+    #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
+    #[test]
+    fn test_instruction_set_to_code_from_code() {
+        for instruction_set in [
+            FilterImplPerInstructionSet::Neon,
+            FilterImplPerInstructionSet::Scalar,
+        ] {
+            let code = instruction_set as u8;
+            assert_eq!(instruction_set, FilterImplPerInstructionSet::from(code));
+        }
+    }
+
    fn test_filter_impl_empty_aux(filter_impl: FilterImplPerInstructionSet) {
        let mut output = vec![];
        filter_impl.filter_vec_in_place(0..=u32::MAX, 0, &mut output);
@@ -126,11 +202,20 @@ mod tests {
        assert_eq!(&output, &[1, 3, 4, 5, 6, 7, 8]);
    }

+    fn test_filter_impl_empty_range_aux(filter_impl: FilterImplPerInstructionSet) {
+        // start > end: RangeInclusive::contains always returns false; output must be empty.
+        // The SVE path's wrapping_sub would otherwise produce a huge range_width.
+        let mut output = vec![3, 2, 1, 5, 11, 2, 5, 10, 2];
+        filter_impl.filter_vec_in_place(10..=5, 0, &mut output);
+        assert_eq!(&output, &[]);
+    }
+
    fn test_filter_impl_test_suite(filter_impl: FilterImplPerInstructionSet) {
        test_filter_impl_empty_aux(filter_impl);
        test_filter_impl_simple_aux(filter_impl);
        test_filter_impl_simple_aux_shifted(filter_impl);
        test_filter_impl_simple_outside_i32_range(filter_impl);
+        test_filter_impl_empty_range_aux(filter_impl);
    }

    #[test]
@@ -141,25 +226,60 @@ mod tests {
        }
    }

+    #[test]
+    #[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
+    fn test_filter_implementation_sve() {
+        if FilterImplPerInstructionSet::SVE.is_available() {
+            test_filter_impl_test_suite(FilterImplPerInstructionSet::SVE);
+        }
+    }
+
+    #[test]
+    #[cfg(target_arch = "aarch64")]
+    fn test_filter_implementation_neon() {
+        test_filter_impl_test_suite(FilterImplPerInstructionSet::Neon);
+    }
+
    #[test]
    fn test_filter_implementation_scalar() {
        test_filter_impl_test_suite(FilterImplPerInstructionSet::Scalar);
    }

-    #[cfg(target_arch = "x86_64")]
+    fn max_val_strategy() -> impl proptest::strategy::Strategy<Value = u32> {
+        proptest::prop_oneof![
+            0u32..10u32,
+            255u32..258u32,
+            proptest::prelude::Just(1u32 << 25),
+            proptest::prelude::Just(u32::MAX - 1),
+            proptest::prelude::Just(u32::MAX),
+        ]
+    }
+
+    fn vals_strategy() -> impl proptest::strategy::Strategy<Value = Vec<u32>> {
+        proptest::prop_oneof![
+            proptest::collection::vec(proptest::prelude::any::<u32>(), 0..300),
+            max_val_strategy()
+                .prop_flat_map(|max_val| { proptest::collection::vec(0..=max_val, 0..300) })
+        ]
+    }
+
    proptest::proptest! {
        #[test]
-        fn test_filter_compare_scalar_and_avx2_impl_proptest(
-            start in proptest::prelude::any::<u32>(),
-            end in proptest::prelude::any::<u32>(),
+        fn test_filter_compare_scalar_and_impls_impl_proptest(
+            start in 0u32..400u32,
+            end in 0u32..400u32,
            offset in 0u32..2u32,
-            mut vals in proptest::collection::vec(0..u32::MAX, 0..30)) {
-            if FilterImplPerInstructionSet::AVX2.is_available() {
-                let mut vals_clone = vals.clone();
-                FilterImplPerInstructionSet::AVX2.filter_vec_in_place(start..=end, offset, &mut vals);
-                FilterImplPerInstructionSet::Scalar.filter_vec_in_place(start..=end, offset, &mut vals_clone);
-                assert_eq!(&vals, &vals_clone);
-            }
+            vals in vals_strategy()) {
+                for implementation in available_impls() {
+                    if implementation == FilterImplPerInstructionSet::Scalar {
+                        continue;
+                    }
+                    let mut impl_output = vals.clone();
+                    let mut scalar_output = vals.clone();
+                    implementation.filter_vec_in_place(start..=end, offset, &mut impl_output);
+                    FilterImplPerInstructionSet::Scalar.filter_vec_in_place(start..=end, offset, &mut scalar_output);
+                    assert_eq!(&impl_output, &scalar_output);
+                }
       }
    }
 }
--- a/bitpacker/src/filter_vec/neon.rs
+++ b/bitpacker/src/filter_vec/neon.rs
@@ -0,0 +1,118 @@
+use std::arch::aarch64::*;
+use std::ops::RangeInclusive;
+
+const NUM_LANES: usize = 4;
+
+// Compacts matching lanes to the front using a byte-level shuffle.
+// `mask` is a 4-bit value: bit k=1 means lane k should appear in the output.
+#[inline]
+#[target_feature(enable = "neon")]
+unsafe fn compact(data: uint32x4_t, mask: u8) -> uint32x4_t {
+    unsafe {
+        // SAFETY: mask is always in [0, 15] by construction (max sum of [1,2,4,8]).
+        // BYTE_SHUFFLE_TABLE has 16 entries, so this is always in bounds.
+        let shuffle = BYTE_SHUFFLE_TABLE.get_unchecked(mask as usize);
+        let shuffle_vec = vld1q_u8(shuffle.as_ptr());
+        vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(data), shuffle_vec))
+    }
+}
+
+// Safe (not unsafe) because NEON is mandatory on aarch64: no runtime feature check needed.
+#[inline(never)]
+pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
+    let num_words = output.len() / NUM_LANES;
+    let mut output_len = unsafe {
+        filter_vec_neon_aux(
+            output.as_ptr(),
+            range.clone(),
+            output.as_mut_ptr(),
+            offset,
+            num_words,
+        )
+    };
+    let remainder_start = num_words * NUM_LANES;
+    for i in remainder_start..output.len() {
+        let val = output[i];
+        output[output_len] = offset + i as u32;
+        output_len += if range.contains(&val) { 1 } else { 0 };
+    }
+    output.truncate(output_len);
+}
+
+#[target_feature(enable = "neon")]
+unsafe fn filter_vec_neon_aux(
+    input: *const u32,
+    range: RangeInclusive<u32>,
+    output: *mut u32,
+    offset: u32,
+    num_words: usize,
+) -> usize {
+    unsafe {
+        let mut input = input;
+        let mut output_tail = output;
+        let range_start_simd = vdupq_n_u32(*range.start());
+        let range_end_simd = vdupq_n_u32(*range.end());
+        let mut ids = vld1q_u32([offset, offset + 1, offset + 2, offset + 3].as_ptr());
+        let shift = vdupq_n_u32(NUM_LANES as u32);
+        let bit_weights = vld1q_u32([1u32, 2, 4, 8].as_ptr());
+
+        for _ in 0..num_words {
+            let word = vld1q_u32(input);
+
+            // Unsigned compares: CMHS (compare higher or same) tests `word >= start`
+            // and `end >= word`. ANDing both gives the inside-range mask directly,
+            // which is cheaper than computing `outside` and then negating.
+            let ge_start = vcgeq_u32(word, range_start_simd);
+            let le_end = vcleq_u32(word, range_end_simd);
+            // inside[k] = 0xFFFFFFFF if val[k] is in range, 0 otherwise.
+            let inside = vandq_u32(ge_start, le_end);
+
+            // Build the 4-bit mask: AND bit_weights with the inside lane mask, so each
+            // inside lane contributes its bit_weight (1, 2, 4, or 8). Summing yields the
+            // 4-bit mask in one addv.
+            let inside_bits = vandq_u32(bit_weights, inside);
+            let mask = vaddvq_u32(inside_bits) as u8;
+            // mask is mathematically bounded: max value is 1+2+4+8=15 (all lanes match)
+            debug_assert!(mask <= 15, "mask must fit in 4 bits: {}", mask);
+
+            // Count of matching lanes = popcount(mask). Derives the count directly from
+            // the mask instead of running a parallel SIMD reduction over `outside`.
+            let added_len = mask.count_ones() as usize;
+
+            // Safe because mask is guaranteed to be in [0, 15]
+            let filtered_ids = compact(ids, mask);
+            vst1q_u32(output_tail, filtered_ids);
+            output_tail = output_tail.add(added_len);
+            ids = vaddq_u32(ids, shift);
+            input = input.add(NUM_LANES);
+        }
+
+        output_tail.offset_from(output) as usize
+    }
+}
+
+// Byte shuffle patterns to compact matching lanes to the front of the vector.
+// Index is a 4-bit mask: bit k=1 means lane k (bytes 4k..4k+3) is in-range.
+// The j-th set bit determines which input lane goes to output position j.
+const BYTE_SHUFFLE_TABLE: [[u8; 16]; 16] = [
+    [
+        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    ], // 0b0000: none
+    [0, 1, 2, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0001: lane 0
+    [4, 5, 6, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0010: lane 1
+    [0, 1, 2, 3, 4, 5, 6, 7, 16, 16, 16, 16, 16, 16, 16, 16],     // 0b0011: lanes 0,1
+    [8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0100: lane 2
+    [0, 1, 2, 3, 8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16],   // 0b0101: lanes 0,2
+    [4, 5, 6, 7, 8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16],   // 0b0110: lanes 1,2
+    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 16, 16, 16],       // 0b0111: lanes 0,1,2
+    [
+        12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    ], // 0b1000: lane 3
+    [0, 1, 2, 3, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1001: lanes 0,3
+    [4, 5, 6, 7, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1010: lanes 1,3
+    [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 16, 16, 16, 16],     // 0b1011: lanes 0,1,3
+    [8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1100: lanes 2,3
+    [0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16],   // 0b1101: lanes 0,2,3
+    [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16],   // 0b1110: lanes 1,2,3
+    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],       // 0b1111: all lanes
+];
--- a/bitpacker/src/filter_vec/sve.rs
+++ b/bitpacker/src/filter_vec/sve.rs
@@ -0,0 +1,260 @@
+use std::ops::RangeInclusive;
+
+// SVE vector length (in u32 lanes) is not a compile-time constant; query at runtime.
+// Safe to call only when SVE is confirmed available via is_aarch64_feature_detected!("sve").
+#[target_feature(enable = "sve")]
+unsafe fn num_lanes() -> usize {
+    let vl: usize;
+    unsafe {
+        core::arch::asm!(
+            "cntw {vl}",
+            vl = out(reg) vl,
+            options(nostack, nomem, preserves_flags),
+        );
+    }
+    vl
+}
+
+// SAFETY: caller must ensure SVE is available (checked via is_aarch64_feature_detected!("sve")).
+// Unlike NEON, SVE is optional on aarch64 and not guaranteed by the target architecture.
+pub unsafe fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
+    if range.start() > range.end() {
+        output.clear();
+        return;
+    }
+    let vl = unsafe { num_lanes() };
+    let num_words = output.len() / vl;
+    let range_start = *range.start();
+    // Unsigned subtraction trick: val ∈ [lo, hi] ↔ (val - lo) ≤ᵤ (hi - lo).
+    // Values below lo wrap around to large u32, so the single unsigned ≤ excludes them.
+    let range_width = range.end().wrapping_sub(range_start);
+    let mut output_len = unsafe {
+        filter_vec_sve_aux(
+            output.as_ptr(),
+            range_start,
+            range_width,
+            output.as_mut_ptr(),
+            offset,
+            num_words,
+            vl,
+        )
+    };
+    let remainder_start = num_words * vl;
+    for i in remainder_start..output.len() {
+        let val = output[i];
+        output[output_len] = offset + i as u32;
+        output_len += if range.contains(&val) { 1 } else { 0 };
+    }
+    output.truncate(output_len);
+}
+
+// Register allocation for the asm! blocks:
+//   z0        ids_a (index vector for first half of each pair, advances by step2 each iter)
+//   z1        range_width broadcast
+//   z2        range_start broadcast
+//   z3        step2 broadcast (2 * vl)
+//   z4        ids_b (index vector for second half, = ids_a + step, advances by step2)
+//   z5        scratch: loaded word_a, then compacted_a
+//   z6        scratch: loaded word_b, then compacted_b
+//   p0        all-true predicate (ptrue p0.s)
+//   p1        in-range mask for word_a
+//   p2        in-range mask for word_b
+#[target_feature(enable = "sve")]
+unsafe fn filter_vec_sve_aux(
+    input: *const u32,
+    range_start: u32,
+    range_width: u32,
+    output: *mut u32,
+    offset: u32,
+    num_words: usize,
+    vl: usize,
+) -> usize {
+    let num_pairs = num_words / 2;
+    let mut input_ptr = input;
+    let mut output_tail = output;
+
+    if num_pairs > 0 {
+        unsafe {
+            // We rely on asm! because the SVE intrinsics are not available in stable Rust.
+            // The code that follows was generated by Rustc nightly based on the intrinsics version
+            // at the bottom of this file.
+            core::arch::asm!(
+                // --- Setup ---
+                // All-true predicate for 32-bit lanes.
+                "ptrue p0.s",
+                // ids_a = [offset, offset+1, offset+2, ...]
+                "index z0.s, {offset:w}, #1",
+                // Broadcast scalars into SVE vectors.
+                "mov z1.s, {range_width:w}",
+                "mov z2.s, {range_start:w}",
+                // vl_gpr = number of 32-bit lanes (cntw).
+                "cntw {vl_gpr}",
+                // step2_bytes will first hold 2*vl (for the step2 vector), then 2*VL in bytes.
+                "lsl {step2_bytes}, {vl_gpr}, #1",
+                // z4 = step = [vl, vl, ...]; will become ids_b after the add below.
+                "mov z4.s, {vl_gpr:w}",
+                // z3 = step2 = [2*vl, 2*vl, ...], used to advance both id vectors each iter.
+                "mov z3.s, {step2_bytes:w}",
+                // Repurpose step2_bytes to hold the byte stride for advancing the input pointer
+                // by two full SVE vectors per iteration.
+                "rdvl {step2_bytes}, #2",
+                // ids_b = ids_a + step = [offset+vl, offset+vl+1, ...]
+                "add z4.s, z0.s, z4.s",
+
+                // --- Main loop: process two SVE vectors (ids_a and ids_b) per iteration ---
+                "0:",
+                // Load two consecutive SVE vectors from input.
+                "ld1w {{z5.s}}, p0/z, [{input}]",
+                "ld1w {{z6.s}}, p0/z, [{input}, #1, mul vl]",
+                // Advance input pointer by 2 * VL bytes.
+                "add {input}, {input}, {step2_bytes}",
+                // Unsigned shift: subtract range_start so in-range check becomes a single cmpu ≤.
+                "sub z5.s, z5.s, z2.s",
+                "sub z6.s, z6.s, z2.s",
+                // in_range: shifted value ≤ range_width  (unsigned, so values below lo also fail).
+                "cmphs p1.s, p0/z, z1.s, z5.s",
+                "cmphs p2.s, p0/z, z1.s, z6.s",
+                // Count matching lanes; both cntp calls have independent inputs for OOO parallelism.
+                "cntp {cnt_a}, p0, p1.s",
+                "compact z5.s, p1, z0.s",
+                "compact z6.s, p2, z4.s",
+                "cntp {cnt_b}, p0, p2.s",
+                // Advance id vectors for the next iteration.
+                "add z0.s, z0.s, z3.s",
+                "add z4.s, z4.s, z3.s",
+                // Store compacted ids. Only the first cnt_a / cnt_b slots are valid; the rest
+                // will be overwritten by subsequent iterations before the final truncate.
+                "str z5, [{out}]",
+                "st1w {{z6.s}}, p0, [{out}, {cnt_a}, lsl #2]",
+                "add {out}, {out}, {cnt_a}, lsl #2",
+                "add {out}, {out}, {cnt_b}, lsl #2",
+                "subs {pairs}, {pairs}, #1",
+                "b.ne 0b",
+
+                // --- Operands ---
+                input       = inout(reg) input_ptr,
+                out         = inout(reg) output_tail,
+                pairs       = inout(reg) num_pairs => _,
+                offset      = in(reg) offset,
+                range_start = in(reg) range_start,
+                range_width = in(reg) range_width,
+                vl_gpr      = out(reg) _,
+                step2_bytes = out(reg) _,
+                cnt_a       = out(reg) _,
+                cnt_b       = out(reg) _,
+                out("p0") _, out("p1") _, out("p2") _,
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+                out("v4") _, out("v5") _, out("v6") _,
+                options(nostack),
+            );
+        }
+    }
+
+    // Handle an odd trailing vector.
+    if num_words % 2 == 1 {
+        // ids_a for the odd word starts at offset + num_pairs * 2 * vl.
+        // input_ptr was advanced by the main loop and now points at the odd word.
+        let odd_offset =
+            offset.wrapping_add((num_pairs as u32).wrapping_mul(2).wrapping_mul(vl as u32));
+        unsafe {
+            core::arch::asm!(
+                "ptrue p0.s",
+                "index z0.s, {odd_offset:w}, #1",
+                "mov z1.s, {range_width:w}",
+                "mov z2.s, {range_start:w}",
+                "ld1w {{z3.s}}, p0/z, [{input}]",
+                "sub z3.s, z3.s, z2.s",
+                "cmphs p1.s, p0/z, z1.s, z3.s",
+                "cntp {cnt}, p0, p1.s",
+                "compact z0.s, p1, z0.s",
+                "str z0, [{out}]",
+                "add {out}, {out}, {cnt}, lsl #2",
+                odd_offset  = in(reg) odd_offset,
+                range_width = in(reg) range_width,
+                range_start = in(reg) range_start,
+                input       = in(reg) input_ptr,
+                out         = inout(reg) output_tail,
+                cnt         = out(reg) _,
+                out("p0") _, out("p1") _,
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+                options(nostack),
+            );
+        }
+    }
+
+    unsafe { output_tail.offset_from(output) as usize }
+}
+
+// SVE implements with intrinsics.
+//
+// #[target_feature(enable = "sve")]
+// unsafe fn filter_vec_sve_aux(
+//     input: *const u32,
+//     range_start: u32,
+//     range_width: u32,
+//     output: *mut u32,
+//     offset: u32,
+//     num_words: usize,
+//     vl: usize,
+// ) -> usize {
+//     unsafe {
+//         let all_true = svptrue_b32();
+//         let range_start_simd = svdup_n_u32(range_start);
+//         let range_width_simd = svdup_n_u32(range_width);
+//         // ids_a covers [offset .. offset+vl), ids_b covers the next vl ids.
+//         // Keeping them separate breaks the loop-carried dependency through ids so
+//         // both compact/cntp chains are fully independent within each unrolled body.
+//         let mut ids_a = svindex_u32(offset, 1);
+//         let step = svdup_n_u32(vl as u32);
+//         let step2 = svdup_n_u32(2 * vl as u32);
+//         let mut ids_b = svadd_u32_x(all_true, ids_a, step);
+
+//         let mut input = input;
+//         let mut output_tail = output;
+
+//         // Unrolled ×2: both cntp calls have independent inputs and execute in parallel.
+//         // The two output_tail updates are sequential but together cost 4+1+1=6 cy per
+//         // pair vs 5+5=10 cy for two scalar iterations, breaking the cntp latency chain.
+//         let num_pairs = num_words / 2;
+//         for _ in 0..num_pairs {
+//             let word_a = svld1_u32(all_true, input);
+//             let word_b = svld1_u32(all_true, input.add(vl));
+
+//             let shifted_a = svsub_u32_x(all_true, word_a, range_start_simd);
+//             let shifted_b = svsub_u32_x(all_true, word_b, range_start_simd);
+
+//             let in_range_a = svcmple_u32(all_true, shifted_a, range_width_simd);
+//             let in_range_b = svcmple_u32(all_true, shifted_b, range_width_simd);
+
+//             let compacted_a = svcompact_u32(in_range_a, ids_a);
+//             let compacted_b = svcompact_u32(in_range_b, ids_b);
+//             // cntp_a and cntp_b have independent inputs: OOO engine issues them in parallel.
+//             let added_len_a = svcntp_b32(all_true, in_range_a) as usize;
+//             let added_len_b = svcntp_b32(all_true, in_range_b) as usize;
+
+//             // Write the full vector — only the first added_len slots are valid.
+//             // Subsequent iterations overwrite the trailing zeros before truncate.
+//             svst1_u32(all_true, output_tail, compacted_a);
+//             output_tail = output_tail.add(added_len_a);
+//             svst1_u32(all_true, output_tail, compacted_b);
+//             output_tail = output_tail.add(added_len_b);
+
+//             ids_a = svadd_u32_x(all_true, ids_a, step2);
+//             ids_b = svadd_u32_x(all_true, ids_b, step2);
+//             input = input.add(2 * vl);
+//         }
+
+//         // Handle an odd trailing word.
+//         if num_words % 2 == 1 {
+//             let word = svld1_u32(all_true, input);
+//             let shifted = svsub_u32_x(all_true, word, range_start_simd);
+//             let in_range = svcmple_u32(all_true, shifted, range_width_simd);
+//             let added_len = svcntp_b32(all_true, in_range) as usize;
+//             let compacted_ids = svcompact_u32(in_range, ids_a);
+//             svst1_u32(all_true, output_tail, compacted_ids);
+//             output_tail = output_tail.add(added_len);
+//         }
+
+//         output_tail.offset_from(output) as usize
+//     }
+// }
--- a/src/collector/mod.rs
+++ b/src/collector/mod.rs
@@ -301,14 +301,11 @@ pub trait SegmentCollector: 'static {
    /// The query pushes the scored document to the collector via this method.
    fn collect(&mut self, doc: DocId, score: Score);

-    /// The query pushes the matched documents to the collector via this method.
+    /// The query pushes the scored document to the collector via this method.
    /// This method is used when the collector does not require scoring.
    ///
-    /// `docs` is a block of matched doc ids. Doc ids are produced in increasing
-    /// order, in windows of [`COLLECT_BLOCK_BUFFER_LEN`](crate::COLLECT_BLOCK_BUFFER_LEN),
-    /// but several windows are accumulated before being flushed here, so the
-    /// block may be larger than `COLLECT_BLOCK_BUFFER_LEN`. Implementations must
-    /// not assume any particular maximum length.
+    /// See [`COLLECT_BLOCK_BUFFER_LEN`](crate::COLLECT_BLOCK_BUFFER_LEN) for the
+    /// buffer size passed to the collector.
    fn collect_block(&mut self, docs: &[DocId]) {
        for doc in docs {
            self.collect(*doc, 0.0);
--- a/src/docset.rs
+++ b/src/docset.rs
@@ -11,14 +11,9 @@ use crate::DocId;
 /// to compare `[u32; 4]`.
 pub const TERMINATED: DocId = i32::MAX as u32;

-/// Window size used by [`DocSet::fill_buffer`]: a single `fill_buffer` call
-/// writes at most this many doc ids, and exactly this many as long as the
-/// `DocSet` is not exhausted.
-///
-/// Note that this is *not* the maximum length of the slice passed to
-/// `SegmentCollector::collect_block`: the collection loop accumulates several
-/// such windows into a larger buffer before flushing it, so `collect_block`
-/// may receive a block larger than `COLLECT_BLOCK_BUFFER_LEN`.
+/// The collect_block method on `SegmentCollector` uses a buffer of this size.
+/// Passed results to `collect_block` will not exceed this size and will be
+/// exactly this size as long as we can fill the buffer.
 pub const COLLECT_BLOCK_BUFFER_LEN: usize = 64;

 /// Number of `TinySet` (64-bit) buckets in a block used by [`DocSet::fill_bitset_block`].
--- a/src/postings/recorder.rs
+++ b/src/postings/recorder.rs
@@ -275,8 +275,9 @@ impl Recorder for TfAndPositionRecorder {
 mod tests {

    use common::write_u32_vint;
+    use stacker::MemoryArena;

-    use super::{BufferLender, VInt32Reader};
+    use super::{BufferLender, Recorder, TermFrequencyRecorder, VInt32Reader};

    #[test]
    fn test_buffer_lender() {
@@ -314,4 +315,98 @@ mod tests {
        let res: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
        assert_eq!(&res[..], &vals[..]);
    }
+
+    // ── TermFrequencyRecorder ─────────────────────────────────────────────────
+
+    #[test]
+    fn term_frequency_recorder_has_term_freq() {
+        let rec = TermFrequencyRecorder::default();
+        assert!(
+            rec.has_term_freq(),
+            "TermFrequencyRecorder must advertise term-frequency support"
+        );
+    }
+
+    #[test]
+    fn term_frequency_recorder_term_doc_freq_single_doc() {
+        let mut arena = MemoryArena::default();
+        let mut rec = TermFrequencyRecorder::default();
+
+        // Record one document with two term occurrences.
+        rec.new_doc(0, &mut arena);
+        rec.record_position(0, &mut arena);
+        rec.record_position(1, &mut arena);
+        rec.close_doc(&mut arena);
+
+        assert_eq!(
+            rec.term_doc_freq(),
+            Some(1),
+            "term_doc_freq should be 1 after recording one document"
+        );
+    }
+
+    #[test]
+    fn term_frequency_recorder_term_doc_freq_multiple_docs() {
+        let mut arena = MemoryArena::default();
+        let mut rec = TermFrequencyRecorder::default();
+
+        // Three documents with 1, 3, and 2 occurrences respectively.
+        for (doc, tf) in [(0u32, 1u32), (5, 3), (10, 2)] {
+            rec.new_doc(doc, &mut arena);
+            for pos in 0..tf {
+                rec.record_position(pos, &mut arena);
+            }
+            rec.close_doc(&mut arena);
+        }
+
+        assert_eq!(
+            rec.term_doc_freq(),
+            Some(3),
+            "term_doc_freq should equal the number of documents recorded"
+        );
+    }
+
+    #[test]
+    fn term_frequency_recorder_zero_docs() {
+        let rec = TermFrequencyRecorder::default();
+        assert_eq!(
+            rec.term_doc_freq(),
+            Some(0),
+            "term_doc_freq should be 0 before any document is recorded"
+        );
+    }
+
+    #[test]
+    fn term_frequency_recorder_single_occurrence_per_doc() {
+        let mut arena = MemoryArena::default();
+        let mut rec = TermFrequencyRecorder::default();
+
+        // Each document has exactly one occurrence — the minimum non-trivial case.
+        for doc in [1u32, 2, 100] {
+            rec.new_doc(doc, &mut arena);
+            rec.record_position(0, &mut arena);
+            rec.close_doc(&mut arena);
+        }
+
+        assert_eq!(rec.term_doc_freq(), Some(3));
+    }
+
+    #[test]
+    fn term_frequency_recorder_high_frequency_doc() {
+        let mut arena = MemoryArena::default();
+        let mut rec = TermFrequencyRecorder::default();
+
+        // A document where the term appears many times.
+        rec.new_doc(42, &mut arena);
+        for pos in 0..1000 {
+            rec.record_position(pos, &mut arena);
+        }
+        rec.close_doc(&mut arena);
+
+        assert_eq!(
+            rec.term_doc_freq(),
+            Some(1),
+            "term_doc_freq counts documents, not occurrences"
+        );
+    }
 }
--- a/src/query/boolean_query/boolean_weight.rs
+++ b/src/query/boolean_query/boolean_weight.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;

+use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
 use crate::index::SegmentReader;
 use crate::postings::FreqReadingOption;
 use crate::query::disjunction::Disjunction;
@@ -530,12 +531,13 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
    ) -> crate::Result<()> {
        let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
        let num_docs = reader.num_docs();
+        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];

        match scorer {
            SpecializedScorer::TermUnion(term_scorers) => {
                let mut union_scorer =
                    BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
-                for_each_docset_buffered(&mut union_scorer, callback);
+                for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
            }
            SpecializedScorer::TermIntersection(term_scorers) => {
                let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
@@ -543,10 +545,10 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
                    .map(|term_scorer| Box::new(term_scorer) as Box<dyn Scorer>)
                    .collect();
                let mut intersection = intersect_scorers(boxed_scorers, num_docs);
-                for_each_docset_buffered(intersection.as_mut(), callback);
+                for_each_docset_buffered(intersection.as_mut(), &mut buffer, callback);
            }
            SpecializedScorer::Other(mut scorer) => {
-                for_each_docset_buffered(scorer.as_mut(), callback);
+                for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback);
            }
        }
        Ok(())
--- a/src/query/term_query/term_weight.rs
+++ b/src/query/term_query/term_weight.rs
@@ -1,5 +1,5 @@
 use super::term_scorer::TermScorer;
-use crate::docset::DocSet;
+use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN};
 use crate::fieldnorm::FieldNormReader;
 use crate::index::SegmentReader;
 use crate::postings::SegmentPostings;
@@ -92,11 +92,13 @@ impl Weight for TermWeight {
    ) -> crate::Result<()> {
        match self.specialized_scorer(reader, 1.0)? {
            TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => {
-                for_each_docset_buffered(&mut term_scorer, callback);
+                let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
+                for_each_docset_buffered(&mut term_scorer, &mut buffer, callback);
            }
            TermOrEmptyOrAllScorer::Empty => {}
            TermOrEmptyOrAllScorer::AllMatch(mut all_scorer) => {
-                for_each_docset_buffered(&mut all_scorer, callback);
+                let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
+                for_each_docset_buffered(&mut all_scorer, &mut buffer, callback);
            }
        };

--- a/src/query/weight.rs
+++ b/src/query/weight.rs
@@ -17,56 +17,18 @@ pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
    }
 }

-/// Number of `COLLECT_BLOCK_BUFFER_LEN`-sized windows accumulated into the large
-/// buffer before it is flushed to the collector via `collect_block`.
-const NUM_WINDOWS_PER_BLOCK: usize = 32;
-/// Size of the buffer accumulated before invoking the callback (2_048 = 32 * 64).
-/// `fill_buffer` keeps writing `COLLECT_BLOCK_BUFFER_LEN`-sized windows; this only
-/// changes how much we accumulate before flushing.
-const LARGE_COLLECT_BUFFER_LEN: usize = COLLECT_BLOCK_BUFFER_LEN * NUM_WINDOWS_PER_BLOCK;
-
-/// Iterates through all of the documents matched by the `DocSet`, flushing
-/// blocks of up to `LARGE_COLLECT_BUFFER_LEN` doc ids to `callback`.
-///
-/// `fill_buffer` only ever writes `COLLECT_BLOCK_BUFFER_LEN` doc ids at a time,
-/// so we accumulate several such windows into a single larger buffer before
-/// handing it to the collector. This amortizes the per-`collect_block` overhead
-/// (virtual dispatch, aggregation setup) over more documents.
+/// Iterates through all of the documents matched by the DocSet
+/// `DocSet`.
 #[inline]
 pub(crate) fn for_each_docset_buffered<T: DocSet + ?Sized>(
    docset: &mut T,
+    buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
    mut callback: impl FnMut(&[DocId]),
 ) {
-    // Heap-allocated once per call (i.e. once per segment in the no-score path).
-    // `new_zeroed_slice` zeroes directly on the heap, avoiding a 2_048-element
-    // stack temporary.
-    // SAFETY: an all-zero bit pattern is a valid value for every `DocId` (u32),
-    // so the zeroed slice is fully initialized.
-    let mut buffer: Box<[DocId]> =
-        unsafe { Box::new_zeroed_slice(LARGE_COLLECT_BUFFER_LEN).assume_init() };
    loop {
-        let mut filled = 0;
-        let mut reached_end = false;
-        // Fill the large buffer one `COLLECT_BLOCK_BUFFER_LEN` window at a time.
-        // `chunks_exact_mut` yields windows of exactly `COLLECT_BLOCK_BUFFER_LEN`
-        // because `LARGE_COLLECT_BUFFER_LEN` is a multiple of it (empty remainder).
-        // The windows are contiguous and filled in order, so the doc ids always
-        // occupy the contiguous prefix `buffer[..filled]`.
-        for window in buffer.chunks_exact_mut(COLLECT_BLOCK_BUFFER_LEN) {
-            // SAFETY: each `window` is a slice of exactly `COLLECT_BLOCK_BUFFER_LEN`
-            // elements, so reinterpreting its start pointer as a fixed-size array
-            // reference of that length is valid.
-            let window: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN] =
-                unsafe { &mut *window.as_mut_ptr().cast::<[DocId; COLLECT_BLOCK_BUFFER_LEN]>() };
-            let num_items = docset.fill_buffer(window);
-            filled += num_items;
-            if num_items != COLLECT_BLOCK_BUFFER_LEN {
-                reached_end = true;
-                break;
-            }
-        }
-        callback(&buffer[..filled]);
-        if reached_end {
+        let num_items = docset.fill_buffer(buffer);
+        callback(&buffer[..num_items]);
+        if num_items != buffer.len() {
            break;
        }
    }
@@ -142,7 +104,9 @@ pub trait Weight: Send + Sync + 'static {
        callback: &mut dyn FnMut(&[DocId]),
    ) -> crate::Result<()> {
        let mut docset = self.scorer(reader, 1.0)?;
-        for_each_docset_buffered(&mut docset, callback);
+
+        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
+        for_each_docset_buffered(&mut docset, &mut buffer, callback);
        Ok(())
    }

--- a/stacker/Cargo.toml
+++ b/stacker/Cargo.toml
@@ -8,7 +8,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
 description = "term hashmap used for indexing"

 [dependencies]
-murmurhash32 = "0.3"
+murmurhash32 = "0.4"
 common = { version = "0.11", path = "../common/", package = "tantivy-common" }
 ahash = { version = "0.8.11", default-features = false, optional = true }
Author	SHA1	Message	Date
dependabot[bot]	f50cb0208b	Bump github/codeql-action from 4.36.1 to 4.36.2 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.36.1 to 4.36.2. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](`87557b9c84...8aad20d150`) --- updated-dependencies: - dependency-name: github/codeql-action dependency-version: 4.36.2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-06-08 20:04:11 +00:00
dependabot[bot]	fd9713e1ca	Bump actions/checkout from 6.0.2 to 6.0.3 (#2949 ) Bumps [actions/checkout](https://github.com/actions/checkout) from 6.0.2 to 6.0.3. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](`de0fac2e45...df4cb1c069`) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: 6.0.3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2026-06-08 10:55:54 +02:00
dependabot[bot]	96f3784f79	Bump github/codeql-action from 4.35.2 to 4.36.1 (#2948 ) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.35.2 to 4.36.1. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](`95e58e9a2c...87557b9c84`) --- updated-dependencies: - dependency-name: github/codeql-action dependency-version: 4.36.1 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2026-06-08 10:49:04 +02:00
dependabot[bot]	87a6679a79	Bump actions/upload-artifact from 7.0.0 to 7.0.1 (#2917 ) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 7.0.0 to 7.0.1. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](`bbbca2ddaa...043fb46d1a`) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-version: 7.0.1 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2026-06-08 10:48:48 +02:00
dependabot[bot]	864a6aa72c	Update murmurhash32 requirement from 0.3 to 0.4 (#2894 ) Updates the requirements on [murmurhash32](https://github.com/quickwit-inc/murmurhash32) to permit the latest version. - [Commits](https://github.com/quickwit-inc/murmurhash32/commits) --- updated-dependencies: - dependency-name: murmurhash32 dependency-version: 0.4.0 dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2026-06-08 10:48:32 +02:00
Paul Masurel	abcf6754a2	CR comments from https://github.com/quickwit-oss/tantivy/pull/2940 (#2952 ) Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>	2026-06-08 10:47:58 +02:00
Kanishk Sachan	70a8e56ee5	test(postings): add unit tests for TermFrequencyRecorder Closes #2285 The TermFrequencyRecorder was completely untested. Add five focused tests: - term_frequency_recorder_has_term_freq: verifies the recorder correctly advertises term-frequency support via has_term_freq() - term_frequency_recorder_zero_docs: term_doc_freq() returns Some(0) before any documents are recorded - term_frequency_recorder_term_doc_freq_single_doc: one document with two occurrences yields term_doc_freq() == Some(1) - term_frequency_recorder_term_doc_freq_multiple_docs: three documents with varying term frequencies yield term_doc_freq() == Some(3), confirming the count tracks documents, not occurrences - term_frequency_recorder_single_occurrence_per_doc: each of three documents has exactly one occurrence - term_frequency_recorder_high_frequency_doc: a single document with 1000 occurrences still yields term_doc_freq() == Some(1)	2026-06-06 14:44:51 +08:00
Paul Masurel	62705526e8	Add sve + neon filter vec implementation as spotted by Adam (#2940 ) * Add filter_vec benchmarks (dense, sparse, full coverage) Uses get_ids_for_value_range to exercise both the bitpacking decode and the filter_vec SIMD path together under realistic cache conditions. * Add NEON and SVE implementations for filter_vec Adds aarch64-specific SIMD paths (NEON always available on aarch64; SVE gated on nightly + non-Apple target) with routing logic in mod.rs that selects the best available instruction set at runtime. * Using asm! to workaround the lack of stabilized SVE intrinsics * showing instruction set * improved proptesting * removing build.rs --------- Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>	2026-06-04 17:51:26 +02:00