mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-09 20:10:42 +00:00
Compare commits
8 Commits
larger-col
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f50cb0208b | ||
|
|
fd9713e1ca | ||
|
|
96f3784f79 | ||
|
|
87a6679a79 | ||
|
|
864a6aa72c | ||
|
|
abcf6754a2 | ||
|
|
70a8e56ee5 | ||
|
|
62705526e8 |
2
.github/workflows/coverage.yml
vendored
2
.github/workflows/coverage.yml
vendored
@@ -20,7 +20,7 @@ jobs:
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
- name: Install Rust
|
||||
run: rustup toolchain install nightly-2025-12-01 --profile minimal --component llvm-tools-preview
|
||||
- uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
|
||||
|
||||
2
.github/workflows/long_running.yml
vendored
2
.github/workflows/long_running.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
|
||||
with:
|
||||
|
||||
6
.github/workflows/scorecard.yml
vendored
6
.github/workflows/scorecard.yml
vendored
@@ -22,7 +22,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: 'Checkout code'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
@@ -36,7 +36,7 @@ jobs:
|
||||
|
||||
# Upload the results as artifacts.
|
||||
- name: 'Upload artifact'
|
||||
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
|
||||
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
@@ -44,6 +44,6 @@ jobs:
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: 'Upload to code-scanning'
|
||||
uses: github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
|
||||
uses: github/codeql-action/upload-sarif@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4.36.2
|
||||
with:
|
||||
sarif_file: results.sarif
|
||||
|
||||
4
.github/workflows/test.yml
vendored
4
.github/workflows/test.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
checks: write
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
|
||||
- name: Install nightly
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
|
||||
@@ -77,7 +77,7 @@ jobs:
|
||||
name: test-${{ matrix.features.label}}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
|
||||
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
|
||||
|
||||
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2021"
|
||||
rust-version = "1.92"
|
||||
rust-version = "1.86"
|
||||
exclude = ["benches/*.json", "benches/*.txt"]
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -18,5 +18,10 @@ homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker1x"] }
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.17.0"
|
||||
rand = "0.9"
|
||||
proptest = "1"
|
||||
|
||||
[[bench]]
|
||||
name = "bench"
|
||||
harness = false
|
||||
|
||||
@@ -1,65 +1,110 @@
|
||||
#![feature(test)]
|
||||
use std::cell::RefCell;
|
||||
|
||||
extern crate test;
|
||||
use binggan::{BenchRunner, black_box};
|
||||
use rand::rng;
|
||||
use rand::seq::IteratorRandom;
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::rng;
|
||||
use rand::seq::IteratorRandom;
|
||||
use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};
|
||||
use test::Bencher;
|
||||
fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let mut buffer = Vec::new();
|
||||
for _ in 0..num_els {
|
||||
bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
|
||||
bitpacker.flush(&mut buffer).unwrap();
|
||||
}
|
||||
buffer
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let mut buffer = Vec::new();
|
||||
for _ in 0..num_els {
|
||||
// the values do not matter.
|
||||
bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
|
||||
bitpacker.flush(&mut buffer).unwrap();
|
||||
const N: usize = 100_000;
|
||||
const MAX_VAL: u64 = 1_000;
|
||||
const BIT_WIDTH: u8 = 10; // 2^10 = 1024 > MAX_VAL
|
||||
|
||||
fn create_packed_data() -> (BitUnpacker, Vec<u8>) {
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let mut data = Vec::new();
|
||||
for i in 0..N as u64 {
|
||||
let val = i * MAX_VAL / N as u64;
|
||||
bitpacker.write(val, BIT_WIDTH, &mut data).unwrap();
|
||||
}
|
||||
bitpacker.close(&mut data).unwrap();
|
||||
(BitUnpacker::new(BIT_WIDTH), data)
|
||||
}
|
||||
|
||||
fn bench_bitpacking() {
|
||||
let mut runner = BenchRunner::new();
|
||||
let bit_width = 3;
|
||||
let num_els = 1_000_000u32;
|
||||
let bit_unpacker = BitUnpacker::new(bit_width);
|
||||
let data = create_bitpacked_data(bit_width, num_els);
|
||||
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut rng(), 100_000);
|
||||
runner.bench_function("bitpacking_read", move |_| {
|
||||
let mut out = 0u64;
|
||||
for &idx in &idxs {
|
||||
out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
|
||||
}
|
||||
buffer
|
||||
}
|
||||
black_box(out);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitpacking_read(b: &mut Bencher) {
|
||||
let bit_width = 3;
|
||||
let num_els = 1_000_000u32;
|
||||
let bit_unpacker = BitUnpacker::new(bit_width);
|
||||
let data = create_bitpacked_data(bit_width, num_els);
|
||||
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut rng(), 100_000);
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for &idx in &idxs {
|
||||
out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
|
||||
}
|
||||
out
|
||||
});
|
||||
fn bench_blocked_bitpacker() {
|
||||
let mut runner = BenchRunner::new();
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_read(b: &mut Bencher) {
|
||||
runner.bench_function("blockedbitp_read", move |_| {
|
||||
let mut out = 0u64;
|
||||
for val in 0..=21500 {
|
||||
out = out.wrapping_add(blocked_bitpacker.get(val));
|
||||
}
|
||||
black_box(out);
|
||||
});
|
||||
runner.bench_function("blockedbitp_create", |_| {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for val in 0..=21500 {
|
||||
out = out.wrapping_add(blocked_bitpacker.get(val));
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_create(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
blocked_bitpacker
|
||||
});
|
||||
}
|
||||
black_box(blocked_bitpacker);
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_filter_vec() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
let (unpacker, data) = create_packed_data();
|
||||
let positions = RefCell::new(Vec::with_capacity(N));
|
||||
runner.bench_function("filter_vec_dense", move |_| {
|
||||
unpacker.get_ids_for_value_range(
|
||||
250..=750,
|
||||
0..N as u32,
|
||||
&data,
|
||||
&mut positions.borrow_mut(),
|
||||
);
|
||||
black_box(positions.borrow().len());
|
||||
});
|
||||
|
||||
let (unpacker, data) = create_packed_data();
|
||||
let positions = RefCell::new(Vec::with_capacity(N));
|
||||
runner.bench_function("filter_vec_sparse", move |_| {
|
||||
unpacker.get_ids_for_value_range(0..=50, 0..N as u32, &data, &mut positions.borrow_mut());
|
||||
black_box(positions.borrow().len());
|
||||
});
|
||||
|
||||
let (unpacker, data) = create_packed_data();
|
||||
let positions = RefCell::new(Vec::with_capacity(N));
|
||||
runner.bench_function("filter_vec_full", move |_| {
|
||||
unpacker.get_ids_for_value_range(
|
||||
0..=MAX_VAL,
|
||||
0..N as u32,
|
||||
&data,
|
||||
&mut positions.borrow_mut(),
|
||||
);
|
||||
black_box(positions.borrow().len());
|
||||
});
|
||||
}
|
||||
|
||||
fn main() {
|
||||
bench_bitpacking();
|
||||
bench_blocked_bitpacker();
|
||||
bench_filter_vec();
|
||||
}
|
||||
|
||||
@@ -1,8 +1,17 @@
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
use std::arch::is_aarch64_feature_detected;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod avx2;
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
mod neon;
|
||||
|
||||
// SVE intrinsics are not exposed on aarch64-apple-darwin.
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
mod sve;
|
||||
|
||||
mod scalar;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
|
||||
@@ -10,6 +19,10 @@ mod scalar;
|
||||
enum FilterImplPerInstructionSet {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
AVX2 = 0u8,
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
SVE = 3u8,
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
Neon = 2u8,
|
||||
Scalar = 1u8,
|
||||
}
|
||||
|
||||
@@ -19,29 +32,57 @@ impl FilterImplPerInstructionSet {
|
||||
match *self {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
FilterImplPerInstructionSet::AVX2 => is_x86_feature_detected!("avx2"),
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
FilterImplPerInstructionSet::SVE => is_aarch64_feature_detected!("sve"),
|
||||
// TIL Neon is required on aarch 64.
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
FilterImplPerInstructionSet::Neon => true,
|
||||
FilterImplPerInstructionSet::Scalar => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// List of available implementation in preferred order.
|
||||
// List of available implementations in preferred order.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
const IMPLS: [FilterImplPerInstructionSet; 2] = [
|
||||
FilterImplPerInstructionSet::AVX2,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
];
|
||||
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
// Non-Apple aarch64: try SVE, NEON, Scalar.
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
const IMPLS: [FilterImplPerInstructionSet; 3] = [
|
||||
FilterImplPerInstructionSet::SVE,
|
||||
FilterImplPerInstructionSet::Neon,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
];
|
||||
|
||||
// Apple aarch64 (M-series): SVE not available; use NEON or Scalar.
|
||||
#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
|
||||
const IMPLS: [FilterImplPerInstructionSet; 2] = [
|
||||
FilterImplPerInstructionSet::Neon,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
];
|
||||
|
||||
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
|
||||
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];
|
||||
|
||||
impl FilterImplPerInstructionSet {
|
||||
#[inline]
|
||||
#[allow(unused_variables)] // on non-x86_64, code is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn from(code: u8) -> FilterImplPerInstructionSet {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
if code == FilterImplPerInstructionSet::AVX2 as u8 {
|
||||
return FilterImplPerInstructionSet::AVX2;
|
||||
}
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
if code == FilterImplPerInstructionSet::SVE as u8 {
|
||||
return FilterImplPerInstructionSet::SVE;
|
||||
}
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
if code == FilterImplPerInstructionSet::Neon as u8 {
|
||||
return FilterImplPerInstructionSet::Neon;
|
||||
}
|
||||
FilterImplPerInstructionSet::Scalar
|
||||
}
|
||||
|
||||
@@ -50,6 +91,13 @@ impl FilterImplPerInstructionSet {
|
||||
match self {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
FilterImplPerInstructionSet::AVX2 => avx2::filter_vec_in_place(range, offset, output),
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
// SAFETY: SVE availability was verified by is_available() before selecting this impl.
|
||||
FilterImplPerInstructionSet::SVE => unsafe {
|
||||
sve::filter_vec_in_place(range, offset, output)
|
||||
},
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
FilterImplPerInstructionSet::Neon => neon::filter_vec_in_place(range, offset, output),
|
||||
FilterImplPerInstructionSet::Scalar => {
|
||||
scalar::filter_vec_in_place(range, offset, output)
|
||||
}
|
||||
@@ -57,6 +105,12 @@ impl FilterImplPerInstructionSet {
|
||||
}
|
||||
}
|
||||
|
||||
fn available_impls() -> impl Iterator<Item = FilterImplPerInstructionSet> {
|
||||
IMPLS
|
||||
.into_iter()
|
||||
.filter(FilterImplPerInstructionSet::is_available)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_best_available_instruction_set() -> FilterImplPerInstructionSet {
|
||||
use std::sync::atomic::{AtomicU8, Ordering};
|
||||
@@ -64,10 +118,7 @@ fn get_best_available_instruction_set() -> FilterImplPerInstructionSet {
|
||||
let instruction_set_byte: u8 = INSTRUCTION_SET_BYTE.load(Ordering::Relaxed);
|
||||
if instruction_set_byte == u8::MAX {
|
||||
// Let's initialize the instruction set and cache it.
|
||||
let instruction_set = IMPLS
|
||||
.into_iter()
|
||||
.find(FilterImplPerInstructionSet::is_available)
|
||||
.unwrap();
|
||||
let instruction_set = available_impls().next().unwrap();
|
||||
INSTRUCTION_SET_BYTE.store(instruction_set as u8, Ordering::Relaxed);
|
||||
return instruction_set;
|
||||
}
|
||||
@@ -80,12 +131,12 @@ pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use proptest::strategy::Strategy;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_get_best_available_instruction_set() {
|
||||
// This does not test much unfortunately.
|
||||
// We just make sure the function returns without crashing and returns the same result.
|
||||
let instruction_set = get_best_available_instruction_set();
|
||||
assert_eq!(get_best_available_instruction_set(), instruction_set);
|
||||
}
|
||||
@@ -102,6 +153,31 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
#[test]
|
||||
fn test_instruction_set_to_code_from_code() {
|
||||
for instruction_set in [
|
||||
FilterImplPerInstructionSet::SVE,
|
||||
FilterImplPerInstructionSet::Neon,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
] {
|
||||
let code = instruction_set as u8;
|
||||
assert_eq!(instruction_set, FilterImplPerInstructionSet::from(code));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(target_arch = "aarch64", target_vendor = "apple"))]
|
||||
#[test]
|
||||
fn test_instruction_set_to_code_from_code() {
|
||||
for instruction_set in [
|
||||
FilterImplPerInstructionSet::Neon,
|
||||
FilterImplPerInstructionSet::Scalar,
|
||||
] {
|
||||
let code = instruction_set as u8;
|
||||
assert_eq!(instruction_set, FilterImplPerInstructionSet::from(code));
|
||||
}
|
||||
}
|
||||
|
||||
fn test_filter_impl_empty_aux(filter_impl: FilterImplPerInstructionSet) {
|
||||
let mut output = vec![];
|
||||
filter_impl.filter_vec_in_place(0..=u32::MAX, 0, &mut output);
|
||||
@@ -126,11 +202,20 @@ mod tests {
|
||||
assert_eq!(&output, &[1, 3, 4, 5, 6, 7, 8]);
|
||||
}
|
||||
|
||||
fn test_filter_impl_empty_range_aux(filter_impl: FilterImplPerInstructionSet) {
|
||||
// start > end: RangeInclusive::contains always returns false; output must be empty.
|
||||
// The SVE path's wrapping_sub would otherwise produce a huge range_width.
|
||||
let mut output = vec![3, 2, 1, 5, 11, 2, 5, 10, 2];
|
||||
filter_impl.filter_vec_in_place(10..=5, 0, &mut output);
|
||||
assert_eq!(&output, &[]);
|
||||
}
|
||||
|
||||
fn test_filter_impl_test_suite(filter_impl: FilterImplPerInstructionSet) {
|
||||
test_filter_impl_empty_aux(filter_impl);
|
||||
test_filter_impl_simple_aux(filter_impl);
|
||||
test_filter_impl_simple_aux_shifted(filter_impl);
|
||||
test_filter_impl_simple_outside_i32_range(filter_impl);
|
||||
test_filter_impl_empty_range_aux(filter_impl);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -141,25 +226,60 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(all(target_arch = "aarch64", not(target_vendor = "apple")))]
|
||||
fn test_filter_implementation_sve() {
|
||||
if FilterImplPerInstructionSet::SVE.is_available() {
|
||||
test_filter_impl_test_suite(FilterImplPerInstructionSet::SVE);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
fn test_filter_implementation_neon() {
|
||||
test_filter_impl_test_suite(FilterImplPerInstructionSet::Neon);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_filter_implementation_scalar() {
|
||||
test_filter_impl_test_suite(FilterImplPerInstructionSet::Scalar);
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn max_val_strategy() -> impl proptest::strategy::Strategy<Value = u32> {
|
||||
proptest::prop_oneof![
|
||||
0u32..10u32,
|
||||
255u32..258u32,
|
||||
proptest::prelude::Just(1u32 << 25),
|
||||
proptest::prelude::Just(u32::MAX - 1),
|
||||
proptest::prelude::Just(u32::MAX),
|
||||
]
|
||||
}
|
||||
|
||||
fn vals_strategy() -> impl proptest::strategy::Strategy<Value = Vec<u32>> {
|
||||
proptest::prop_oneof![
|
||||
proptest::collection::vec(proptest::prelude::any::<u32>(), 0..300),
|
||||
max_val_strategy()
|
||||
.prop_flat_map(|max_val| { proptest::collection::vec(0..=max_val, 0..300) })
|
||||
]
|
||||
}
|
||||
|
||||
proptest::proptest! {
|
||||
#[test]
|
||||
fn test_filter_compare_scalar_and_avx2_impl_proptest(
|
||||
start in proptest::prelude::any::<u32>(),
|
||||
end in proptest::prelude::any::<u32>(),
|
||||
fn test_filter_compare_scalar_and_impls_impl_proptest(
|
||||
start in 0u32..400u32,
|
||||
end in 0u32..400u32,
|
||||
offset in 0u32..2u32,
|
||||
mut vals in proptest::collection::vec(0..u32::MAX, 0..30)) {
|
||||
if FilterImplPerInstructionSet::AVX2.is_available() {
|
||||
let mut vals_clone = vals.clone();
|
||||
FilterImplPerInstructionSet::AVX2.filter_vec_in_place(start..=end, offset, &mut vals);
|
||||
FilterImplPerInstructionSet::Scalar.filter_vec_in_place(start..=end, offset, &mut vals_clone);
|
||||
assert_eq!(&vals, &vals_clone);
|
||||
}
|
||||
vals in vals_strategy()) {
|
||||
for implementation in available_impls() {
|
||||
if implementation == FilterImplPerInstructionSet::Scalar {
|
||||
continue;
|
||||
}
|
||||
let mut impl_output = vals.clone();
|
||||
let mut scalar_output = vals.clone();
|
||||
implementation.filter_vec_in_place(start..=end, offset, &mut impl_output);
|
||||
FilterImplPerInstructionSet::Scalar.filter_vec_in_place(start..=end, offset, &mut scalar_output);
|
||||
assert_eq!(&impl_output, &scalar_output);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
118
bitpacker/src/filter_vec/neon.rs
Normal file
118
bitpacker/src/filter_vec/neon.rs
Normal file
@@ -0,0 +1,118 @@
|
||||
use std::arch::aarch64::*;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
const NUM_LANES: usize = 4;
|
||||
|
||||
// Compacts matching lanes to the front using a byte-level shuffle.
|
||||
// `mask` is a 4-bit value: bit k=1 means lane k should appear in the output.
|
||||
#[inline]
|
||||
#[target_feature(enable = "neon")]
|
||||
unsafe fn compact(data: uint32x4_t, mask: u8) -> uint32x4_t {
|
||||
unsafe {
|
||||
// SAFETY: mask is always in [0, 15] by construction (max sum of [1,2,4,8]).
|
||||
// BYTE_SHUFFLE_TABLE has 16 entries, so this is always in bounds.
|
||||
let shuffle = BYTE_SHUFFLE_TABLE.get_unchecked(mask as usize);
|
||||
let shuffle_vec = vld1q_u8(shuffle.as_ptr());
|
||||
vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(data), shuffle_vec))
|
||||
}
|
||||
}
|
||||
|
||||
// Safe (not unsafe) because NEON is mandatory on aarch64: no runtime feature check needed.
|
||||
#[inline(never)]
|
||||
pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
|
||||
let num_words = output.len() / NUM_LANES;
|
||||
let mut output_len = unsafe {
|
||||
filter_vec_neon_aux(
|
||||
output.as_ptr(),
|
||||
range.clone(),
|
||||
output.as_mut_ptr(),
|
||||
offset,
|
||||
num_words,
|
||||
)
|
||||
};
|
||||
let remainder_start = num_words * NUM_LANES;
|
||||
for i in remainder_start..output.len() {
|
||||
let val = output[i];
|
||||
output[output_len] = offset + i as u32;
|
||||
output_len += if range.contains(&val) { 1 } else { 0 };
|
||||
}
|
||||
output.truncate(output_len);
|
||||
}
|
||||
|
||||
#[target_feature(enable = "neon")]
|
||||
unsafe fn filter_vec_neon_aux(
|
||||
input: *const u32,
|
||||
range: RangeInclusive<u32>,
|
||||
output: *mut u32,
|
||||
offset: u32,
|
||||
num_words: usize,
|
||||
) -> usize {
|
||||
unsafe {
|
||||
let mut input = input;
|
||||
let mut output_tail = output;
|
||||
let range_start_simd = vdupq_n_u32(*range.start());
|
||||
let range_end_simd = vdupq_n_u32(*range.end());
|
||||
let mut ids = vld1q_u32([offset, offset + 1, offset + 2, offset + 3].as_ptr());
|
||||
let shift = vdupq_n_u32(NUM_LANES as u32);
|
||||
let bit_weights = vld1q_u32([1u32, 2, 4, 8].as_ptr());
|
||||
|
||||
for _ in 0..num_words {
|
||||
let word = vld1q_u32(input);
|
||||
|
||||
// Unsigned compares: CMHS (compare higher or same) tests `word >= start`
|
||||
// and `end >= word`. ANDing both gives the inside-range mask directly,
|
||||
// which is cheaper than computing `outside` and then negating.
|
||||
let ge_start = vcgeq_u32(word, range_start_simd);
|
||||
let le_end = vcleq_u32(word, range_end_simd);
|
||||
// inside[k] = 0xFFFFFFFF if val[k] is in range, 0 otherwise.
|
||||
let inside = vandq_u32(ge_start, le_end);
|
||||
|
||||
// Build the 4-bit mask: AND bit_weights with the inside lane mask, so each
|
||||
// inside lane contributes its bit_weight (1, 2, 4, or 8). Summing yields the
|
||||
// 4-bit mask in one addv.
|
||||
let inside_bits = vandq_u32(bit_weights, inside);
|
||||
let mask = vaddvq_u32(inside_bits) as u8;
|
||||
// mask is mathematically bounded: max value is 1+2+4+8=15 (all lanes match)
|
||||
debug_assert!(mask <= 15, "mask must fit in 4 bits: {}", mask);
|
||||
|
||||
// Count of matching lanes = popcount(mask). Derives the count directly from
|
||||
// the mask instead of running a parallel SIMD reduction over `outside`.
|
||||
let added_len = mask.count_ones() as usize;
|
||||
|
||||
// Safe because mask is guaranteed to be in [0, 15]
|
||||
let filtered_ids = compact(ids, mask);
|
||||
vst1q_u32(output_tail, filtered_ids);
|
||||
output_tail = output_tail.add(added_len);
|
||||
ids = vaddq_u32(ids, shift);
|
||||
input = input.add(NUM_LANES);
|
||||
}
|
||||
|
||||
output_tail.offset_from(output) as usize
|
||||
}
|
||||
}
|
||||
|
||||
// Byte shuffle patterns to compact matching lanes to the front of the vector.
|
||||
// Index is a 4-bit mask: bit k=1 means lane k (bytes 4k..4k+3) is in-range.
|
||||
// The j-th set bit determines which input lane goes to output position j.
|
||||
const BYTE_SHUFFLE_TABLE: [[u8; 16]; 16] = [
|
||||
[
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
|
||||
], // 0b0000: none
|
||||
[0, 1, 2, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0001: lane 0
|
||||
[4, 5, 6, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0010: lane 1
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0011: lanes 0,1
|
||||
[8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0100: lane 2
|
||||
[0, 1, 2, 3, 8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0101: lanes 0,2
|
||||
[4, 5, 6, 7, 8, 9, 10, 11, 16, 16, 16, 16, 16, 16, 16, 16], // 0b0110: lanes 1,2
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 16, 16, 16], // 0b0111: lanes 0,1,2
|
||||
[
|
||||
12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
|
||||
], // 0b1000: lane 3
|
||||
[0, 1, 2, 3, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1001: lanes 0,3
|
||||
[4, 5, 6, 7, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1010: lanes 1,3
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 16, 16, 16, 16], // 0b1011: lanes 0,1,3
|
||||
[8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16], // 0b1100: lanes 2,3
|
||||
[0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16], // 0b1101: lanes 0,2,3
|
||||
[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16], // 0b1110: lanes 1,2,3
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], // 0b1111: all lanes
|
||||
];
|
||||
260
bitpacker/src/filter_vec/sve.rs
Normal file
260
bitpacker/src/filter_vec/sve.rs
Normal file
@@ -0,0 +1,260 @@
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
// SVE vector length (in u32 lanes) is not a compile-time constant; query at runtime.
|
||||
// Safe to call only when SVE is confirmed available via is_aarch64_feature_detected!("sve").
|
||||
#[target_feature(enable = "sve")]
|
||||
unsafe fn num_lanes() -> usize {
|
||||
let vl: usize;
|
||||
unsafe {
|
||||
core::arch::asm!(
|
||||
"cntw {vl}",
|
||||
vl = out(reg) vl,
|
||||
options(nostack, nomem, preserves_flags),
|
||||
);
|
||||
}
|
||||
vl
|
||||
}
|
||||
|
||||
// SAFETY: caller must ensure SVE is available (checked via is_aarch64_feature_detected!("sve")).
|
||||
// Unlike NEON, SVE is optional on aarch64 and not guaranteed by the target architecture.
|
||||
pub unsafe fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
|
||||
if range.start() > range.end() {
|
||||
output.clear();
|
||||
return;
|
||||
}
|
||||
let vl = unsafe { num_lanes() };
|
||||
let num_words = output.len() / vl;
|
||||
let range_start = *range.start();
|
||||
// Unsigned subtraction trick: val ∈ [lo, hi] ↔ (val - lo) ≤ᵤ (hi - lo).
|
||||
// Values below lo wrap around to large u32, so the single unsigned ≤ excludes them.
|
||||
let range_width = range.end().wrapping_sub(range_start);
|
||||
let mut output_len = unsafe {
|
||||
filter_vec_sve_aux(
|
||||
output.as_ptr(),
|
||||
range_start,
|
||||
range_width,
|
||||
output.as_mut_ptr(),
|
||||
offset,
|
||||
num_words,
|
||||
vl,
|
||||
)
|
||||
};
|
||||
let remainder_start = num_words * vl;
|
||||
for i in remainder_start..output.len() {
|
||||
let val = output[i];
|
||||
output[output_len] = offset + i as u32;
|
||||
output_len += if range.contains(&val) { 1 } else { 0 };
|
||||
}
|
||||
output.truncate(output_len);
|
||||
}
|
||||
|
||||
// Register allocation for the asm! blocks:
|
||||
// z0 ids_a (index vector for first half of each pair, advances by step2 each iter)
|
||||
// z1 range_width broadcast
|
||||
// z2 range_start broadcast
|
||||
// z3 step2 broadcast (2 * vl)
|
||||
// z4 ids_b (index vector for second half, = ids_a + step, advances by step2)
|
||||
// z5 scratch: loaded word_a, then compacted_a
|
||||
// z6 scratch: loaded word_b, then compacted_b
|
||||
// p0 all-true predicate (ptrue p0.s)
|
||||
// p1 in-range mask for word_a
|
||||
// p2 in-range mask for word_b
|
||||
#[target_feature(enable = "sve")]
|
||||
unsafe fn filter_vec_sve_aux(
|
||||
input: *const u32,
|
||||
range_start: u32,
|
||||
range_width: u32,
|
||||
output: *mut u32,
|
||||
offset: u32,
|
||||
num_words: usize,
|
||||
vl: usize,
|
||||
) -> usize {
|
||||
let num_pairs = num_words / 2;
|
||||
let mut input_ptr = input;
|
||||
let mut output_tail = output;
|
||||
|
||||
if num_pairs > 0 {
|
||||
unsafe {
|
||||
// We rely on asm! because the SVE intrinsics are not available in stable Rust.
|
||||
// The code that follows was generated by Rustc nightly based on the intrinsics version
|
||||
// at the bottom of this file.
|
||||
core::arch::asm!(
|
||||
// --- Setup ---
|
||||
// All-true predicate for 32-bit lanes.
|
||||
"ptrue p0.s",
|
||||
// ids_a = [offset, offset+1, offset+2, ...]
|
||||
"index z0.s, {offset:w}, #1",
|
||||
// Broadcast scalars into SVE vectors.
|
||||
"mov z1.s, {range_width:w}",
|
||||
"mov z2.s, {range_start:w}",
|
||||
// vl_gpr = number of 32-bit lanes (cntw).
|
||||
"cntw {vl_gpr}",
|
||||
// step2_bytes will first hold 2*vl (for the step2 vector), then 2*VL in bytes.
|
||||
"lsl {step2_bytes}, {vl_gpr}, #1",
|
||||
// z4 = step = [vl, vl, ...]; will become ids_b after the add below.
|
||||
"mov z4.s, {vl_gpr:w}",
|
||||
// z3 = step2 = [2*vl, 2*vl, ...], used to advance both id vectors each iter.
|
||||
"mov z3.s, {step2_bytes:w}",
|
||||
// Repurpose step2_bytes to hold the byte stride for advancing the input pointer
|
||||
// by two full SVE vectors per iteration.
|
||||
"rdvl {step2_bytes}, #2",
|
||||
// ids_b = ids_a + step = [offset+vl, offset+vl+1, ...]
|
||||
"add z4.s, z0.s, z4.s",
|
||||
|
||||
// --- Main loop: process two SVE vectors (ids_a and ids_b) per iteration ---
|
||||
"0:",
|
||||
// Load two consecutive SVE vectors from input.
|
||||
"ld1w {{z5.s}}, p0/z, [{input}]",
|
||||
"ld1w {{z6.s}}, p0/z, [{input}, #1, mul vl]",
|
||||
// Advance input pointer by 2 * VL bytes.
|
||||
"add {input}, {input}, {step2_bytes}",
|
||||
// Unsigned shift: subtract range_start so in-range check becomes a single cmpu ≤.
|
||||
"sub z5.s, z5.s, z2.s",
|
||||
"sub z6.s, z6.s, z2.s",
|
||||
// in_range: shifted value ≤ range_width (unsigned, so values below lo also fail).
|
||||
"cmphs p1.s, p0/z, z1.s, z5.s",
|
||||
"cmphs p2.s, p0/z, z1.s, z6.s",
|
||||
// Count matching lanes; both cntp calls have independent inputs for OOO parallelism.
|
||||
"cntp {cnt_a}, p0, p1.s",
|
||||
"compact z5.s, p1, z0.s",
|
||||
"compact z6.s, p2, z4.s",
|
||||
"cntp {cnt_b}, p0, p2.s",
|
||||
// Advance id vectors for the next iteration.
|
||||
"add z0.s, z0.s, z3.s",
|
||||
"add z4.s, z4.s, z3.s",
|
||||
// Store compacted ids. Only the first cnt_a / cnt_b slots are valid; the rest
|
||||
// will be overwritten by subsequent iterations before the final truncate.
|
||||
"str z5, [{out}]",
|
||||
"st1w {{z6.s}}, p0, [{out}, {cnt_a}, lsl #2]",
|
||||
"add {out}, {out}, {cnt_a}, lsl #2",
|
||||
"add {out}, {out}, {cnt_b}, lsl #2",
|
||||
"subs {pairs}, {pairs}, #1",
|
||||
"b.ne 0b",
|
||||
|
||||
// --- Operands ---
|
||||
input = inout(reg) input_ptr,
|
||||
out = inout(reg) output_tail,
|
||||
pairs = inout(reg) num_pairs => _,
|
||||
offset = in(reg) offset,
|
||||
range_start = in(reg) range_start,
|
||||
range_width = in(reg) range_width,
|
||||
vl_gpr = out(reg) _,
|
||||
step2_bytes = out(reg) _,
|
||||
cnt_a = out(reg) _,
|
||||
cnt_b = out(reg) _,
|
||||
out("p0") _, out("p1") _, out("p2") _,
|
||||
out("v0") _, out("v1") _, out("v2") _, out("v3") _,
|
||||
out("v4") _, out("v5") _, out("v6") _,
|
||||
options(nostack),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle an odd trailing vector.
|
||||
if num_words % 2 == 1 {
|
||||
// ids_a for the odd word starts at offset + num_pairs * 2 * vl.
|
||||
// input_ptr was advanced by the main loop and now points at the odd word.
|
||||
let odd_offset =
|
||||
offset.wrapping_add((num_pairs as u32).wrapping_mul(2).wrapping_mul(vl as u32));
|
||||
unsafe {
|
||||
core::arch::asm!(
|
||||
"ptrue p0.s",
|
||||
"index z0.s, {odd_offset:w}, #1",
|
||||
"mov z1.s, {range_width:w}",
|
||||
"mov z2.s, {range_start:w}",
|
||||
"ld1w {{z3.s}}, p0/z, [{input}]",
|
||||
"sub z3.s, z3.s, z2.s",
|
||||
"cmphs p1.s, p0/z, z1.s, z3.s",
|
||||
"cntp {cnt}, p0, p1.s",
|
||||
"compact z0.s, p1, z0.s",
|
||||
"str z0, [{out}]",
|
||||
"add {out}, {out}, {cnt}, lsl #2",
|
||||
odd_offset = in(reg) odd_offset,
|
||||
range_width = in(reg) range_width,
|
||||
range_start = in(reg) range_start,
|
||||
input = in(reg) input_ptr,
|
||||
out = inout(reg) output_tail,
|
||||
cnt = out(reg) _,
|
||||
out("p0") _, out("p1") _,
|
||||
out("v0") _, out("v1") _, out("v2") _, out("v3") _,
|
||||
options(nostack),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
unsafe { output_tail.offset_from(output) as usize }
|
||||
}
|
||||
|
||||
// SVE implements with intrinsics.
|
||||
//
|
||||
// #[target_feature(enable = "sve")]
|
||||
// unsafe fn filter_vec_sve_aux(
|
||||
// input: *const u32,
|
||||
// range_start: u32,
|
||||
// range_width: u32,
|
||||
// output: *mut u32,
|
||||
// offset: u32,
|
||||
// num_words: usize,
|
||||
// vl: usize,
|
||||
// ) -> usize {
|
||||
// unsafe {
|
||||
// let all_true = svptrue_b32();
|
||||
// let range_start_simd = svdup_n_u32(range_start);
|
||||
// let range_width_simd = svdup_n_u32(range_width);
|
||||
// // ids_a covers [offset .. offset+vl), ids_b covers the next vl ids.
|
||||
// // Keeping them separate breaks the loop-carried dependency through ids so
|
||||
// // both compact/cntp chains are fully independent within each unrolled body.
|
||||
// let mut ids_a = svindex_u32(offset, 1);
|
||||
// let step = svdup_n_u32(vl as u32);
|
||||
// let step2 = svdup_n_u32(2 * vl as u32);
|
||||
// let mut ids_b = svadd_u32_x(all_true, ids_a, step);
|
||||
|
||||
// let mut input = input;
|
||||
// let mut output_tail = output;
|
||||
|
||||
// // Unrolled ×2: both cntp calls have independent inputs and execute in parallel.
|
||||
// // The two output_tail updates are sequential but together cost 4+1+1=6 cy per
|
||||
// // pair vs 5+5=10 cy for two scalar iterations, breaking the cntp latency chain.
|
||||
// let num_pairs = num_words / 2;
|
||||
// for _ in 0..num_pairs {
|
||||
// let word_a = svld1_u32(all_true, input);
|
||||
// let word_b = svld1_u32(all_true, input.add(vl));
|
||||
|
||||
// let shifted_a = svsub_u32_x(all_true, word_a, range_start_simd);
|
||||
// let shifted_b = svsub_u32_x(all_true, word_b, range_start_simd);
|
||||
|
||||
// let in_range_a = svcmple_u32(all_true, shifted_a, range_width_simd);
|
||||
// let in_range_b = svcmple_u32(all_true, shifted_b, range_width_simd);
|
||||
|
||||
// let compacted_a = svcompact_u32(in_range_a, ids_a);
|
||||
// let compacted_b = svcompact_u32(in_range_b, ids_b);
|
||||
// // cntp_a and cntp_b have independent inputs: OOO engine issues them in parallel.
|
||||
// let added_len_a = svcntp_b32(all_true, in_range_a) as usize;
|
||||
// let added_len_b = svcntp_b32(all_true, in_range_b) as usize;
|
||||
|
||||
// // Write the full vector — only the first added_len slots are valid.
|
||||
// // Subsequent iterations overwrite the trailing zeros before truncate.
|
||||
// svst1_u32(all_true, output_tail, compacted_a);
|
||||
// output_tail = output_tail.add(added_len_a);
|
||||
// svst1_u32(all_true, output_tail, compacted_b);
|
||||
// output_tail = output_tail.add(added_len_b);
|
||||
|
||||
// ids_a = svadd_u32_x(all_true, ids_a, step2);
|
||||
// ids_b = svadd_u32_x(all_true, ids_b, step2);
|
||||
// input = input.add(2 * vl);
|
||||
// }
|
||||
|
||||
// // Handle an odd trailing word.
|
||||
// if num_words % 2 == 1 {
|
||||
// let word = svld1_u32(all_true, input);
|
||||
// let shifted = svsub_u32_x(all_true, word, range_start_simd);
|
||||
// let in_range = svcmple_u32(all_true, shifted, range_width_simd);
|
||||
// let added_len = svcntp_b32(all_true, in_range) as usize;
|
||||
// let compacted_ids = svcompact_u32(in_range, ids_a);
|
||||
// svst1_u32(all_true, output_tail, compacted_ids);
|
||||
// output_tail = output_tail.add(added_len);
|
||||
// }
|
||||
|
||||
// output_tail.offset_from(output) as usize
|
||||
// }
|
||||
// }
|
||||
@@ -301,14 +301,11 @@ pub trait SegmentCollector: 'static {
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
|
||||
/// The query pushes the matched documents to the collector via this method.
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
/// This method is used when the collector does not require scoring.
|
||||
///
|
||||
/// `docs` is a block of matched doc ids. Doc ids are produced in increasing
|
||||
/// order, in windows of [`COLLECT_BLOCK_BUFFER_LEN`](crate::COLLECT_BLOCK_BUFFER_LEN),
|
||||
/// but several windows are accumulated before being flushed here, so the
|
||||
/// block may be larger than `COLLECT_BLOCK_BUFFER_LEN`. Implementations must
|
||||
/// not assume any particular maximum length.
|
||||
/// See [`COLLECT_BLOCK_BUFFER_LEN`](crate::COLLECT_BLOCK_BUFFER_LEN) for the
|
||||
/// buffer size passed to the collector.
|
||||
fn collect_block(&mut self, docs: &[DocId]) {
|
||||
for doc in docs {
|
||||
self.collect(*doc, 0.0);
|
||||
|
||||
@@ -11,14 +11,9 @@ use crate::DocId;
|
||||
/// to compare `[u32; 4]`.
|
||||
pub const TERMINATED: DocId = i32::MAX as u32;
|
||||
|
||||
/// Window size used by [`DocSet::fill_buffer`]: a single `fill_buffer` call
|
||||
/// writes at most this many doc ids, and exactly this many as long as the
|
||||
/// `DocSet` is not exhausted.
|
||||
///
|
||||
/// Note that this is *not* the maximum length of the slice passed to
|
||||
/// `SegmentCollector::collect_block`: the collection loop accumulates several
|
||||
/// such windows into a larger buffer before flushing it, so `collect_block`
|
||||
/// may receive a block larger than `COLLECT_BLOCK_BUFFER_LEN`.
|
||||
/// The collect_block method on `SegmentCollector` uses a buffer of this size.
|
||||
/// Passed results to `collect_block` will not exceed this size and will be
|
||||
/// exactly this size as long as we can fill the buffer.
|
||||
pub const COLLECT_BLOCK_BUFFER_LEN: usize = 64;
|
||||
|
||||
/// Number of `TinySet` (64-bit) buckets in a block used by [`DocSet::fill_bitset_block`].
|
||||
|
||||
@@ -275,8 +275,9 @@ impl Recorder for TfAndPositionRecorder {
|
||||
mod tests {
|
||||
|
||||
use common::write_u32_vint;
|
||||
use stacker::MemoryArena;
|
||||
|
||||
use super::{BufferLender, VInt32Reader};
|
||||
use super::{BufferLender, Recorder, TermFrequencyRecorder, VInt32Reader};
|
||||
|
||||
#[test]
|
||||
fn test_buffer_lender() {
|
||||
@@ -314,4 +315,98 @@ mod tests {
|
||||
let res: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
|
||||
assert_eq!(&res[..], &vals[..]);
|
||||
}
|
||||
|
||||
// ── TermFrequencyRecorder ─────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_has_term_freq() {
|
||||
let rec = TermFrequencyRecorder::default();
|
||||
assert!(
|
||||
rec.has_term_freq(),
|
||||
"TermFrequencyRecorder must advertise term-frequency support"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_term_doc_freq_single_doc() {
|
||||
let mut arena = MemoryArena::default();
|
||||
let mut rec = TermFrequencyRecorder::default();
|
||||
|
||||
// Record one document with two term occurrences.
|
||||
rec.new_doc(0, &mut arena);
|
||||
rec.record_position(0, &mut arena);
|
||||
rec.record_position(1, &mut arena);
|
||||
rec.close_doc(&mut arena);
|
||||
|
||||
assert_eq!(
|
||||
rec.term_doc_freq(),
|
||||
Some(1),
|
||||
"term_doc_freq should be 1 after recording one document"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_term_doc_freq_multiple_docs() {
|
||||
let mut arena = MemoryArena::default();
|
||||
let mut rec = TermFrequencyRecorder::default();
|
||||
|
||||
// Three documents with 1, 3, and 2 occurrences respectively.
|
||||
for (doc, tf) in [(0u32, 1u32), (5, 3), (10, 2)] {
|
||||
rec.new_doc(doc, &mut arena);
|
||||
for pos in 0..tf {
|
||||
rec.record_position(pos, &mut arena);
|
||||
}
|
||||
rec.close_doc(&mut arena);
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
rec.term_doc_freq(),
|
||||
Some(3),
|
||||
"term_doc_freq should equal the number of documents recorded"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_zero_docs() {
|
||||
let rec = TermFrequencyRecorder::default();
|
||||
assert_eq!(
|
||||
rec.term_doc_freq(),
|
||||
Some(0),
|
||||
"term_doc_freq should be 0 before any document is recorded"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_single_occurrence_per_doc() {
|
||||
let mut arena = MemoryArena::default();
|
||||
let mut rec = TermFrequencyRecorder::default();
|
||||
|
||||
// Each document has exactly one occurrence — the minimum non-trivial case.
|
||||
for doc in [1u32, 2, 100] {
|
||||
rec.new_doc(doc, &mut arena);
|
||||
rec.record_position(0, &mut arena);
|
||||
rec.close_doc(&mut arena);
|
||||
}
|
||||
|
||||
assert_eq!(rec.term_doc_freq(), Some(3));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn term_frequency_recorder_high_frequency_doc() {
|
||||
let mut arena = MemoryArena::default();
|
||||
let mut rec = TermFrequencyRecorder::default();
|
||||
|
||||
// A document where the term appears many times.
|
||||
rec.new_doc(42, &mut arena);
|
||||
for pos in 0..1000 {
|
||||
rec.record_position(pos, &mut arena);
|
||||
}
|
||||
rec.close_doc(&mut arena);
|
||||
|
||||
assert_eq!(
|
||||
rec.term_doc_freq(),
|
||||
Some(1),
|
||||
"term_doc_freq counts documents, not occurrences"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::postings::FreqReadingOption;
|
||||
use crate::query::disjunction::Disjunction;
|
||||
@@ -530,12 +531,13 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
) -> crate::Result<()> {
|
||||
let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
|
||||
let num_docs = reader.num_docs();
|
||||
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
|
||||
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs);
|
||||
for_each_docset_buffered(&mut union_scorer, callback);
|
||||
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
|
||||
@@ -543,10 +545,10 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
.map(|term_scorer| Box::new(term_scorer) as Box<dyn Scorer>)
|
||||
.collect();
|
||||
let mut intersection = intersect_scorers(boxed_scorers, num_docs);
|
||||
for_each_docset_buffered(intersection.as_mut(), callback);
|
||||
for_each_docset_buffered(intersection.as_mut(), &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_docset_buffered(scorer.as_mut(), callback);
|
||||
for_each_docset_buffered(scorer.as_mut(), &mut buffer, callback);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::term_scorer::TermScorer;
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::postings::SegmentPostings;
|
||||
@@ -92,11 +92,13 @@ impl Weight for TermWeight {
|
||||
) -> crate::Result<()> {
|
||||
match self.specialized_scorer(reader, 1.0)? {
|
||||
TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => {
|
||||
for_each_docset_buffered(&mut term_scorer, callback);
|
||||
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
|
||||
for_each_docset_buffered(&mut term_scorer, &mut buffer, callback);
|
||||
}
|
||||
TermOrEmptyOrAllScorer::Empty => {}
|
||||
TermOrEmptyOrAllScorer::AllMatch(mut all_scorer) => {
|
||||
for_each_docset_buffered(&mut all_scorer, callback);
|
||||
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
|
||||
for_each_docset_buffered(&mut all_scorer, &mut buffer, callback);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -17,56 +17,18 @@ pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of `COLLECT_BLOCK_BUFFER_LEN`-sized windows accumulated into the large
|
||||
/// buffer before it is flushed to the collector via `collect_block`.
|
||||
const NUM_WINDOWS_PER_BLOCK: usize = 32;
|
||||
/// Size of the buffer accumulated before invoking the callback (2_048 = 32 * 64).
|
||||
/// `fill_buffer` keeps writing `COLLECT_BLOCK_BUFFER_LEN`-sized windows; this only
|
||||
/// changes how much we accumulate before flushing.
|
||||
const LARGE_COLLECT_BUFFER_LEN: usize = COLLECT_BLOCK_BUFFER_LEN * NUM_WINDOWS_PER_BLOCK;
|
||||
|
||||
/// Iterates through all of the documents matched by the `DocSet`, flushing
|
||||
/// blocks of up to `LARGE_COLLECT_BUFFER_LEN` doc ids to `callback`.
|
||||
///
|
||||
/// `fill_buffer` only ever writes `COLLECT_BLOCK_BUFFER_LEN` doc ids at a time,
|
||||
/// so we accumulate several such windows into a single larger buffer before
|
||||
/// handing it to the collector. This amortizes the per-`collect_block` overhead
|
||||
/// (virtual dispatch, aggregation setup) over more documents.
|
||||
/// Iterates through all of the documents matched by the DocSet
|
||||
/// `DocSet`.
|
||||
#[inline]
|
||||
pub(crate) fn for_each_docset_buffered<T: DocSet + ?Sized>(
|
||||
docset: &mut T,
|
||||
buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
|
||||
mut callback: impl FnMut(&[DocId]),
|
||||
) {
|
||||
// Heap-allocated once per call (i.e. once per segment in the no-score path).
|
||||
// `new_zeroed_slice` zeroes directly on the heap, avoiding a 2_048-element
|
||||
// stack temporary.
|
||||
// SAFETY: an all-zero bit pattern is a valid value for every `DocId` (u32),
|
||||
// so the zeroed slice is fully initialized.
|
||||
let mut buffer: Box<[DocId]> =
|
||||
unsafe { Box::new_zeroed_slice(LARGE_COLLECT_BUFFER_LEN).assume_init() };
|
||||
loop {
|
||||
let mut filled = 0;
|
||||
let mut reached_end = false;
|
||||
// Fill the large buffer one `COLLECT_BLOCK_BUFFER_LEN` window at a time.
|
||||
// `chunks_exact_mut` yields windows of exactly `COLLECT_BLOCK_BUFFER_LEN`
|
||||
// because `LARGE_COLLECT_BUFFER_LEN` is a multiple of it (empty remainder).
|
||||
// The windows are contiguous and filled in order, so the doc ids always
|
||||
// occupy the contiguous prefix `buffer[..filled]`.
|
||||
for window in buffer.chunks_exact_mut(COLLECT_BLOCK_BUFFER_LEN) {
|
||||
// SAFETY: each `window` is a slice of exactly `COLLECT_BLOCK_BUFFER_LEN`
|
||||
// elements, so reinterpreting its start pointer as a fixed-size array
|
||||
// reference of that length is valid.
|
||||
let window: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN] =
|
||||
unsafe { &mut *window.as_mut_ptr().cast::<[DocId; COLLECT_BLOCK_BUFFER_LEN]>() };
|
||||
let num_items = docset.fill_buffer(window);
|
||||
filled += num_items;
|
||||
if num_items != COLLECT_BLOCK_BUFFER_LEN {
|
||||
reached_end = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
callback(&buffer[..filled]);
|
||||
if reached_end {
|
||||
let num_items = docset.fill_buffer(buffer);
|
||||
callback(&buffer[..num_items]);
|
||||
if num_items != buffer.len() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -142,7 +104,9 @@ pub trait Weight: Send + Sync + 'static {
|
||||
callback: &mut dyn FnMut(&[DocId]),
|
||||
) -> crate::Result<()> {
|
||||
let mut docset = self.scorer(reader, 1.0)?;
|
||||
for_each_docset_buffered(&mut docset, callback);
|
||||
|
||||
let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
|
||||
for_each_docset_buffered(&mut docset, &mut buffer, callback);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
||||
description = "term hashmap used for indexing"
|
||||
|
||||
[dependencies]
|
||||
murmurhash32 = "0.3"
|
||||
murmurhash32 = "0.4"
|
||||
common = { version = "0.11", path = "../common/", package = "tantivy-common" }
|
||||
ahash = { version = "0.8.11", default-features = false, optional = true }
|
||||
|
||||
|
||||
Reference in New Issue
Block a user