Compare commits

..

41 Commits

Author SHA1 Message Date
Paul Masurel
f5221215b3 blop 2026-01-21 14:46:51 +01:00
Paul Masurel
7d9427e9d6 cleanup 2026-01-21 13:56:54 +01:00
Paul Masurel
62b50bb254 code clenaup 2026-01-21 11:44:27 +01:00
Paul Masurel
2dcd550b74 blop 2026-01-21 11:09:37 +01:00
Paul Masurel
4840886a87 codec 2026-01-21 10:30:35 +01:00
Paul Masurel
1de872ba71 blop 2026-01-20 22:48:45 +01:00
Paul Masurel
93915ce1bf unused removed 2026-01-20 22:43:02 +01:00
Paul Masurel
bf87c54f0e blop 2026-01-20 19:58:55 +01:00
Paul Masurel
4f27b503ed blop 2026-01-20 19:00:57 +01:00
Paul Masurel
0346942174 blop 2026-01-20 14:21:10 +01:00
Paul Masurel
6ec38276a6 Remove unused imports
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 14:20:24 +01:00
Paul Masurel
476960e89b blop 2026-01-20 12:20:32 +01:00
Paul Masurel
955ce6477c blop 2026-01-20 11:27:07 +01:00
Paul Masurel
daaecf2afb blop 2026-01-20 10:42:52 +01:00
Paul Masurel
d64178906f blop 2026-01-20 10:28:57 +01:00
Paul Masurel
f1377018b0 blop 2026-01-19 16:44:47 +01:00
Paul Masurel
9fe96d06af blop 2026-01-19 15:58:09 +01:00
Paul Masurel
d9c4270acb blop 2026-01-19 15:22:50 +01:00
Paul Masurel
16d1611f4d blo 2026-01-19 15:18:28 +01:00
Paul Masurel
b296948bcd blop 2026-01-19 15:09:04 +01:00
Paul Masurel
6b7380eda8 doc freq 2026-01-19 12:18:54 +01:00
Paul Masurel
947459a0a9 tests are compiling 2026-01-16 17:28:17 +01:00
Paul Masurel
eb18182901 blop 2026-01-16 15:57:25 +01:00
Paul Masurel
01d670f60c renaming 2026-01-16 15:44:30 +01:00
Paul Masurel
b77338b590 compiling 2026-01-16 15:18:31 +01:00
Paul Masurel
c75fa94d25 blop 2026-01-16 14:09:21 +01:00
Paul Masurel
cf632673ac blop 2026-01-16 13:56:21 +01:00
Paul Masurel
6f00d96127 blop 2026-01-16 13:45:51 +01:00
Paul Masurel
a5ccb62c99 Removing block postings public accessors 2026-01-16 11:38:01 +01:00
Paul Masurel
c42505a043 added method to lift Postings into TermScorer 2026-01-16 11:21:03 +01:00
Paul Masurel
3e57eb9add Generic TermScorer 2026-01-15 18:06:26 +01:00
Paul Masurel
0955b44ce1 blop 2026-01-15 14:54:42 +01:00
Paul Masurel
783a2a6bef Removing read_block_postings 2026-01-15 12:02:10 +01:00
Paul Masurel
1e3c353e21 blop 2026-01-14 11:43:38 +01:00
Paul Masurel
799e88adbd blop 2026-01-13 20:21:22 +01:00
Paul Masurel
1d5fe6bc7c blop 2026-01-13 19:06:34 +01:00
Paul Masurel
d768b2a491 blop 2026-01-13 17:03:12 +01:00
Paul Masurel
7453df8db3 postings writer enum 2026-01-13 15:24:03 +01:00
Paul Masurel
ba6abba20a First stab at introducing codecs 2026-01-12 19:41:06 +01:00
Paul Masurel
d128e5c2a2 first stab at codec 2026-01-12 15:59:43 +01:00
Paul Masurel
e6d062bf2d Minor refactoring in PostingsSerializer
Removes the Write generics argument in PostingsSerializer.
This removes useless generic.
Prepares the path for codecs.
Removes one useless CountingWrite layer.
etc.
2026-01-12 11:06:45 +01:00
85 changed files with 1054 additions and 2097 deletions

View File

@@ -27,7 +27,7 @@ regex = { version = "1.5.5", default-features = false, features = [
aho-corasick = "1.0"
tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.12", default-features = false, optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
tempfile = { version = "3.12.0", optional = true }
log = "0.4.16"
@@ -50,7 +50,7 @@ fail = { version = "0.5.0", optional = true }
time = { version = "0.3.35", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.16.3"
lru = "0.12.0"
fastdivide = "0.4.0"
itertools = "0.14.0"
measure_time = "0.9.0"
@@ -76,7 +76,7 @@ winapi = "0.3.9"
[dev-dependencies]
binggan = "0.14.2"
rand = "0.9"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
@@ -85,7 +85,7 @@ test-log = "0.2.10"
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.5"
rand_distr = "0.4.3"
time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
postcard = { version = "1.0.4", features = [
"use-std",
@@ -189,13 +189,3 @@ harness = false
[[bench]]
name = "bool_queries_with_range"
harness = false
[[bench]]
name = "str_search_and_get"
harness = false
[[bench]]
name = "merge_segments"
harness = false

View File

@@ -1,8 +1,8 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::distr::weighted::WeightedIndex;
use rand::distributions::WeightedIndex;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::seq::IndexedRandom;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
@@ -532,7 +532,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
// Prepare 1000 unique terms sampled using a Zipf distribution.
// Exponent ~1.1 approximates top-20 terms covering around ~20%.
let terms_1000: Vec<String> = (1..=1000).map(|i| format!("term_{i}")).collect();
let zipf_1000 = rand_distr::Zipf::new(1000.0, 1.1f64).unwrap();
let zipf_1000 = rand_distr::Zipf::new(1000, 1.1f64).unwrap();
{
let mut rng = StdRng::from_seed([1u8; 32]);
@@ -576,8 +576,8 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
}
let _val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.random_range(0.0..1_000_000.0);
let json = if rng.random_bool(0.1) {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
@@ -586,7 +586,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_all_unique_terms => format!("unique_term_{}", rng.random::<u64>()),
text_field_all_unique_terms => format!("unique_term_{}", rng.gen::<u64>()),
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms_status => status_field_data[log_level_distribution.sample(&mut rng)].0,
text_field_1000_terms_zipf => terms_1000[zipf_1000.sample(&mut rng) as usize - 1].as_str(),

View File

@@ -55,29 +55,29 @@ fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (Bench
{
let mut writer = index.writer_with_num_threads(1, 500_000_000).unwrap();
for _ in 0..num_docs {
let has_a = rng.random_bool(p_a as f64);
let has_b = rng.random_bool(p_b as f64);
let has_c = rng.random_bool(p_c as f64);
let score = rng.random_range(0u64..100u64);
let score2 = rng.random_range(0u64..100_000u64);
let has_a = rng.gen_bool(p_a as f64);
let has_b = rng.gen_bool(p_b as f64);
let has_c = rng.gen_bool(p_c as f64);
let score = rng.gen_range(0u64..100u64);
let score2 = rng.gen_range(0u64..100_000u64);
let mut title_tokens: Vec<&str> = Vec::new();
let mut body_tokens: Vec<&str> = Vec::new();
if has_a {
if rng.random_bool(0.1) {
if rng.gen_bool(0.1) {
title_tokens.push("a");
} else {
body_tokens.push("a");
}
}
if has_b {
if rng.random_bool(0.1) {
if rng.gen_bool(0.1) {
title_tokens.push("b");
} else {
body_tokens.push("b");
}
}
if has_c {
if rng.random_bool(0.1) {
if rng.gen_bool(0.1) {
title_tokens.push("c");
} else {
body_tokens.push("c");

View File

@@ -36,13 +36,13 @@ fn build_shared_indices(num_docs: usize, p_title_a: f32, distribution: &str) ->
"dense" => {
for doc_id in 0..num_docs {
// Always add title to avoid empty documents
let title_token = if rng.random_bool(p_title_a as f64) {
let title_token = if rng.gen_bool(p_title_a as f64) {
"a"
} else {
"b"
};
let num_rand = rng.random_range(0u64..1000u64);
let num_rand = rng.gen_range(0u64..1000u64);
let num_asc = (doc_id / 10000) as u64;
@@ -60,13 +60,13 @@ fn build_shared_indices(num_docs: usize, p_title_a: f32, distribution: &str) ->
"sparse" => {
for doc_id in 0..num_docs {
// Always add title to avoid empty documents
let title_token = if rng.random_bool(p_title_a as f64) {
let title_token = if rng.gen_bool(p_title_a as f64) {
"a"
} else {
"b"
};
let num_rand = rng.random_range(0u64..10000000u64);
let num_rand = rng.gen_range(0u64..10000000u64);
let num_asc = doc_id as u64;

View File

@@ -1,224 +0,0 @@
// Benchmarks segment merging
//
// Notes:
// - Input segments are kept intact (no deletes / no IndexWriter merge).
// - Output is written to a `NullDirectory` that discards all files except
// fieldnorms (needed for merging).
use std::collections::HashMap;
use std::io::{self, Write};
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};
use binggan::{black_box, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use tantivy::directory::{
AntiCallToken, Directory, FileHandle, OwnedBytes, TerminatingWrite, WatchCallback, WatchHandle,
WritePtr,
};
use tantivy::indexer::{merge_filtered_segments, NoMergePolicy};
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, HasLen, Index, IndexSettings, Segment};
#[derive(Clone, Default, Debug)]
struct NullDirectory {
blobs: Arc<RwLock<HashMap<PathBuf, OwnedBytes>>>,
}
struct NullWriter;
impl Write for NullWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
Ok(buf.len())
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
impl TerminatingWrite for NullWriter {
fn terminate_ref(&mut self, _token: AntiCallToken) -> io::Result<()> {
Ok(())
}
}
struct InMemoryWriter {
path: PathBuf,
buffer: Vec<u8>,
blobs: Arc<RwLock<HashMap<PathBuf, OwnedBytes>>>,
}
impl Write for InMemoryWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.buffer.extend_from_slice(buf);
Ok(buf.len())
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
impl TerminatingWrite for InMemoryWriter {
fn terminate_ref(&mut self, _token: AntiCallToken) -> io::Result<()> {
let bytes = OwnedBytes::new(std::mem::take(&mut self.buffer));
self.blobs.write().unwrap().insert(self.path.clone(), bytes);
Ok(())
}
}
#[derive(Debug, Default)]
struct NullFileHandle;
impl HasLen for NullFileHandle {
fn len(&self) -> usize {
0
}
}
impl FileHandle for NullFileHandle {
fn read_bytes(&self, _range: std::ops::Range<usize>) -> io::Result<OwnedBytes> {
unimplemented!()
}
}
impl Directory for NullDirectory {
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
if let Some(bytes) = self.blobs.read().unwrap().get(path) {
return Ok(Arc::new(bytes.clone()));
}
Ok(Arc::new(NullFileHandle))
}
fn delete(&self, _path: &Path) -> Result<(), DeleteError> {
Ok(())
}
fn exists(&self, _path: &Path) -> Result<bool, OpenReadError> {
Ok(true)
}
fn open_write(&self, path: &Path) -> Result<WritePtr, OpenWriteError> {
let path_buf = path.to_path_buf();
if path.to_string_lossy().ends_with(".fieldnorm") {
let writer = InMemoryWriter {
path: path_buf,
buffer: Vec::new(),
blobs: Arc::clone(&self.blobs),
};
Ok(io::BufWriter::new(Box::new(writer)))
} else {
Ok(io::BufWriter::new(Box::new(NullWriter)))
}
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
if let Some(bytes) = self.blobs.read().unwrap().get(path) {
return Ok(bytes.as_slice().to_vec());
}
Err(OpenReadError::FileDoesNotExist(path.to_path_buf()))
}
fn atomic_write(&self, _path: &Path, _data: &[u8]) -> io::Result<()> {
Ok(())
}
fn sync_directory(&self) -> io::Result<()> {
Ok(())
}
fn watch(&self, _watch_callback: WatchCallback) -> tantivy::Result<WatchHandle> {
Ok(WatchHandle::empty())
}
}
struct MergeScenario {
#[allow(dead_code)]
index: Index,
segments: Vec<Segment>,
settings: IndexSettings,
label: String,
}
fn build_index(
num_segments: usize,
docs_per_segment: usize,
tokens_per_doc: usize,
vocab_size: usize,
) -> MergeScenario {
let mut schema_builder = Schema::builder();
let body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
assert!(vocab_size > 0);
let total_tokens = num_segments * docs_per_segment * tokens_per_doc;
let use_unique_terms = vocab_size >= total_tokens;
let mut rng = StdRng::from_seed([7u8; 32]);
let mut next_token_id: u64 = 0;
{
let mut writer = index.writer_with_num_threads(1, 256_000_000).unwrap();
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..num_segments {
for _ in 0..docs_per_segment {
let mut tokens = Vec::with_capacity(tokens_per_doc);
for _ in 0..tokens_per_doc {
let token_id = if use_unique_terms {
let id = next_token_id;
next_token_id += 1;
id
} else {
rng.random_range(0..vocab_size as u64)
};
tokens.push(format!("term_{token_id}"));
}
writer.add_document(doc!(body => tokens.join(" "))).unwrap();
}
writer.commit().unwrap();
}
}
let segments = index.searchable_segments().unwrap();
let settings = index.settings().clone();
let label = format!(
"segments={}, docs/seg={}, tokens/doc={}, vocab={}",
num_segments, docs_per_segment, tokens_per_doc, vocab_size
);
MergeScenario {
index,
segments,
settings,
label,
}
}
fn main() {
let scenarios = vec![
build_index(8, 50_000, 12, 8),
build_index(16, 50_000, 12, 8),
build_index(16, 100_000, 12, 8),
build_index(8, 50_000, 8, 8 * 50_000 * 8),
];
let mut runner = BenchRunner::new();
for scenario in scenarios {
let mut group = runner.new_group();
group.set_name(format!("merge_segments — {}", scenario.label));
let segments = scenario.segments.clone();
let settings = scenario.settings.clone();
group.register("merge", move |_| {
let output_dir = NullDirectory::default();
let filter_doc_ids = vec![None; segments.len()];
let merged_index =
merge_filtered_segments(&segments, settings.clone(), filter_doc_ids, output_dir)
.unwrap();
black_box(merged_index);
});
group.run();
}
}

View File

@@ -33,7 +33,7 @@ fn build_shared_indices(num_docs: usize, distribution: &str) -> BenchIndex {
match distribution {
"dense" => {
for doc_id in 0..num_docs {
let num_rand = rng.random_range(0u64..1000u64);
let num_rand = rng.gen_range(0u64..1000u64);
let num_asc = (doc_id / 10000) as u64;
writer
@@ -46,7 +46,7 @@ fn build_shared_indices(num_docs: usize, distribution: &str) -> BenchIndex {
}
"sparse" => {
for doc_id in 0..num_docs {
let num_rand = rng.random_range(0u64..10000000u64);
let num_rand = rng.gen_range(0u64..10000000u64);
let num_asc = doc_id as u64;
writer

View File

@@ -97,20 +97,20 @@ fn get_index_0_to_100() -> Index {
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id_name = if rng.random_bool(0.01) {
let id_name = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.random_bool(0.1) {
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"most".to_string() // 90%
};
Doc {
id_name,
id: rng.random_range(0..100),
id: rng.gen_range(0..100),
// Multiply by 1000, so that we create most buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.random_range(0..100) * 1000),
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();

View File

@@ -1,421 +0,0 @@
// This benchmark compares different approaches for retrieving string values:
//
// 1. Fast Field Approach: retrieves string values via term_ords() and ord_to_str()
//
// 2. Doc Store Approach: retrieves string values via searcher.doc() and field extraction
//
// The benchmark includes various data distributions:
// - Dense Sequential: Sequential document IDs with dense data
// - Dense Random: Random document IDs with dense data
// - Sparse Sequential: Sequential document IDs with sparse data
// - Sparse Random: Random document IDs with sparse data
use std::ops::Bound;
use binggan::{black_box, BenchGroup, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, DocSetCollector};
use tantivy::query::RangeQuery;
use tantivy::schema::document::TantivyDocument;
use tantivy::schema::{Schema, Value, FAST, STORED, STRING};
use tantivy::{doc, Index, ReloadPolicy, Searcher, Term};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
}
fn build_shared_indices(num_docs: usize, distribution: &str) -> BenchIndex {
// Schema with string fast field and stored field for doc access
let mut schema_builder = Schema::builder();
let f_str_fast = schema_builder.add_text_field("str_fast", STRING | STORED | FAST);
let f_str_stored = schema_builder.add_text_field("str_stored", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
{
let mut writer = index.writer_with_num_threads(1, 4_000_000_000).unwrap();
match distribution {
"dense_random" => {
for _doc_id in 0..num_docs {
let suffix = rng.random_range(0u64..1000u64);
let str_val = format!("str_{:03}", suffix);
writer
.add_document(doc!(
f_str_fast=>str_val.clone(),
f_str_stored=>str_val,
))
.unwrap();
}
}
"dense_sequential" => {
for doc_id in 0..num_docs {
let suffix = doc_id as u64 % 1000;
let str_val = format!("str_{:03}", suffix);
writer
.add_document(doc!(
f_str_fast=>str_val.clone(),
f_str_stored=>str_val,
))
.unwrap();
}
}
"sparse_random" => {
for _doc_id in 0..num_docs {
let suffix = rng.random_range(0u64..1000000u64);
let str_val = format!("str_{:07}", suffix);
writer
.add_document(doc!(
f_str_fast=>str_val.clone(),
f_str_stored=>str_val,
))
.unwrap();
}
}
"sparse_sequential" => {
for doc_id in 0..num_docs {
let suffix = doc_id as u64;
let str_val = format!("str_{:07}", suffix);
writer
.add_document(doc!(
f_str_fast=>str_val.clone(),
f_str_stored=>str_val,
))
.unwrap();
}
}
_ => {
panic!("Unsupported distribution type");
}
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
BenchIndex { index, searcher }
}
fn main() {
// Prepare corpora with varying scenarios
let scenarios = vec![
(
"dense_random_search_low_range".to_string(),
1_000_000,
"dense_random",
0,
9,
),
(
"dense_random_search_high_range".to_string(),
1_000_000,
"dense_random",
990,
999,
),
(
"dense_sequential_search_low_range".to_string(),
1_000_000,
"dense_sequential",
0,
9,
),
(
"dense_sequential_search_high_range".to_string(),
1_000_000,
"dense_sequential",
990,
999,
),
(
"sparse_random_search_low_range".to_string(),
1_000_000,
"sparse_random",
0,
9999,
),
(
"sparse_random_search_high_range".to_string(),
1_000_000,
"sparse_random",
990_000,
999_999,
),
(
"sparse_sequential_search_low_range".to_string(),
1_000_000,
"sparse_sequential",
0,
9999,
),
(
"sparse_sequential_search_high_range".to_string(),
1_000_000,
"sparse_sequential",
990_000,
999_999,
),
];
let mut runner = BenchRunner::new();
for (scenario_id, n, distribution, range_low, range_high) in scenarios {
let bench_index = build_shared_indices(n, distribution);
let mut group = runner.new_group();
group.set_name(scenario_id);
let field = bench_index.searcher.schema().get_field("str_fast").unwrap();
let (lower_str, upper_str) =
if distribution == "dense_sequential" || distribution == "dense_random" {
(
format!("str_{:03}", range_low),
format!("str_{:03}", range_high),
)
} else {
(
format!("str_{:07}", range_low),
format!("str_{:07}", range_high),
)
};
let lower_term = Term::from_field_text(field, &lower_str);
let upper_term = Term::from_field_text(field, &upper_str);
let query = RangeQuery::new(Bound::Included(lower_term), Bound::Included(upper_term));
run_benchmark_tasks(&mut group, &bench_index, query, range_low, range_high);
group.run();
}
}
/// Run all benchmark tasks for a given range query
fn run_benchmark_tasks(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
range_low: u64,
range_high: u64,
) {
// Test count of matching documents
add_bench_task_count(
bench_group,
bench_index,
query.clone(),
range_low,
range_high,
);
// Test fetching all DocIds of matching documents
add_bench_task_docset(
bench_group,
bench_index,
query.clone(),
range_low,
range_high,
);
// Test fetching all string fast field values of matching documents
add_bench_task_fetch_all_strings(
bench_group,
bench_index,
query.clone(),
range_low,
range_high,
);
// Test fetching all string values of matching documents through doc() method
add_bench_task_fetch_all_strings_from_doc(
bench_group,
bench_index,
query,
range_low,
range_high,
);
}
fn add_bench_task_count(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
range_low: u64,
range_high: u64,
) {
let task_name = format!("string_search_count_[{}-{}]", range_low, range_high);
let search_task = CountSearchTask {
searcher: bench_index.searcher.clone(),
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
fn add_bench_task_docset(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
range_low: u64,
range_high: u64,
) {
let task_name = format!("string_fetch_all_docset_[{}-{}]", range_low, range_high);
let search_task = DocSetSearchTask {
searcher: bench_index.searcher.clone(),
query,
};
bench_group.register(task_name, move |_| black_box(search_task.run()));
}
fn add_bench_task_fetch_all_strings(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
range_low: u64,
range_high: u64,
) {
let task_name = format!(
"string_fastfield_fetch_all_strings_[{}-{}]",
range_low, range_high
);
let search_task = FetchAllStringsSearchTask {
searcher: bench_index.searcher.clone(),
query,
};
bench_group.register(task_name, move |_| {
let result = black_box(search_task.run());
result.len()
});
}
fn add_bench_task_fetch_all_strings_from_doc(
bench_group: &mut BenchGroup,
bench_index: &BenchIndex,
query: RangeQuery,
range_low: u64,
range_high: u64,
) {
let task_name = format!(
"string_doc_fetch_all_strings_[{}-{}]",
range_low, range_high
);
let search_task = FetchAllStringsFromDocTask {
searcher: bench_index.searcher.clone(),
query,
};
bench_group.register(task_name, move |_| {
let result = black_box(search_task.run());
result.len()
});
}
struct CountSearchTask {
searcher: Searcher,
query: RangeQuery,
}
impl CountSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
self.searcher.search(&self.query, &Count).unwrap()
}
}
struct DocSetSearchTask {
searcher: Searcher,
query: RangeQuery,
}
impl DocSetSearchTask {
#[inline(never)]
pub fn run(&self) -> usize {
let result = self.searcher.search(&self.query, &DocSetCollector).unwrap();
result.len()
}
}
struct FetchAllStringsSearchTask {
searcher: Searcher,
query: RangeQuery,
}
impl FetchAllStringsSearchTask {
#[inline(never)]
pub fn run(&self) -> Vec<String> {
let doc_addresses = self.searcher.search(&self.query, &DocSetCollector).unwrap();
let mut docs = doc_addresses.into_iter().collect::<Vec<_>>();
docs.sort();
let mut strings = Vec::with_capacity(docs.len());
for doc_address in docs {
let segment_reader = &self.searcher.segment_readers()[doc_address.segment_ord as usize];
let str_column_opt = segment_reader.fast_fields().str("str_fast");
if let Ok(Some(str_column)) = str_column_opt {
let doc_id = doc_address.doc_id;
let term_ord = str_column.term_ords(doc_id).next().unwrap();
let mut str_buffer = String::new();
if str_column.ord_to_str(term_ord, &mut str_buffer).is_ok() {
strings.push(str_buffer);
}
}
}
strings
}
}
struct FetchAllStringsFromDocTask {
searcher: Searcher,
query: RangeQuery,
}
impl FetchAllStringsFromDocTask {
#[inline(never)]
pub fn run(&self) -> Vec<String> {
let doc_addresses = self.searcher.search(&self.query, &DocSetCollector).unwrap();
let mut docs = doc_addresses.into_iter().collect::<Vec<_>>();
docs.sort();
let mut strings = Vec::with_capacity(docs.len());
let str_stored_field = self
.searcher
.schema()
.get_field("str_stored")
.expect("str_stored field should exist");
for doc_address in docs {
// Get the document from the doc store (row store access)
if let Ok(doc) = self.searcher.doc::<TantivyDocument>(doc_address) {
// Extract string values from the stored field
if let Some(field_value) = doc.get_first(str_stored_field) {
if let Some(text) = field_value.as_value().as_str() {
strings.push(text.to_string());
}
}
}
}
strings
}
}

View File

@@ -18,5 +18,5 @@ homepage = "https://github.com/quickwit-oss/tantivy"
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker1x"] }
[dev-dependencies]
rand = "0.9"
rand = "0.8"
proptest = "1"

View File

@@ -4,8 +4,8 @@ extern crate test;
#[cfg(test)]
mod tests {
use rand::rng;
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_bitpacker::{BitPacker, BitUnpacker, BlockedBitpacker};
use test::Bencher;
@@ -27,7 +27,7 @@ mod tests {
let num_els = 1_000_000u32;
let bit_unpacker = BitUnpacker::new(bit_width);
let data = create_bitpacked_data(bit_width, num_els);
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut rng(), 100_000);
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut thread_rng(), 100_000);
b.iter(|| {
let mut out = 0u64;
for &idx in &idxs {

View File

@@ -22,7 +22,7 @@ downcast-rs = "2.0.1"
[dev-dependencies]
proptest = "1"
more-asserts = "0.3.1"
rand = "0.9"
rand = "0.8"
binggan = "0.14.0"
[[bench]]

View File

@@ -9,7 +9,7 @@ use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_co
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55_000_u64)
.map(|num| num + rng.random::<u8>() as u64)
.map(|num| num + rng.r#gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);

View File

@@ -6,7 +6,7 @@ use tantivy_columnar::column_values::{CodecType, serialize_u64_based_column_valu
fn get_data() -> Vec<u64> {
let mut rng = StdRng::seed_from_u64(2u64);
let mut data: Vec<_> = (100..55_000_u64)
.map(|num| num + rng.random::<u8>() as u64)
.map(|num| num + rng.r#gen::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);

View File

@@ -8,7 +8,7 @@ const TOTAL_NUM_VALUES: u32 = 1_000_000;
fn gen_optional_index(fill_ratio: f64) -> OptionalIndex {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let vals: Vec<u32> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.random_bool(fill_ratio))
.map(|_| rng.gen_bool(fill_ratio))
.enumerate()
.filter(|(_pos, val)| *val)
.map(|(pos, _)| pos as u32)
@@ -25,7 +25,7 @@ fn random_range_iterator(
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start;
std::iter::from_fn(move || {
current += rng.random_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation);
if current >= end { None } else { Some(current) }
})
}

View File

@@ -39,7 +39,7 @@ fn get_data_50percent_item() -> Vec<u128> {
let mut data = vec![];
for _ in 0..300_000 {
let val = rng.random_range(1..=100);
let val = rng.gen_range(1..=100);
data.push(val);
}
data.push(SINGLE_ITEM);

View File

@@ -34,7 +34,7 @@ fn get_data_50percent_item() -> Vec<u128> {
let mut data = vec![];
for _ in 0..300_000 {
let val = rng.random_range(1..=100);
let val = rng.gen_range(1..=100);
data.push(val);
}
data.push(SINGLE_ITEM);

View File

@@ -268,7 +268,7 @@ mod tests {
#[test]
fn linear_interpol_fast_field_rand() {
let mut rng = rand::rng();
let mut rng = rand::thread_rng();
for _ in 0..50 {
let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
create_and_validate::<LinearCodec>(&data, "random");

View File

@@ -122,7 +122,7 @@ pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
assert_eq!(vals, buffer);
if !vals.is_empty() {
let test_rand_idx = rand::rng().random_range(0..=vals.len() - 1);
let test_rand_idx = rand::thread_rng().gen_range(0..=vals.len() - 1);
let expected_positions: Vec<u32> = vals
.iter()
.enumerate()

View File

@@ -21,5 +21,5 @@ serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies]
binggan = "0.14.0"
proptest = "1.0.0"
rand = "0.9"
rand = "0.8.4"

View File

@@ -1,6 +1,6 @@
use binggan::{BenchRunner, black_box};
use rand::rng;
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::{BitSet, TinySet, serialize_vint_u32};
fn bench_vint() {
@@ -17,7 +17,7 @@ fn bench_vint() {
black_box(out);
});
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut rng(), 100_000);
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
runner.bench_function("bench_vint_rand", move |_| {
let mut out = 0u64;
for val in vals.iter().cloned() {

View File

@@ -297,9 +297,6 @@ impl BitSet {
.map(|delta_bucket| bucket + delta_bucket as u32)
}
/// Returns the maximum number of elements in the bitset.
///
/// Warning: The largest element the bitset can contain is `max_value - 1`.
#[inline]
pub fn max_value(&self) -> u32 {
self.max_value
@@ -417,7 +414,7 @@ mod tests {
use std::collections::HashSet;
use ownedbytes::OwnedBytes;
use rand::distr::Bernoulli;
use rand::distributions::Bernoulli;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};

View File

@@ -560,7 +560,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(
(
value((), tag(">=")),
map(word_infallible(")", false), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -574,7 +574,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<=")),
map(word_infallible(")", false), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -588,7 +588,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag(">")),
map(word_infallible(")", false), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -602,7 +602,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<")),
map(word_infallible(")", false), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -1323,14 +1323,6 @@ mod test {
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
test_parse_query_to_ast_helper("(<=42)", "{\"*\" TO \"42\"]");
test_parse_query_to_ast_helper("(<=42 )", "{\"*\" TO \"42\"]");
test_parse_query_to_ast_helper("(age:>5)", "\"age\":{\"5\" TO \"*\"}");
test_parse_query_to_ast_helper(
"(title:bar AND age:>12)",
"(+\"title\":bar +\"age\":{\"12\" TO \"*\"})",
);
}
#[test]

View File

@@ -11,11 +11,7 @@ pub use standard::StandardCodec;
use crate::codec::postings::PostingsCodec;
use crate::fieldnorm::FieldNormReader;
use crate::postings::{Postings, TermInfo};
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::term_query::TermScorer;
use crate::query::{
box_scorer, Bm25Weight, BufferedUnionScorer, PhraseScorer, Scorer, SumCombiner,
};
use crate::query::{box_scorer, Bm25Weight, Scorer};
use crate::schema::IndexRecordOption;
use crate::{DocId, InvertedIndexReader, Score};
@@ -26,9 +22,8 @@ pub trait Codec: Clone + std::fmt::Debug + Send + Sync + 'static {
/// The specific postings type used by this codec.
type PostingsCodec: PostingsCodec;
/// ID of the codec. It should be unique to your codec.
/// Make it human-readable, descriptive, short and unique.
const ID: &'static str;
/// Name of the codec. It should be unique to your codec.
const NAME: &'static str;
/// Load codec based on the codec configuration.
fn from_json_props(json_value: &serde_json::Value) -> crate::Result<Self>;
@@ -53,7 +48,7 @@ pub trait ObjectSafeCodec: 'static + Send + Sync {
&self,
term_info: &TermInfo,
option: IndexRecordOption,
inverted_index_reader: &dyn InvertedIndexReader,
inverted_index_reader: &InvertedIndexReader,
) -> io::Result<Box<dyn Postings>>;
/// Loads a type-erased TermScorer object for the given term.
@@ -68,7 +63,7 @@ pub trait ObjectSafeCodec: 'static + Send + Sync {
&self,
term_info: &TermInfo,
option: IndexRecordOption,
inverted_index_reader: &dyn InvertedIndexReader,
inverted_index_reader: &InvertedIndexReader,
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
) -> io::Result<Box<dyn Scorer>>;
@@ -87,7 +82,7 @@ pub trait ObjectSafeCodec: 'static + Send + Sync {
similarity_weight: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
inverted_index_reader: &dyn InvertedIndexReader,
inverted_index_reader: &InvertedIndexReader,
) -> io::Result<Box<dyn Scorer>>;
/// Performs a for_each_pruning operation on the given scorer.
@@ -106,15 +101,6 @@ pub trait ObjectSafeCodec: 'static + Send + Sync {
scorer: Box<dyn Scorer>,
callback: &mut dyn FnMut(DocId, Score) -> Score,
);
/// Builds a union scorer possibly specialized if
/// all scorers are `Term<Self::Postings>`.
fn build_union_scorer_with_sum_combiner(
&self,
scorers: Vec<Box<dyn Scorer>>,
num_docs: DocId,
score_combiner_type: SumOrDoNothingCombiner,
) -> Box<dyn Scorer>;
}
impl<TCodec: Codec> ObjectSafeCodec for TCodec {
@@ -122,16 +108,10 @@ impl<TCodec: Codec> ObjectSafeCodec for TCodec {
&self,
term_info: &TermInfo,
option: IndexRecordOption,
inverted_index_reader: &dyn InvertedIndexReader,
inverted_index_reader: &InvertedIndexReader,
) -> io::Result<Box<dyn Postings>> {
let postings_data = inverted_index_reader.read_postings_data(term_info, option)?;
let postings = self.postings_codec().load_postings(
term_info.doc_freq,
postings_data.postings_data,
postings_data.record_option,
postings_data.effective_option,
postings_data.positions_data,
)?;
let postings = inverted_index_reader
.read_postings_from_terminfo_specialized(term_info, option, self)?;
Ok(Box::new(postings))
}
@@ -139,19 +119,17 @@ impl<TCodec: Codec> ObjectSafeCodec for TCodec {
&self,
term_info: &TermInfo,
option: IndexRecordOption,
inverted_index_reader: &dyn InvertedIndexReader,
inverted_index_reader: &InvertedIndexReader,
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
) -> io::Result<Box<dyn Scorer>> {
let postings_data = inverted_index_reader.read_postings_data(term_info, option)?;
let postings = self.postings_codec().load_postings(
term_info.doc_freq,
postings_data.postings_data,
postings_data.record_option,
postings_data.effective_option,
postings_data.positions_data,
let scorer = inverted_index_reader.new_term_scorer_specialized(
term_info,
option,
fieldnorm_reader,
similarity_weight,
self,
)?;
let scorer = TermScorer::new(postings, fieldnorm_reader, similarity_weight);
Ok(box_scorer(scorer))
}
@@ -161,72 +139,18 @@ impl<TCodec: Codec> ObjectSafeCodec for TCodec {
similarity_weight: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
inverted_index_reader: &dyn InvertedIndexReader,
inverted_index_reader: &InvertedIndexReader,
) -> io::Result<Box<dyn Scorer>> {
let mut offset_and_term_postings: Vec<(
usize,
<<Self as Codec>::PostingsCodec as PostingsCodec>::Postings,
)> = Vec::with_capacity(term_infos.len());
for (offset, term_info) in term_infos {
let postings_data = inverted_index_reader
.read_postings_data(term_info, IndexRecordOption::WithFreqsAndPositions)?;
let postings = self.postings_codec().load_postings(
term_info.doc_freq,
postings_data.postings_data,
postings_data.record_option,
postings_data.effective_option,
postings_data.positions_data,
)?;
offset_and_term_postings.push((*offset, postings));
}
let scorer = PhraseScorer::new(
offset_and_term_postings,
let scorer = inverted_index_reader.new_phrase_scorer_type_specialized(
term_infos,
similarity_weight,
fieldnorm_reader,
slop,
);
self,
)?;
Ok(box_scorer(scorer))
}
fn build_union_scorer_with_sum_combiner(
&self,
scorers: Vec<Box<dyn Scorer>>,
num_docs: DocId,
sum_or_do_nothing_combiner: SumOrDoNothingCombiner,
) -> Box<dyn Scorer> {
if !scorers.iter().all(|scorer| {
scorer.is::<TermScorer<<<Self as Codec>::PostingsCodec as PostingsCodec>::Postings>>()
}) {
return box_scorer(BufferedUnionScorer::build(
scorers,
SumCombiner::default,
num_docs,
));
}
let specialized_scorers: Vec<
TermScorer<<<Self as Codec>::PostingsCodec as PostingsCodec>::Postings>,
> = scorers
.into_iter()
.map(|scorer| {
*scorer.downcast::<TermScorer<_>>().ok().expect(
"Downcast failed despite the fact we already checked the type was correct",
)
})
.collect();
match sum_or_do_nothing_combiner {
SumOrDoNothingCombiner::Sum => box_scorer(BufferedUnionScorer::build(
specialized_scorers,
SumCombiner::default,
num_docs,
)),
SumOrDoNothingCombiner::DoNothing => box_scorer(BufferedUnionScorer::build(
specialized_scorers,
DoNothingCombiner::default,
num_docs,
)),
}
}
fn for_each_pruning(
&self,
threshold: Score,
@@ -243,12 +167,3 @@ impl<TCodec: Codec> ObjectSafeCodec for TCodec {
}
}
}
/// SumCombiner or DoNothingCombiner
#[derive(Copy, Clone)]
pub enum SumOrDoNothingCombiner {
/// Sum scores together
Sum,
/// Do not track any score.
DoNothing,
}

View File

@@ -10,10 +10,19 @@ use crate::query::{Bm25Weight, Scorer};
use crate::schema::IndexRecordOption;
use crate::{DocId, Score};
/// Postings codec (read path).
/// Postings codec.
pub trait PostingsCodec: Send + Sync + 'static {
/// Serializer type for the postings codec.
type PostingsSerializer: PostingsSerializer;
/// Postings type for the postings codec.
type Postings: Postings + Clone;
/// Creates a new postings serializer.
fn new_serializer(
&self,
avg_fieldnorm: Score,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> Self::PostingsSerializer;
/// Loads postings
///
@@ -54,6 +63,45 @@ pub trait PostingsCodec: Send + Sync + 'static {
}
}
/// A postings serializer is a listener that is in charge of serializing postings
///
/// IO is done only once per postings, once all of the data has been received.
/// A serializer will therefore contain internal buffers.
///
/// A serializer is created once and recycled for all postings.
///
/// Clients should use PostingsSerializer as follows.
/// ```
/// // First postings list
/// serializer.new_term(2, true);
/// serializer.write_doc(2, 1);
/// serializer.write_doc(6, 2);
/// serializer.close_term(3);
/// serializer.clear();
/// // Second postings list
/// serializer.new_term(1, true);
/// serializer.write_doc(3, 1);
/// serializer.close_term(3);
/// ```
pub trait PostingsSerializer {
/// The term_doc_freq here is the number of documents
/// in the postings lists.
///
/// It can be used to compute the idf that will be used for the
/// blockmax parameters.
///
/// If not available (e.g. if we do not collect `term_frequencies`
/// blockwand is disabled), the term_doc_freq passed will be set 0.
fn new_term(&mut self, term_doc_freq: u32, record_term_freq: bool);
/// Records a new document id for the current term.
/// The serializer may ignore it.
fn write_doc(&mut self, doc_id: DocId, term_freq: u32);
/// Closes the current term and writes the postings list associated.
fn close_term(&mut self, doc_freq: u32, wrt: &mut impl io::Write) -> io::Result<()>;
}
/// A light complement interface to Postings to allow block-max wand acceleration.
pub trait PostingsWithBlockMax: Postings {
/// Moves the postings to the block containign `target_doc` and returns
@@ -62,12 +110,8 @@ pub trait PostingsWithBlockMax: Postings {
/// `Warning`: Calling this method may leave the postings in an invalid state.
/// callers are required to call seek before calling any other of the
/// `Postings` method (like doc / advance etc.).
fn seek_block_max(
&mut self,
target_doc: crate::DocId,
fieldnorm_reader: &FieldNormReader,
similarity_weight: &Bm25Weight,
) -> Score;
fn seek_block_max(&mut self, target_doc: crate::DocId, similarity_weight: &Bm25Weight)
-> Score;
/// Returns the last document in the current block (or Terminated if this
/// is the last block).

View File

@@ -13,7 +13,7 @@ pub struct StandardCodec;
impl Codec for StandardCodec {
type PostingsCodec = StandardPostingsCodec;
const ID: &'static str = "tantivy-default";
const NAME: &'static str = "standard";
fn from_json_props(json_value: &serde_json::Value) -> crate::Result<Self> {
if !json_value.is_null() {

View File

@@ -0,0 +1,50 @@
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::DocId;
pub struct Block {
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
len: usize,
}
impl Block {
pub fn new() -> Self {
Block {
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
len: 0,
}
}
pub fn doc_ids(&self) -> &[DocId] {
&self.doc_ids[..self.len]
}
pub fn term_freqs(&self) -> &[u32] {
&self.term_freqs[..self.len]
}
pub fn clear(&mut self) {
self.len = 0;
}
pub fn append_doc(&mut self, doc: DocId, term_freq: u32) {
let len = self.len;
self.doc_ids[len] = doc;
self.term_freqs[len] = term_freq;
self.len = len + 1;
}
pub fn is_full(&self) -> bool {
self.len == COMPRESSION_BLOCK_SIZE
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
pub fn last_doc(&self) -> DocId {
assert_eq!(self.len, COMPRESSION_BLOCK_SIZE);
self.doc_ids[COMPRESSION_BLOCK_SIZE - 1]
}
}

View File

@@ -2,10 +2,9 @@ use std::io;
use common::{OwnedBytes, VInt};
use crate::codec::standard::postings::skip::{BlockInfo, SkipReader};
use crate::codec::standard::postings::FreqReadingOption;
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{BlockDecoder, VIntDecoder as _, COMPRESSION_BLOCK_SIZE};
use crate::postings::skip::{BlockInfo, SkipReader};
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
@@ -130,10 +129,6 @@ impl BlockSegmentPostings {
}
}
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
it.next().map(|first| it.fold(first, Score::max))
}
impl BlockSegmentPostings {
/// Returns the overall number of documents in the block postings.
/// It does not take in account whether documents are deleted or not.
@@ -214,11 +209,7 @@ impl BlockSegmentPostings {
/// after having called `.shallow_advance(..)`.
///
/// See `TermScorer::block_max_score(..)` for more information.
pub fn block_max_score(
&mut self,
fieldnorm_reader: &FieldNormReader,
bm25_weight: &Bm25Weight,
) -> Score {
pub fn block_max_score(&mut self, bm25_weight: &Bm25Weight) -> Score {
if let Some(score) = self.block_max_score_cache {
return score;
}
@@ -228,21 +219,9 @@ impl BlockSegmentPostings {
self.block_max_score_cache = Some(skip_reader_max_score);
return skip_reader_max_score;
}
// this is the last block of the segment posting list.
// If it is actually loaded, we can compute block max manually.
if self.block_loaded {
let docs = self.doc_decoder.output_array().iter().cloned();
let freqs = self.freq_decoder.output_array().iter().cloned();
let bm25_scores = docs.zip(freqs).map(|(doc, term_freq)| {
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
bm25_weight.score(fieldnorm_id, term_freq)
});
let block_max_score = max_score(bm25_scores).unwrap_or(0.0);
self.block_max_score_cache = Some(block_max_score);
return block_max_score;
}
// We do not have access to any good block max value. We return bm25_weight.max_score()
// as it is a valid upperbound.
// We do not have access to any good block max value.
// It happens if this is the last block.
// We return bm25_weight.max_score() as it is a valid upperbound.
//
// We do not cache it however, so that it gets computed when once block is loaded.
bm25_weight.max_score()
@@ -337,17 +316,18 @@ mod tests {
use common::OwnedBytes;
use super::BlockSegmentPostings;
use crate::codec::postings::PostingsSerializer;
use crate::codec::standard::postings::segment_postings::SegmentPostings;
use crate::codec::standard::postings::StandardPostingsSerializer;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::serializer::PostingsSerializer;
use crate::schema::IndexRecordOption;
#[cfg(test)]
fn build_block_postings(docs: &[u32]) -> BlockSegmentPostings {
let doc_freq = docs.len() as u32;
let mut postings_serializer =
PostingsSerializer::new(1.0f32, IndexRecordOption::Basic, None);
StandardPostingsSerializer::new(1.0f32, IndexRecordOption::Basic, None);
postings_serializer.new_term(docs.len() as u32, false);
for doc in docs {
postings_serializer.write_doc(*doc, 1u32);

View File

@@ -4,16 +4,21 @@ use crate::codec::postings::block_wand::{block_wand, block_wand_single_scorer};
use crate::codec::postings::PostingsCodec;
use crate::codec::standard::postings::block_segment_postings::BlockSegmentPostings;
pub use crate::codec::standard::postings::segment_postings::SegmentPostings;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionReader;
use crate::query::term_query::TermScorer;
use crate::query::{BufferedUnionScorer, Scorer, SumCombiner};
use crate::schema::IndexRecordOption;
use crate::{DocSet as _, Score, TERMINATED};
mod block;
mod block_segment_postings;
mod segment_postings;
mod skip;
mod standard_postings_serializer;
pub use segment_postings::SegmentPostings as StandardPostings;
pub use standard_postings_serializer::StandardPostingsSerializer;
/// The default postings codec for tantivy.
pub struct StandardPostingsCodec;
@@ -27,8 +32,18 @@ pub(crate) enum FreqReadingOption {
}
impl PostingsCodec for StandardPostingsCodec {
type PostingsSerializer = StandardPostingsSerializer;
type Postings = SegmentPostings;
fn new_serializer(
&self,
avg_fieldnorm: Score,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> Self::PostingsSerializer {
StandardPostingsSerializer::new(avg_fieldnorm, mode, fieldnorm_reader)
}
fn load_postings(
&self,
doc_freq: u32,
@@ -61,7 +76,14 @@ impl PostingsCodec for StandardPostingsCodec {
Err(scorer) => scorer,
};
let mut union_scorer =
scorer.downcast::<BufferedUnionScorer<TermScorer<Self::Postings>, SumCombiner>>()?;
scorer.downcast::<BufferedUnionScorer<Box<dyn Scorer>, SumCombiner>>()?;
if !union_scorer
.scorers()
.iter()
.all(|scorer| scorer.is::<TermScorer<Self::Postings>>())
{
return Err(union_scorer);
}
let doc = union_scorer.doc();
if doc == TERMINATED {
return Ok(());
@@ -70,65 +92,16 @@ impl PostingsCodec for StandardPostingsCodec {
if score > threshold {
threshold = callback(doc, score);
}
let scorers: Vec<TermScorer<Self::Postings>> = union_scorer.into_scorers();
let boxed_scorers: Vec<Box<dyn Scorer>> = union_scorer.into_scorers();
let scorers: Vec<TermScorer<Self::Postings>> = boxed_scorers
.into_iter()
.map(|scorer| {
*scorer.downcast::<TermScorer<Self::Postings>>().ok().expect(
"Downcast failed despite the fact we already checked the type was correct",
)
})
.collect();
block_wand(scorers, threshold, callback);
Ok(())
}
}
#[cfg(test)]
mod tests {
use common::OwnedBytes;
use super::*;
use crate::postings::serializer::PostingsSerializer;
use crate::postings::Postings as _;
fn test_segment_postings_tf_aux(num_docs: u32, include_term_freq: bool) -> SegmentPostings {
let mut postings_serializer =
PostingsSerializer::new(1.0f32, IndexRecordOption::WithFreqs, None);
let mut buffer = Vec::new();
postings_serializer.new_term(num_docs, include_term_freq);
for i in 0..num_docs {
postings_serializer.write_doc(i, 2);
}
postings_serializer
.close_term(num_docs, &mut buffer)
.unwrap();
StandardPostingsCodec
.load_postings(
num_docs,
OwnedBytes::new(buffer),
IndexRecordOption::WithFreqs,
IndexRecordOption::WithFreqs,
None,
)
.unwrap()
}
#[test]
fn test_segment_postings_small_block_with_and_without_freq() {
let small_block_without_term_freq = test_segment_postings_tf_aux(1, false);
assert!(!small_block_without_term_freq.has_freq());
assert_eq!(small_block_without_term_freq.doc(), 0);
assert_eq!(small_block_without_term_freq.term_freq(), 1);
let small_block_with_term_freq = test_segment_postings_tf_aux(1, true);
assert!(small_block_with_term_freq.has_freq());
assert_eq!(small_block_with_term_freq.doc(), 0);
assert_eq!(small_block_with_term_freq.term_freq(), 2);
}
#[test]
fn test_segment_postings_large_block_with_and_without_freq() {
let large_block_without_term_freq = test_segment_postings_tf_aux(128, false);
assert!(!large_block_without_term_freq.has_freq());
assert_eq!(large_block_without_term_freq.doc(), 0);
assert_eq!(large_block_without_term_freq.term_freq(), 1);
let large_block_with_term_freq = test_segment_postings_tf_aux(128, true);
assert!(large_block_with_term_freq.has_freq());
assert_eq!(large_block_with_term_freq.doc(), 0);
assert_eq!(large_block_with_term_freq.term_freq(), 2);
}
}

View File

@@ -1,9 +1,8 @@
use common::BitSet;
use common::{BitSet, HasLen};
use super::BlockSegmentPostings;
use crate::codec::postings::PostingsWithBlockMax;
use crate::docset::DocSet;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::{DocFreq, Postings};
@@ -47,10 +46,14 @@ impl SegmentPostings {
use crate::schema::IndexRecordOption;
let mut buffer = Vec::new();
{
use crate::postings::serializer::PostingsSerializer;
use crate::codec::postings::PostingsSerializer;
let mut postings_serializer =
PostingsSerializer::new(0.0, IndexRecordOption::Basic, None);
crate::codec::standard::postings::StandardPostingsSerializer::new(
0.0,
IndexRecordOption::Basic,
None,
);
postings_serializer.new_term(docs.len() as u32, false);
for &doc in docs {
postings_serializer.write_doc(doc, 1u32);
@@ -77,8 +80,9 @@ impl SegmentPostings {
) -> SegmentPostings {
use common::OwnedBytes;
use crate::codec::postings::PostingsSerializer as _;
use crate::codec::standard::postings::StandardPostingsSerializer;
use crate::fieldnorm::FieldNormReader;
use crate::postings::serializer::PostingsSerializer;
use crate::schema::IndexRecordOption;
use crate::Score;
let mut buffer: Vec<u8> = Vec::new();
@@ -95,7 +99,7 @@ impl SegmentPostings {
total_num_tokens as Score / fieldnorms.len() as Score
})
.unwrap_or(0.0);
let mut postings_serializer = PostingsSerializer::new(
let mut postings_serializer = StandardPostingsSerializer::new(
average_field_norm,
IndexRecordOption::WithFreqs,
fieldnorm_reader,
@@ -148,20 +152,12 @@ impl DocSet for SegmentPostings {
self.doc()
}
#[inline]
fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(self.doc() <= target);
if self.doc() >= target {
return self.doc();
}
// As an optimization, if the block is already loaded, we can
// cheaply check the next doc.
self.cur = (self.cur + 1).min(COMPRESSION_BLOCK_SIZE - 1);
if self.doc() >= target {
return self.doc();
}
// Delegate block-local search to BlockSegmentPostings::seek, which returns
// the in-block index of the first doc >= target.
self.cur = self.block_cursor.seek(target);
@@ -177,34 +173,29 @@ impl DocSet for SegmentPostings {
}
fn size_hint(&self) -> u32 {
self.doc_freq().into()
self.len() as u32
}
fn fill_bitset(&mut self, bitset: &mut BitSet) {
let bitset_max_value: DocId = bitset.max_value();
loop {
let docs = self.block_cursor.docs();
let Some(&last_doc) = docs.last() else {
break;
};
if last_doc < bitset_max_value {
// All docs are within the range of the bitset
for &doc in docs {
bitset.insert(doc);
}
} else {
for &doc in docs {
if doc < bitset_max_value {
bitset.insert(doc);
}
}
if docs.is_empty() {
break;
}
for &doc in docs {
bitset.insert(doc);
}
self.block_cursor.advance();
}
}
}
impl HasLen for SegmentPostings {
fn len(&self) -> usize {
self.block_cursor.doc_freq() as usize
}
}
impl Postings for SegmentPostings {
/// Returns the frequency associated with the current document.
/// If the schema is set up so that no frequency have been encoded,
@@ -212,7 +203,7 @@ impl Postings for SegmentPostings {
///
/// # Panics
///
/// Will panics if called without having called advance before.
/// Will panics if called without having cagled advance before.
fn term_freq(&self) -> u32 {
debug_assert!(
// Here we do not use the len of `freqs()`
@@ -264,19 +255,15 @@ impl Postings for SegmentPostings {
}
impl PostingsWithBlockMax for SegmentPostings {
#[inline]
fn seek_block_max(
&mut self,
target_doc: crate::DocId,
fieldnorm_reader: &FieldNormReader,
similarity_weight: &Bm25Weight,
) -> Score {
self.block_cursor.seek_block_without_loading(target_doc);
self.block_cursor
.block_max_score(fieldnorm_reader, similarity_weight)
self.block_cursor.block_max_score(similarity_weight)
}
#[inline]
fn last_doc_in_block(&self) -> crate::DocId {
self.block_cursor.skip_reader().last_doc_in_block()
}
@@ -284,6 +271,9 @@ impl PostingsWithBlockMax for SegmentPostings {
#[cfg(test)]
mod tests {
use common::HasLen;
use super::SegmentPostings;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::Postings;
@@ -295,6 +285,7 @@ mod tests {
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.doc_freq(), crate::postings::DocFreq::Exact(0));
assert_eq!(postings.len(), 0);
}
#[test]

View File

@@ -0,0 +1,183 @@
use std::cmp::Ordering;
use std::io::{self, Write as _};
use common::{BinarySerializable as _, VInt};
use crate::codec::postings::PostingsSerializer;
use crate::codec::standard::postings::block::Block;
use crate::codec::standard::postings::skip::SkipSerializer;
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{BlockEncoder, VIntEncoder as _, COMPRESSION_BLOCK_SIZE};
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score};
pub struct StandardPostingsSerializer {
last_doc_id_encoded: u32,
block_encoder: BlockEncoder,
block: Box<Block>,
postings_write: Vec<u8>,
skip_write: SkipSerializer,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<Bm25Weight>,
avg_fieldnorm: Score, /* Average number of term in the field for that segment.
* this value is used to compute the block wand information. */
term_has_freq: bool,
}
impl StandardPostingsSerializer {
pub fn new(
avg_fieldnorm: Score,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> StandardPostingsSerializer {
Self {
last_doc_id_encoded: 0,
block_encoder: BlockEncoder::new(),
block: Box::new(Block::new()),
postings_write: Vec::new(),
skip_write: SkipSerializer::new(),
mode,
fieldnorm_reader,
bm25_weight: None,
avg_fieldnorm,
term_has_freq: false,
}
}
}
impl PostingsSerializer for StandardPostingsSerializer {
fn new_term(&mut self, term_doc_freq: u32, record_term_freq: bool) {
self.clear();
self.term_has_freq = self.mode.has_freq() && record_term_freq;
if !self.term_has_freq {
return;
}
let num_docs_in_segment: u64 =
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
fieldnorm_reader.num_docs() as u64
} else {
return;
};
if num_docs_in_segment == 0 {
return;
}
self.bm25_weight = Some(Bm25Weight::for_one_term_without_explain(
term_doc_freq as u64,
num_docs_in_segment,
self.avg_fieldnorm,
));
}
fn write_doc(&mut self, doc_id: DocId, term_freq: u32) {
self.block.append_doc(doc_id, term_freq);
if self.block.is_full() {
self.write_block();
}
}
fn close_term(&mut self, doc_freq: u32, output_write: &mut impl io::Write) -> io::Result<()> {
if !self.block.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self
.block_encoder
.compress_vint_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies
if self.term_has_freq {
let block_encoded = self
.block_encoder
.compress_vint_unsorted(self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
}
self.block.clear();
}
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(output_write)?;
output_write.write_all(skip_data)?;
}
output_write.write_all(&self.postings_write[..])?;
self.skip_write.clear();
self.postings_write.clear();
self.bm25_weight = None;
Ok(())
}
}
impl StandardPostingsSerializer {
fn clear(&mut self) {
self.bm25_weight = None;
self.block.clear();
self.last_doc_id_encoded = 0;
}
fn write_block(&mut self) {
{
// encode the doc ids
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.skip_write
.write_doc(self.last_doc_id_encoded, num_bits);
// last el block 0, offset block 1,
self.postings_write.extend(block_encoded);
}
if self.term_has_freq {
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_unsorted(self.block.term_freqs(), true);
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.mode.has_positions() {
// We serialize the sum of term freqs within the skip information
// in order to navigate through positions.
let sum_freq = self.block.term_freqs().iter().cloned().sum();
self.skip_write.write_total_term_freq(sum_freq);
}
let mut blockwand_params = (0u8, 0u32);
if let Some(bm25_weight) = self.bm25_weight.as_ref() {
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
let docs = self.block.doc_ids().iter().cloned();
let term_freqs = self.block.term_freqs().iter().cloned();
let fieldnorms = docs.map(|doc| fieldnorm_reader.fieldnorm_id(doc));
blockwand_params = fieldnorms
.zip(term_freqs)
.max_by(
|(left_fieldnorm_id, left_term_freq),
(right_fieldnorm_id, right_term_freq)| {
let left_score =
bm25_weight.tf_factor(*left_fieldnorm_id, *left_term_freq);
let right_score =
bm25_weight.tf_factor(*right_fieldnorm_id, *right_term_freq);
left_score
.partial_cmp(&right_score)
.unwrap_or(Ordering::Equal)
},
)
.unwrap();
}
}
let (fieldnorm_id, term_freq) = blockwand_params;
self.skip_write.write_blockwand_max(fieldnorm_id, term_freq);
}
self.block.clear();
}
}

View File

@@ -486,9 +486,9 @@ mod tests {
use std::collections::BTreeSet;
use columnar::Dictionary;
use rand::distr::Uniform;
use rand::distributions::Uniform;
use rand::prelude::SliceRandom;
use rand::{rng, Rng};
use rand::{thread_rng, Rng};
use super::{FacetCollector, FacetCounts};
use crate::collector::facet_collector::compress_mapping;
@@ -731,7 +731,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let uniform = Uniform::new_inclusive(1, 100_000).unwrap();
let uniform = Uniform::new_inclusive(1, 100_000);
let mut docs: Vec<TantivyDocument> =
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
@@ -741,11 +741,14 @@ mod tests {
std::iter::repeat_n(doc, count)
})
.map(|mut doc| {
doc.add_facet(facet_field, &format!("/facet/{}", rng().sample(uniform)));
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)),
);
doc
})
.collect();
docs[..].shuffle(&mut rng());
docs[..].shuffle(&mut thread_rng());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for doc in docs {
@@ -819,8 +822,8 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rng;
use rand::seq::SliceRandom;
use rand::thread_rng;
use test::Bencher;
use crate::collector::FacetCollector;
@@ -843,7 +846,7 @@ mod bench {
}
}
// 40425 docs
docs[..].shuffle(&mut rng());
docs[..].shuffle(&mut thread_rng());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for doc in docs {

View File

@@ -160,7 +160,7 @@ mod tests {
expected: &[(crate::Score, usize)],
) {
let mut vals: Vec<(crate::Score, usize)> = (0..10).map(|val| (val as f32, val)).collect();
vals.shuffle(&mut rand::rng());
vals.shuffle(&mut rand::thread_rng());
let vals_merged = merge_top_k(vals.into_iter(), doc_range, ComparatorEnum::from(order));
assert_eq!(&vals_merged, expected);
}

View File

@@ -676,7 +676,7 @@ mod tests {
let num_segments = reader.searcher().segment_readers().len();
assert!(num_segments <= 4);
let num_components_except_deletes_and_tempstore =
crate::index::SegmentComponent::iterator().len() - 1;
crate::index::SegmentComponent::iterator().len() - 2;
let max_num_mmapped = num_components_except_deletes_and_tempstore * num_segments;
assert_eventually(|| {
let num_mmapped = mmap_directory.get_cache_info().mmapped.len();

View File

@@ -1,5 +1,4 @@
use std::borrow::BorrowMut;
use std::ops::{Deref as _, DerefMut as _};
use std::borrow::{Borrow, BorrowMut};
use common::BitSet;
@@ -54,55 +53,31 @@ pub trait DocSet: Send {
doc
}
/// !!!Dragons ahead!!!
/// In spirit, this is an approximate and dangerous version of `seek`.
///
/// It can leave the DocSet in an `invalid` state and might return a
/// lower bound of what the result of Seek would have been.
///
///
/// More accurately it returns either:
/// - Found if the target is in the docset. In that case, the DocSet is left in a valid state.
/// - SeekLowerBound(seek_lower_bound) if the target is not in the docset. In that case, The
/// DocSet can be the left in a invalid state. The DocSet should then only receives call to
/// `seek_danger(..)` until it returns `Found`, and get back to a valid state.
///
/// `seek_lower_bound` can be any `DocId` (in the docset or not) as long as it is in
/// `(target .. seek_result] U {TERMINATED}` where `seek_result` is the first document in the
/// docset greater than to `target`.
///
/// `seek_danger` may return `SeekLowerBound(TERMINATED)`.
///
/// Calling `seek_danger` with TERMINATED as a target is allowed,
/// and should always return NewTarget(TERMINATED) or anything larger as TERMINATED is NOT in
/// the DocSet.
/// Seeks to the target if possible and returns true if the target is in the DocSet.
///
/// DocSets that already have an efficient `seek` method don't need to implement
/// `seek_danger`.
/// `seek_into_the_danger_zone`. All wrapper DocSets should forward
/// `seek_into_the_danger_zone` to the underlying DocSet.
///
/// Consecutive calls to seek_danger are guaranteed to have strictly increasing `target`
/// values.
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
if target >= TERMINATED {
debug_assert!(target == TERMINATED);
// No need to advance.
return SeekDangerResult::SeekLowerBound(target);
}
// The default implementation does not include any
// `danger zone` behavior.
//
// It does not leave the scorer in an invalid state.
// For this reason, we can safely call `self.doc()`.
let mut doc = self.doc();
if doc < target {
doc = self.seek(target);
}
if doc == target {
SeekDangerResult::Found
} else {
SeekDangerResult::SeekLowerBound(doc)
/// ## API Behaviour
/// If `seek_into_the_danger_zone` is returning true, a call to `doc()` has to return target.
/// If `seek_into_the_danger_zone` is returning false, a call to `doc()` may return any doc
/// between the last doc that matched and target or a doc that is a valid next hit after
/// target. The DocSet is considered to be in an invalid state until
/// `seek_into_the_danger_zone` returns true again.
///
/// `target` needs to be equal or larger than `doc` when in a valid state.
///
/// Consecutive calls are not allowed to have decreasing `target` values.
///
/// # Warning
/// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
let current_doc = self.doc();
if current_doc < target {
self.seek(target);
}
self.doc() == target
}
/// Fills a given mutable buffer with the next doc ids from the
@@ -133,14 +108,10 @@ pub trait DocSet: Send {
buffer.len()
}
/// Fills the given bitset with the documents in the docset.
///
/// If the docset max_doc is smaller than the largest doc, this function might not consume the
/// docset entirely.
/// TODO comment on the size of the bitset
fn fill_bitset(&mut self, bitset: &mut BitSet) {
let bitset_max_value: u32 = bitset.max_value();
let mut doc = self.doc();
while doc < bitset_max_value {
while doc != TERMINATED {
bitset.insert(doc);
doc = self.advance();
}
@@ -206,17 +177,6 @@ pub trait DocSet: Send {
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SeekDangerResult {
/// The target was found in the DocSet.
Found,
/// The target was not found in the DocSet.
/// We return a range in which the value could be.
/// The given target can be any DocId, that is <= than the first document
/// in the docset after the target.
SeekLowerBound(DocId),
}
impl DocSet for &mut dyn DocSet {
fn advance(&mut self) -> u32 {
(**self).advance()
@@ -226,8 +186,8 @@ impl DocSet for &mut dyn DocSet {
(**self).seek(target)
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
(**self).seek_danger(target)
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
(**self).seek_into_the_danger_zone(target)
}
fn doc(&self) -> u32 {
@@ -249,59 +209,51 @@ impl DocSet for &mut dyn DocSet {
fn count_including_deleted(&mut self) -> u32 {
(**self).count_including_deleted()
}
fn fill_bitset(&mut self, bitset: &mut BitSet) {
(**self).fill_bitset(bitset);
}
}
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
#[inline]
fn advance(&mut self) -> DocId {
self.deref_mut().advance()
}
#[inline]
fn seek(&mut self, target: DocId) -> DocId {
self.deref_mut().seek(target)
}
#[inline]
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.seek_danger(target)
unboxed.advance()
}
fn seek(&mut self, target: DocId) -> DocId {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.seek(target)
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.seek_into_the_danger_zone(target)
}
#[inline]
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
self.deref_mut().fill_buffer(buffer)
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.fill_buffer(buffer)
}
#[inline]
fn doc(&self) -> DocId {
self.deref().doc()
let unboxed: &TDocSet = self.borrow();
unboxed.doc()
}
#[inline]
fn size_hint(&self) -> u32 {
self.deref().size_hint()
let unboxed: &TDocSet = self.borrow();
unboxed.size_hint()
}
#[inline]
fn cost(&self) -> u64 {
self.deref().cost()
let unboxed: &TDocSet = self.borrow();
unboxed.cost()
}
#[inline]
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
self.deref_mut().count(alive_bitset)
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count(alive_bitset)
}
fn count_including_deleted(&mut self) -> u32 {
self.deref_mut().count_including_deleted()
}
fn fill_bitset(&mut self, bitset: &mut BitSet) {
self.deref_mut().fill_bitset(bitset);
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count_including_deleted()
}
}

View File

@@ -162,7 +162,7 @@ mod tests {
mod bench {
use rand::prelude::IteratorRandom;
use rand::rng;
use rand::thread_rng;
use test::Bencher;
use super::AliveBitSet;
@@ -176,7 +176,7 @@ mod bench {
}
fn remove_rand(raw: &mut Vec<u32>) {
let i = (0..raw.len()).choose(&mut rng()).unwrap();
let i = (0..raw.len()).choose(&mut thread_rng()).unwrap();
raw.remove(i);
}

View File

@@ -879,7 +879,7 @@ mod tests {
const ONE_HOUR_IN_MICROSECS: i64 = 3_600 * 1_000_000;
let times: Vec<DateTime> = std::iter::repeat_with(|| {
// +- One hour.
let t = T0 + rng.random_range(-ONE_HOUR_IN_MICROSECS..ONE_HOUR_IN_MICROSECS);
let t = T0 + rng.gen_range(-ONE_HOUR_IN_MICROSECS..ONE_HOUR_IN_MICROSECS);
DateTime::from_timestamp_micros(t)
})
.take(1_000)

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet;
use rand::{rng, Rng};
use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
@@ -29,7 +29,7 @@ fn test_functional_store() -> crate::Result<()> {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
let mut rng = rng();
let mut rng = thread_rng();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
@@ -38,9 +38,9 @@ fn test_functional_store() -> crate::Result<()> {
let mut doc_id = 0u64;
for _iteration in 0..get_num_iterations() {
let num_docs: usize = rng.random_range(0..4);
let num_docs: usize = rng.gen_range(0..4);
if !doc_set.is_empty() {
let doc_to_remove_id = rng.random_range(0..doc_set.len());
let doc_to_remove_id = rng.gen_range(0..doc_set.len());
let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
}
@@ -70,10 +70,10 @@ const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat \
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";
fn get_text() -> String {
use rand::seq::IndexedRandom;
let mut rng = rng();
use rand::seq::SliceRandom;
let mut rng = thread_rng();
let tokens: Vec<_> = LOREM.split(' ').collect();
let random_val = rng.random_range(0..20);
let random_val = rng.gen_range(0..20);
(0..random_val)
.map(|_| tokens.choose(&mut rng).unwrap())
@@ -101,7 +101,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
let index = Index::create_from_tempdir(schema)?;
let reader = index.reader()?;
let mut rng = rng();
let mut rng = thread_rng();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
@@ -110,7 +110,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..get_num_iterations() {
let random_val = rng.random_range(0..20);
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
committed_docs.extend(&uncommitted_docs);

View File

@@ -4,46 +4,35 @@ use serde::{Deserialize, Serialize};
use crate::codec::{Codec, StandardCodec};
/// A Codec configuration is just a serializable object.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct CodecConfiguration {
codec_id: Cow<'static, str>,
name: Cow<'static, str>,
#[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
props: serde_json::Value,
}
impl CodecConfiguration {
/// Returns true if the codec is the standard codec.
pub fn is_standard(&self) -> bool {
self.codec_id == StandardCodec::ID && self.props.is_null()
pub fn from_codec<C: Codec>(codec: &C) -> Self {
CodecConfiguration {
name: Cow::Borrowed(C::NAME),
props: codec.to_json_props(),
}
}
/// Creates a codec instance from the configuration.
///
/// If the codec id does not match the code's name, an error is returned.
pub fn to_codec<C: Codec>(&self) -> crate::Result<C> {
if self.codec_id != C::ID {
if self.name != C::NAME {
return Err(crate::TantivyError::InvalidArgument(format!(
"Codec id mismatch: expected {}, got {}",
C::ID,
self.codec_id
"Codec name mismatch: expected {}, got {}",
C::NAME,
self.name
)));
}
C::from_json_props(&self.props)
}
}
impl<'a, C: Codec> From<&'a C> for CodecConfiguration {
fn from(codec: &'a C) -> Self {
CodecConfiguration {
codec_id: Cow::Borrowed(C::ID),
props: codec.to_json_props(),
}
}
}
impl Default for CodecConfiguration {
fn default() -> Self {
CodecConfiguration::from(&StandardCodec)
CodecConfiguration::from_codec(&StandardCodec)
}
}

View File

@@ -276,7 +276,7 @@ impl<Codec: crate::codec::Codec> IndexBuilder<Codec> {
fn create_avoid_monomorphization(self, dir: Box<dyn Directory>) -> crate::Result<Index<Codec>> {
self.validate()?;
let directory = ManagedDirectory::wrap(dir)?;
let codec: CodecConfiguration = CodecConfiguration::from(&self.codec);
let codec: CodecConfiguration = CodecConfiguration::from_codec(&self.codec);
save_new_metas(
self.get_expect_schema()?,
self.index_settings.clone(),
@@ -394,7 +394,6 @@ impl Index {
Self::open_in_dir_to_avoid_monomorphization(directory_path.as_ref())
}
#[cfg(feature = "mmap")]
#[inline(never)]
fn open_in_dir_to_avoid_monomorphization(directory_path: &Path) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
@@ -785,7 +784,7 @@ impl<Codec: crate::codec::Codec> Index<Codec> {
}
impl fmt::Debug for Index {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Index({:?})", self.directory)
}
}

View File

@@ -1,6 +1,8 @@
use std::collections::HashSet;
use std::fmt;
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use serde::{Deserialize, Serialize};
@@ -36,6 +38,7 @@ impl SegmentMetaInventory {
let inner = InnerSegmentMeta {
segment_id,
max_doc,
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
deletes: None,
};
SegmentMeta::from(self.inventory.track(inner))
@@ -83,6 +86,15 @@ impl SegmentMeta {
self.tracked.segment_id
}
/// Removes the Component::TempStore from the alive list and
/// therefore marks the temp docstore file to be deleted by
/// the garbage collection.
pub fn untrack_temp_docstore(&self) {
self.tracked
.include_temp_doc_store
.store(false, std::sync::atomic::Ordering::Relaxed);
}
/// Returns the number of deleted documents.
pub fn num_deleted_docs(&self) -> u32 {
self.tracked
@@ -100,9 +112,20 @@ impl SegmentMeta {
/// is by removing all files that have been created by tantivy
/// and are not used by any segment anymore.
pub fn list_files(&self) -> HashSet<PathBuf> {
SegmentComponent::iterator()
.map(|component| self.relative_path(*component))
.collect::<HashSet<PathBuf>>()
if self
.tracked
.include_temp_doc_store
.load(std::sync::atomic::Ordering::Relaxed)
{
SegmentComponent::iterator()
.map(|component| self.relative_path(*component))
.collect::<HashSet<PathBuf>>()
} else {
SegmentComponent::iterator()
.filter(|comp| *comp != &SegmentComponent::TempStore)
.map(|component| self.relative_path(*component))
.collect::<HashSet<PathBuf>>()
}
}
/// Returns the relative path of a component of our segment.
@@ -116,6 +139,7 @@ impl SegmentMeta {
SegmentComponent::Positions => ".pos".to_string(),
SegmentComponent::Terms => ".term".to_string(),
SegmentComponent::Store => ".store".to_string(),
SegmentComponent::TempStore => ".store.temp".to_string(),
SegmentComponent::FastFields => ".fast".to_string(),
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
@@ -160,6 +184,7 @@ impl SegmentMeta {
segment_id: inner_meta.segment_id,
max_doc,
deletes: None,
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
});
SegmentMeta { tracked }
}
@@ -178,6 +203,7 @@ impl SegmentMeta {
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
segment_id: inner_meta.segment_id,
max_doc: inner_meta.max_doc,
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
deletes: Some(delete_meta),
});
SegmentMeta { tracked }
@@ -189,6 +215,14 @@ struct InnerSegmentMeta {
segment_id: SegmentId,
max_doc: u32,
pub deletes: Option<DeleteMeta>,
/// If you want to avoid the SegmentComponent::TempStore file to be covered by
/// garbage collection and deleted, set this to true. This is used during merge.
#[serde(skip)]
#[serde(default = "default_temp_store")]
pub(crate) include_temp_doc_store: Arc<AtomicBool>,
}
fn default_temp_store() -> Arc<AtomicBool> {
Arc::new(AtomicBool::new(false))
}
impl InnerSegmentMeta {
@@ -288,9 +322,9 @@ pub struct IndexMeta {
#[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>,
/// Codec configuration for the index.
#[serde(skip_serializing_if = "CodecConfiguration::is_standard")]
pub codec: CodecConfiguration,
}
#[derive(Deserialize, Debug)]
struct UntrackedIndexMeta {
pub segments: Vec<InnerSegmentMeta>,
@@ -334,7 +368,7 @@ impl IndexMeta {
schema,
opstamp: 0u64,
payload: None,
codec: CodecConfiguration::from(codec),
codec: CodecConfiguration::from_codec(codec),
}
}
@@ -387,36 +421,13 @@ mod tests {
payload: None,
codec: Default::default(),
};
let json_value: serde_json::Value =
serde_json::to_value(&index_metas).expect("serialization failed");
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
&json_value,
&serde_json::json!(
{
"index_settings": {
"docstore_compression": "none",
"docstore_blocksize": 16384
},
"segments": [],
"schema": [
{
"name": "text",
"type": "text",
"options": {
"indexing": {
"record": "position",
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false,
"fast": false
}
}
],
"opstamp": 0
})
json,
r#"{"index_settings":{"docstore_compression":"none","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0,"codec":{"name":"standard"}}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_value(json_value).unwrap();
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
assert_eq!(index_metas.index_settings, deser_meta.index_settings);
assert_eq!(index_metas.schema, deser_meta.schema);
assert_eq!(index_metas.opstamp, deser_meta.opstamp);
@@ -442,39 +453,14 @@ mod tests {
schema,
opstamp: 0u64,
payload: None,
codec: Default::default(),
};
let json_value = serde_json::to_value(&index_metas).expect("serialization failed");
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
&json_value,
&serde_json::json!(
{
"index_settings": {
"docstore_compression": "zstd(compression_level=4)",
"docstore_blocksize": 1000000
},
"segments": [],
"schema": [
{
"name": "text",
"type": "text",
"options": {
"indexing": {
"record": "position",
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false,
"fast": false
}
}
],
"opstamp": 0
}
)
json,
r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_value(json_value).unwrap();
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
assert_eq!(index_metas.index_settings, deser_meta.index_settings);
assert_eq!(index_metas.schema, deser_meta.schema);
assert_eq!(index_metas.opstamp, deser_meta.opstamp);

View File

@@ -1,8 +1,4 @@
#[cfg(feature = "quickwit")]
use std::future::Future;
use std::io;
#[cfg(feature = "quickwit")]
use std::pin::Pin;
use std::sync::Arc;
use common::json_path_writer::JSON_END_OF_PATH;
@@ -14,104 +10,29 @@ use itertools::Itertools;
#[cfg(feature = "quickwit")]
use tantivy_fst::automaton::{AlwaysMatch, Automaton};
use crate::codec::{ObjectSafeCodec, StandardCodec};
use crate::codec::postings::PostingsCodec;
use crate::codec::{Codec, ObjectSafeCodec, StandardCodec};
use crate::directory::FileSlice;
use crate::fieldnorm::FieldNormReader;
use crate::postings::{Postings, TermInfo};
use crate::query::{Bm25Weight, Scorer};
use crate::query::term_query::TermScorer;
use crate::query::{Bm25Weight, PhraseScorer, Scorer};
use crate::schema::{IndexRecordOption, Term, Type};
use crate::termdict::TermDictionary;
/// Postings data returned by [`InvertedIndexReader::read_postings_data`].
pub struct PostingsData {
/// Raw postings bytes for the term.
pub postings_data: OwnedBytes,
/// Raw positions bytes for the term, if positions are available.
pub positions_data: Option<OwnedBytes>,
/// Record option of the indexed field.
pub record_option: IndexRecordOption,
/// Effective record option after downgrading to the indexed field capability.
pub effective_option: IndexRecordOption,
}
/// Trait defining the contract for inverted index readers.
pub trait InvertedIndexReader: Send + Sync {
/// Returns the term info associated with the term.
fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>>;
/// Return the term dictionary datastructure.
fn terms(&self) -> &TermDictionary;
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and therefore **very expensive**.
fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>>;
/// Read postings bytes and metadata for a given term.
fn read_postings_data(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
) -> io::Result<PostingsData>;
/// Build a new term scorer.
fn new_term_scorer(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
) -> io::Result<Box<dyn Scorer>>;
/// Returns a posting object given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
fn read_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
) -> io::Result<Box<dyn Postings>>;
/// Returns the total number of tokens recorded for all documents
/// (including deleted documents).
fn total_num_tokens(&self) -> u64;
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
fn read_postings(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<Box<dyn Postings>>>;
/// Returns the number of documents containing the term.
fn doc_freq(&self, term: &Term) -> io::Result<u32>;
/// Returns the number of documents containing the term asynchronously.
#[cfg(feature = "quickwit")]
fn doc_freq_async<'a>(
&'a self,
term: &'a Term,
) -> Pin<Box<dyn Future<Output = io::Result<u32>> + Send + 'a>>;
}
/// Tantivy's default inverted index reader implementation.
///
/// The inverted index reader is in charge of accessing
/// the inverted index associated with a specific field.
///
/// # Note
///
/// It is safe to delete the segment associated with
/// an `InvertedIndexReader` implementation. As long as it is open,
/// an `InvertedIndexReader`. As long as it is open,
/// the [`FileSlice`] it is relying on should
/// stay available.
///
/// `TantivyInvertedIndexReader` instances are created by calling
/// `InvertedIndexReader` are created by calling
/// [`SegmentReader::inverted_index()`](crate::SegmentReader::inverted_index).
pub struct TantivyInvertedIndexReader {
pub struct InvertedIndexReader {
termdict: TermDictionary,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
@@ -121,16 +42,11 @@ pub struct TantivyInvertedIndexReader {
}
/// Object that records the amount of space used by a field in an inverted index.
pub struct InvertedIndexFieldSpace {
/// Field name as encoded in the term dictionary.
pub(crate) struct InvertedIndexFieldSpace {
pub field_name: String,
/// Value type for the encoded field.
pub field_type: Type,
/// Total bytes used by postings for this field.
pub postings_size: ByteCount,
/// Total bytes used by positions for this field.
pub positions_size: ByteCount,
/// Number of terms in the field.
pub num_terms: u64,
}
@@ -152,17 +68,17 @@ impl InvertedIndexFieldSpace {
}
}
impl TantivyInvertedIndexReader {
impl InvertedIndexReader {
pub(crate) fn new(
termdict: TermDictionary,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
record_option: IndexRecordOption,
codec: Arc<dyn ObjectSafeCodec>,
) -> io::Result<TantivyInvertedIndexReader> {
) -> io::Result<InvertedIndexReader> {
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?;
Ok(TantivyInvertedIndexReader {
Ok(InvertedIndexReader {
termdict,
postings_file_slice: postings_body,
positions_file_slice,
@@ -172,10 +88,10 @@ impl TantivyInvertedIndexReader {
})
}
/// Creates an empty `TantivyInvertedIndexReader` object, which
/// Creates an empty `InvertedIndexReader` object, which
/// contains no terms at all.
pub fn empty(record_option: IndexRecordOption) -> TantivyInvertedIndexReader {
TantivyInvertedIndexReader {
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionary::empty(),
postings_file_slice: FileSlice::empty(),
positions_file_slice: FileSlice::empty(),
@@ -184,18 +100,23 @@ impl TantivyInvertedIndexReader {
codec: Arc::new(StandardCodec),
}
}
}
impl InvertedIndexReader for TantivyInvertedIndexReader {
fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>> {
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>> {
self.termdict.get(term.serialized_value_bytes())
}
fn terms(&self) -> &TermDictionary {
/// Return the term dictionary datastructure.
pub fn terms(&self) -> &TermDictionary {
&self.termdict
}
fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and therefore **very expensive**.
/// TODO: Move to sstable to use the index.
pub(crate) fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
let mut stream = self.termdict.stream()?;
let mut fields: Vec<InvertedIndexFieldSpace> = Vec::new();
@@ -248,34 +169,50 @@ impl InvertedIndexReader for TantivyInvertedIndexReader {
Ok(fields)
}
fn read_postings_data(
pub(crate) fn new_term_scorer_specialized<C: Codec>(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
) -> io::Result<PostingsData> {
let effective_option = option.downgrade(self.record_option);
let postings_data = self
.postings_file_slice
.slice(term_info.postings_range.clone())
.read_bytes()?;
let positions_data: Option<OwnedBytes> = if effective_option.has_positions() {
let positions_data = self
.positions_file_slice
.slice(term_info.positions_range.clone())
.read_bytes()?;
Some(positions_data)
} else {
None
};
Ok(PostingsData {
postings_data,
positions_data,
record_option: self.record_option,
effective_option,
})
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
codec: &C,
) -> io::Result<TermScorer<<<C as Codec>::PostingsCodec as PostingsCodec>::Postings>> {
let postings = self.read_postings_from_terminfo_specialized(term_info, option, codec)?;
let term_scorer = TermScorer::new(postings, fieldnorm_reader, similarity_weight);
Ok(term_scorer)
}
fn new_term_scorer(
pub(crate) fn new_phrase_scorer_type_specialized<C: Codec>(
&self,
term_infos: &[(usize, TermInfo)],
similarity_weight_opt: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
codec: &C,
) -> io::Result<PhraseScorer<<<C as Codec>::PostingsCodec as PostingsCodec>::Postings>> {
let mut offset_and_term_postings: Vec<(
usize,
<<C as Codec>::PostingsCodec as PostingsCodec>::Postings,
)> = Vec::with_capacity(term_infos.len());
for (offset, term_info) in term_infos {
let postings = self.read_postings_from_terminfo_specialized(
term_info,
IndexRecordOption::WithFreqsAndPositions,
codec,
)?;
offset_and_term_postings.push((*offset, postings));
}
let phrase_scorer = PhraseScorer::new(
offset_and_term_postings,
similarity_weight_opt,
fieldnorm_reader,
slop,
);
Ok(phrase_scorer)
}
/// Build a new term scorer.
pub fn new_term_scorer(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
@@ -292,7 +229,45 @@ impl InvertedIndexReader for TantivyInvertedIndexReader {
Ok(term_scorer)
}
fn read_postings_from_terminfo(
/// Returns a postings object specific with a concrete type.
///
/// This requires you to provied the actual codec.
pub fn read_postings_from_terminfo_specialized<C: Codec>(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
codec: &C,
) -> io::Result<<<C as Codec>::PostingsCodec as PostingsCodec>::Postings> {
let option = option.downgrade(self.record_option);
let postings_data = self
.postings_file_slice
.slice(term_info.postings_range.clone())
.read_bytes()?;
let positions_data: Option<OwnedBytes> = if option.has_positions() {
let positions_data = self
.positions_file_slice
.slice(term_info.positions_range.clone())
.read_bytes()?;
Some(positions_data)
} else {
None
};
let postings: <<C as Codec>::PostingsCodec as PostingsCodec>::Postings =
codec.postings_codec().load_postings(
term_info.doc_freq,
postings_data,
self.record_option,
option,
positions_data,
)?;
Ok(postings)
}
/// Returns a posting object given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
pub fn read_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
@@ -301,11 +276,23 @@ impl InvertedIndexReader for TantivyInvertedIndexReader {
.load_postings_type_erased(term_info, option, self)
}
fn total_num_tokens(&self) -> u64 {
/// Returns the total number of tokens recorded for all documents
/// (including deleted documents).
pub fn total_num_tokens(&self) -> u64 {
self.total_num_tokens
}
fn read_postings(
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with the indexing options that cover
/// the requested options, the returned [`SegmentPostings`] the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting [`IndexRecordOption::WithFreqs`] for a
/// [`TextOptions`](crate::schema::TextOptions) that does not index position
/// will return a [`SegmentPostings`] with `DocId`s and frequencies.
pub fn read_postings(
&self,
term: &Term,
option: IndexRecordOption,
@@ -315,30 +302,17 @@ impl InvertedIndexReader for TantivyInvertedIndexReader {
.transpose()
}
fn doc_freq(&self, term: &Term) -> io::Result<u32> {
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
Ok(self
.get_term_info(term)?
.map(|term_info| term_info.doc_freq)
.unwrap_or(0u32))
}
#[cfg(feature = "quickwit")]
fn doc_freq_async<'a>(
&'a self,
term: &'a Term,
) -> Pin<Box<dyn Future<Output = io::Result<u32>> + Send + 'a>> {
Box::pin(async move {
Ok(self
.get_term_info_async(term)
.await?
.map(|term_info| term_info.doc_freq)
.unwrap_or(0u32))
})
}
}
#[cfg(feature = "quickwit")]
impl TantivyInvertedIndexReader {
impl InvertedIndexReader {
pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result<Option<TermInfo>> {
self.termdict.get_async(term.serialized_value_bytes()).await
}
@@ -538,4 +512,13 @@ impl TantivyInvertedIndexReader {
}
Ok(())
}
/// Returns the number of documents containing the term asynchronously.
pub async fn doc_freq_async(&self, term: &Term) -> io::Result<u32> {
Ok(self
.get_term_info_async(term)
.await?
.map(|term_info| term_info.doc_freq)
.unwrap_or(0u32))
}
}

View File

@@ -15,9 +15,7 @@ pub use self::codec_configuration::CodecConfiguration;
pub use self::index::{Index, IndexBuilder};
pub(crate) use self::index_meta::SegmentMetaInventory;
pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta};
pub use self::inverted_index_reader::{
InvertedIndexFieldSpace, InvertedIndexReader, PostingsData, TantivyInvertedIndexReader,
};
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;

View File

@@ -23,6 +23,8 @@ pub enum SegmentComponent {
/// Accessing a document from the store is relatively slow, as it
/// requires to decompress the entire block it belongs to.
Store,
/// Temporary storage of the documents, before streamed to `Store`.
TempStore,
/// Bitset describing which document of the segment is alive.
/// (It was representing deleted docs but changed to represent alive docs from v0.17)
Delete,
@@ -31,13 +33,14 @@ pub enum SegmentComponent {
impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
SegmentComponent::Postings,
SegmentComponent::Positions,
SegmentComponent::FastFields,
SegmentComponent::FieldNorms,
SegmentComponent::Terms,
SegmentComponent::Store,
SegmentComponent::TempStore,
SegmentComponent::Delete,
];
SEGMENT_COMPONENTS.iter()

View File

@@ -11,9 +11,7 @@ use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::index::{
InvertedIndexReader, Segment, SegmentComponent, SegmentId, TantivyInvertedIndexReader,
};
use crate::index::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::json_utils::json_path_sep_to_dot;
use crate::schema::{Field, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage;
@@ -33,7 +31,7 @@ use crate::{DocId, Opstamp};
/// as close to all of the memory data is mmapped.
#[derive(Clone)]
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<dyn InvertedIndexReader>>>>,
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId,
delete_opstamp: Option<Opstamp>,
@@ -50,7 +48,8 @@ pub struct SegmentReader {
store_file: FileSlice,
alive_bitset_opt: Option<AliveBitSet>,
schema: Schema,
codec: Arc<dyn ObjectSafeCodec>,
pub(crate) codec: Arc<dyn ObjectSafeCodec>,
}
impl SegmentReader {
@@ -71,11 +70,6 @@ impl SegmentReader {
&self.schema
}
/// Returns the index codec.
pub fn codec(&self) -> &dyn ObjectSafeCodec {
&*self.codec
}
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
@@ -220,17 +214,17 @@ impl SegmentReader {
/// Returns a field reader associated with the field given in argument.
/// If the field was not present in the index during indexing time,
/// the `InvertedIndexReader` is empty.
/// the InvertedIndexReader is empty.
///
/// The field reader is in charge of iterating through the
/// term dictionary associated with a specific field,
/// and opening the posting list associated with any term.
///
/// If the field is not marked as index, a warning is logged and an empty
/// `InvertedIndexReader` is returned.
/// If the field is not marked as index, a warning is logged and an empty `InvertedIndexReader`
/// is returned.
/// Similarly, if the field is marked as indexed but no term has been indexed for the given
/// index, an empty `InvertedIndexReader` is returned (but no warning is logged).
pub fn inverted_index(&self, field: Field) -> crate::Result<Arc<dyn InvertedIndexReader>> {
pub fn inverted_index(&self, field: Field) -> crate::Result<Arc<InvertedIndexReader>> {
if let Some(inv_idx_reader) = self
.inv_idx_reader_cache
.read()
@@ -255,9 +249,7 @@ impl SegmentReader {
//
// Returns an empty inverted index.
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
let inv_idx_reader: Arc<dyn InvertedIndexReader> =
Arc::new(TantivyInvertedIndexReader::empty(record_option));
return Ok(inv_idx_reader);
return Ok(Arc::new(InvertedIndexReader::empty(record_option)));
}
let record_option = record_option_opt.unwrap();
@@ -281,14 +273,13 @@ impl SegmentReader {
DataCorruption::comment_only(error_msg)
})?;
let inv_idx_reader: Arc<dyn InvertedIndexReader> =
Arc::new(TantivyInvertedIndexReader::new(
TermDictionary::open(termdict_file)?,
postings_file,
positions_file,
record_option,
self.codec.clone(),
)?);
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
TermDictionary::open(termdict_file)?,
postings_file,
positions_file,
record_option,
self.codec.clone(),
)?);
// by releasing the lock in between, we may end up opening the inverting index
// twice, but this is fine.

View File

@@ -219,7 +219,7 @@ fn index_documents<C: crate::codec::Codec, D: Document>(
let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let meta = segment_with_max_doc.meta().clone();
meta.untrack_temp_docstore();
// update segment_updater inventory to remove tempstore
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
segment_updater.schedule_add_segment(segment_entry).wait()?;

View File

@@ -1,7 +1,6 @@
#[cfg(test)]
mod tests {
use crate::codec::postings::PostingsCodec;
use crate::codec::{Codec, StandardCodec};
use crate::codec::StandardCodec;
use crate::collector::TopDocs;
use crate::fastfield::AliveBitSet;
use crate::index::Index;
@@ -124,17 +123,11 @@ mod tests {
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let term_info = inverted_index.get_term_info(&term_a).unwrap().unwrap();
let postings_data = inverted_index
.read_postings_data(&term_info, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
let postings = StandardCodec
.postings_codec()
.load_postings(
term_info.doc_freq,
postings_data.postings_data,
postings_data.record_option,
postings_data.effective_option,
postings_data.positions_data,
let mut postings = inverted_index
.read_postings_from_terminfo_specialized(
&term_info,
IndexRecordOption::WithFreqsAndPositions,
&StandardCodec,
)
.unwrap();
assert_eq!(postings.doc_freq(), DocFreq::Exact(2));
@@ -146,9 +139,6 @@ mod tests {
),
2
);
let mut postings = inverted_index
.read_postings_from_terminfo(&term_info, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings.term_freq(), 1);
let mut output = Vec::new();

View File

@@ -295,7 +295,7 @@ impl<C: Codec> IndexMerger<C> {
&self,
indexed_field: Field,
_field_type: &FieldType,
serializer: &mut InvertedIndexSerializer,
serializer: &mut InvertedIndexSerializer<C>,
fieldnorm_reader: Option<FieldNormReader>,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
@@ -305,7 +305,7 @@ impl<C: Codec> IndexMerger<C> {
let mut max_term_ords: Vec<TermOrdinal> = Vec::new();
let field_readers: Vec<Arc<dyn InvertedIndexReader>> = self
let field_readers: Vec<Arc<InvertedIndexReader>> = self
.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
@@ -377,15 +377,11 @@ impl<C: Codec> IndexMerger<C> {
// Let's compute the list of non-empty posting lists
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() {
let segment_reader = &self.readers[segment_ord];
let inverted_index = &field_readers[segment_ord];
let postings_data =
inverted_index.read_postings_data(&term_info, segment_postings_option)?;
let postings = self.codec.postings_codec().load_postings(
term_info.doc_freq,
postings_data.postings_data,
postings_data.record_option,
postings_data.effective_option,
postings_data.positions_data,
let inverted_index: &InvertedIndexReader = &field_readers[segment_ord];
let postings = inverted_index.read_postings_from_terminfo_specialized(
&term_info,
segment_postings_option,
&self.codec,
)?;
let alive_bitset_opt = segment_reader.alive_bitset();
let doc_freq = if let Some(alive_bitset) = alive_bitset_opt {
@@ -485,7 +481,7 @@ impl<C: Codec> IndexMerger<C> {
fn write_postings(
&self,
serializer: &mut InvertedIndexSerializer,
serializer: &mut InvertedIndexSerializer<C>,
fieldnorm_readers: FieldNormReaders,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {

View File

@@ -13,7 +13,7 @@ pub struct SegmentSerializer<C: crate::codec::Codec> {
pub(crate) store_writer: StoreWriter,
fast_field_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
postings_serializer: InvertedIndexSerializer<C>,
}
impl<C: crate::codec::Codec> SegmentSerializer<C> {
@@ -55,7 +55,7 @@ impl<C: crate::codec::Codec> SegmentSerializer<C> {
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer<C> {
&mut self.postings_serializer
}

View File

@@ -239,7 +239,7 @@ pub fn merge_filtered_segments<C: crate::codec::Codec, T: Into<Box<dyn Directory
))
.trim_end()
);
let codec_configuration = CodecConfiguration::from(segments[0].index().codec());
let codec_configuration = CodecConfiguration::from_codec(segments[0].index().codec());
let index_meta = IndexMeta {
index_settings: target_settings, // index_settings of all segments should be the same
@@ -410,7 +410,7 @@ impl<Codec: crate::codec::Codec> SegmentUpdater<Codec> {
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
committed_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
let codec = CodecConfiguration::from(index.codec());
let codec = CodecConfiguration::from_codec(index.codec());
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
segments: committed_segment_metas,

View File

@@ -53,7 +53,7 @@ impl<Codec: crate::codec::Codec, D: Document> SingleSegmentIndexWriter<Codec, D>
schema: index.schema(),
opstamp: 0,
payload: None,
codec: CodecConfiguration::from(index.codec()),
codec: CodecConfiguration::from_codec(index.codec()),
};
save_metas(&index_meta, index.directory())?;
index.directory().sync_directory()?;

View File

@@ -228,7 +228,7 @@ pub use crate::core::{json_utils, Executor, Searcher, SearcherGeneration};
pub use crate::directory::Directory;
pub use crate::index::{
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
SegmentMeta, SegmentReader, TantivyInvertedIndexReader,
SegmentMeta, SegmentReader,
};
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::schema::{Document, TantivyDocument, Term};
@@ -380,7 +380,7 @@ pub mod tests {
use common::{BinarySerializable, FixedSize};
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
use rand::distr::{Bernoulli, Uniform};
use rand::distributions::{Bernoulli, Uniform};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use time::OffsetDateTime;
@@ -431,7 +431,7 @@ pub mod tests {
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
let seed: [u8; 32] = [1; 32];
StdRng::from_seed(seed)
.sample_iter(&Uniform::new(0u32, max_value).unwrap())
.sample_iter(&Uniform::new(0u32, max_value))
.take(n_elems)
.collect::<Vec<u32>>()
}

View File

@@ -397,10 +397,7 @@ mod bench {
let mut seed: [u8; 32] = [0; 32];
seed[31] = seed_val;
let mut rng = StdRng::from_seed(seed);
(0u32..)
.filter(|_| rng.random_bool(ratio))
.take(n)
.collect()
(0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
}
pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {

View File

@@ -3,6 +3,7 @@ use std::io;
use common::json_path_writer::JSON_END_OF_PATH;
use stacker::Addr;
use crate::codec::Codec;
use crate::indexer::indexing_term::IndexingTerm;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::postings_writer::SpecializedPostingsWriter;
@@ -52,12 +53,12 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
}
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
fn serialize<C: Codec>(
&self,
ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()> {
let mut term_buffer = JsonTermSerializer(Vec::with_capacity(48));
let mut buffer_lender = BufferLender::default();

View File

@@ -12,8 +12,7 @@ mod per_field_postings_writer;
mod postings;
mod postings_writer;
mod recorder;
pub(crate) mod serializer;
pub(crate) mod skip;
mod serializer;
mod term_info;
pub(crate) use loaded_postings::LoadedPostings;
@@ -595,13 +594,13 @@ mod bench {
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size {
let mut doc = TantivyDocument::default();
if rng.random_bool(1f64 / 15f64) {
if rng.gen_bool(1f64 / 15f64) {
doc.add_text(text_field, "a");
}
if rng.random_bool(1f64 / 10f64) {
if rng.gen_bool(1f64 / 10f64) {
doc.add_text(text_field, "b");
}
if rng.random_bool(1f64 / 5f64) {
if rng.gen_bool(1f64 / 5f64) {
doc.add_text(text_field, "c");
}
doc.add_text(text_field, "d");

View File

@@ -4,6 +4,7 @@ use std::ops::Range;
use stacker::Addr;
use crate::codec::Codec;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::indexing_term::IndexingTerm;
use crate::indexer::path_to_unordered_id::OrderedPathId;
@@ -48,12 +49,12 @@ fn make_field_partition(
/// Serialize the inverted index.
/// It pushes all term, one field at a time, towards the
/// postings serializer.
pub(crate) fn serialize_postings(
pub(crate) fn serialize_postings<C: Codec>(
ctx: IndexingContext,
schema: Schema,
per_field_postings_writers: &PerFieldPostingsWriter,
fieldnorm_readers: FieldNormReaders,
serializer: &mut InvertedIndexSerializer,
serializer: &mut InvertedIndexSerializer<C>,
) -> crate::Result<()> {
// Replace unordered ids by ordered ids to be able to sort
let unordered_id_to_ordered_id: Vec<OrderedPathId> =
@@ -166,12 +167,12 @@ impl PostingsWriter for PostingsWriterEnum {
}
}
fn serialize(
fn serialize<C: Codec>(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()> {
match self {
PostingsWriterEnum::DocId(writer) => {
@@ -254,12 +255,12 @@ pub(crate) trait PostingsWriter: Send + Sync {
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
fn serialize<C: Codec>(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()>;
/// Tokenize a text and subscribe all of its token.
@@ -311,12 +312,12 @@ pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
#[inline]
pub(crate) fn serialize_one_term(
pub(crate) fn serialize_one_term<C: Codec>(
term: &[u8],
addr: Addr,
buffer_lender: &mut BufferLender,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()> {
let recorder: Rec = ctx.term_index.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
@@ -357,12 +358,12 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
});
}
fn serialize(
fn serialize<C: Codec>(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
_ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (_field, _path_id, term, addr) in term_addrs {

View File

@@ -1,6 +1,7 @@
use common::read_u32_vint;
use stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::codec::Codec;
use crate::postings::FieldSerializer;
use crate::DocId;
@@ -67,10 +68,10 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, arena: &mut MemoryArena);
/// Pushes the postings information to the serializer.
fn serialize(
fn serialize<C: Codec>(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
buffer_lender: &mut BufferLender,
);
/// Returns the number of document containing this term.
@@ -110,10 +111,10 @@ impl Recorder for DocIdRecorder {
#[inline]
fn close_doc(&mut self, _arena: &mut MemoryArena) {}
fn serialize(
fn serialize<C: Codec>(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
buffer_lender: &mut BufferLender,
) {
let buffer = buffer_lender.lend_u8();
@@ -178,10 +179,10 @@ impl Recorder for TermFrequencyRecorder {
self.current_tf = 0;
}
fn serialize(
fn serialize<C: Codec>(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
buffer_lender: &mut BufferLender,
) {
let buffer = buffer_lender.lend_u8();
@@ -235,10 +236,10 @@ impl Recorder for TfAndPositionRecorder {
self.stack.writer(arena).write_u32_vint(POSITION_END);
}
fn serialize(
fn serialize<C: Codec>(
&self,
arena: &MemoryArena,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<C>,
buffer_lender: &mut BufferLender,
) {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();

View File

@@ -1,16 +1,14 @@
use std::cmp::Ordering;
use std::io::{self, Write};
use common::{BinarySerializable, CountingWriter, VInt};
use common::{BinarySerializable, CountingWriter};
use super::TermInfo;
use crate::codec::postings::PostingsSerializer;
use crate::codec::Codec;
use crate::directory::{CompositeWrite, WritePtr};
use crate::fieldnorm::FieldNormReader;
use crate::index::Segment;
use crate::positions::PositionSerializer;
use crate::postings::compression::{BlockEncoder, VIntEncoder as _, COMPRESSION_BLOCK_SIZE};
use crate::postings::skip::SkipSerializer;
use crate::query::Bm25Weight;
use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema};
use crate::termdict::TermDictionaryBuilder;
use crate::{DocId, Score};
@@ -46,24 +44,27 @@ use crate::{DocId, Score};
///
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct InvertedIndexSerializer {
pub struct InvertedIndexSerializer<C: Codec> {
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema,
codec: C,
}
impl InvertedIndexSerializer {
use crate::codec::postings::PostingsCodec;
impl<C: Codec> InvertedIndexSerializer<C> {
/// Open a new `InvertedIndexSerializer` for the given segment
pub fn open<C: crate::codec::Codec>(
segment: &mut Segment<C>,
) -> crate::Result<InvertedIndexSerializer> {
pub fn open(segment: &mut Segment<C>) -> crate::Result<InvertedIndexSerializer<C>> {
use crate::index::SegmentComponent::{Positions, Postings, Terms};
let codec = segment.index().codec().clone();
let inv_index_serializer = InvertedIndexSerializer {
terms_write: CompositeWrite::wrap(segment.open_write(Terms)?),
postings_write: CompositeWrite::wrap(segment.open_write(Postings)?),
positions_write: CompositeWrite::wrap(segment.open_write(Positions)?),
schema: segment.schema(),
codec,
};
Ok(inv_index_serializer)
}
@@ -77,7 +78,7 @@ impl InvertedIndexSerializer {
field: Field,
total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'_>> {
) -> io::Result<FieldSerializer<'_, C>> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -90,6 +91,7 @@ impl InvertedIndexSerializer {
postings_write,
positions_write,
fieldnorm_reader,
&self.codec,
)
}
@@ -104,9 +106,9 @@ impl InvertedIndexSerializer {
/// The field serializer is in charge of
/// the serialization of a specific field.
pub struct FieldSerializer<'a> {
pub struct FieldSerializer<'a, C: Codec> {
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
postings_serializer: PostingsSerializer,
postings_serializer: <C::PostingsCodec as PostingsCodec>::PostingsSerializer,
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
current_term_info: TermInfo,
term_open: bool,
@@ -114,7 +116,7 @@ pub struct FieldSerializer<'a> {
postings_start_offset: u64,
}
impl<'a> FieldSerializer<'a> {
impl<'a, C: Codec> FieldSerializer<'a, C> {
fn create(
field_type: &FieldType,
total_num_tokens: u64,
@@ -122,7 +124,8 @@ impl<'a> FieldSerializer<'a> {
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'a>> {
codec: &C,
) -> io::Result<FieldSerializer<'a, C>> {
total_num_tokens.serialize(postings_write)?;
let index_record_option = field_type
.index_record_option()
@@ -132,8 +135,11 @@ impl<'a> FieldSerializer<'a> {
.as_ref()
.map(|ff_reader| total_num_tokens as Score / ff_reader.num_docs() as Score)
.unwrap_or(0.0);
let postings_serializer =
PostingsSerializer::new(average_fieldnorm, index_record_option, fieldnorm_reader);
let postings_serializer = codec.postings_codec().new_serializer(
average_fieldnorm,
index_record_option,
fieldnorm_reader,
);
let positions_serializer_opt = if index_record_option.has_positions() {
Some(PositionSerializer::new(positions_write))
} else {
@@ -186,7 +192,6 @@ impl<'a> FieldSerializer<'a> {
"Called new_term, while the previous term was not closed."
);
self.term_open = true;
self.postings_serializer.clear();
self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)?;
self.postings_serializer
@@ -250,223 +255,3 @@ impl<'a> FieldSerializer<'a> {
Ok(())
}
}
struct Block {
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
len: usize,
}
impl Block {
fn new() -> Self {
Block {
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
len: 0,
}
}
fn doc_ids(&self) -> &[DocId] {
&self.doc_ids[..self.len]
}
fn term_freqs(&self) -> &[u32] {
&self.term_freqs[..self.len]
}
fn clear(&mut self) {
self.len = 0;
}
fn append_doc(&mut self, doc: DocId, term_freq: u32) {
let len = self.len;
self.doc_ids[len] = doc;
self.term_freqs[len] = term_freq;
self.len = len + 1;
}
fn is_full(&self) -> bool {
self.len == COMPRESSION_BLOCK_SIZE
}
fn is_empty(&self) -> bool {
self.len == 0
}
fn last_doc(&self) -> DocId {
assert_eq!(self.len, COMPRESSION_BLOCK_SIZE);
self.doc_ids[COMPRESSION_BLOCK_SIZE - 1]
}
}
pub struct PostingsSerializer {
last_doc_id_encoded: u32,
block_encoder: BlockEncoder,
block: Box<Block>,
postings_write: Vec<u8>,
skip_write: SkipSerializer,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<Bm25Weight>,
avg_fieldnorm: Score, /* Average number of term in the field for that segment.
* this value is used to compute the block wand information. */
term_has_freq: bool,
}
impl PostingsSerializer {
pub fn new(
avg_fieldnorm: Score,
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> PostingsSerializer {
PostingsSerializer {
block_encoder: BlockEncoder::new(),
block: Box::new(Block::new()),
postings_write: Vec::new(),
skip_write: SkipSerializer::new(),
last_doc_id_encoded: 0u32,
mode,
fieldnorm_reader,
bm25_weight: None,
avg_fieldnorm,
term_has_freq: false,
}
}
pub fn new_term(&mut self, term_doc_freq: u32, record_term_freq: bool) {
self.bm25_weight = None;
self.term_has_freq = self.mode.has_freq() && record_term_freq;
if !self.term_has_freq {
return;
}
let num_docs_in_segment: u64 =
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
fieldnorm_reader.num_docs() as u64
} else {
return;
};
if num_docs_in_segment == 0 {
return;
}
self.bm25_weight = Some(Bm25Weight::for_one_term_without_explain(
term_doc_freq as u64,
num_docs_in_segment,
self.avg_fieldnorm,
));
}
fn write_block(&mut self) {
{
// encode the doc ids
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.skip_write
.write_doc(self.last_doc_id_encoded, num_bits);
// last el block 0, offset block 1,
self.postings_write.extend(block_encoded);
}
if self.term_has_freq {
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_unsorted(self.block.term_freqs(), true);
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.mode.has_positions() {
// We serialize the sum of term freqs within the skip information
// in order to navigate through positions.
let sum_freq = self.block.term_freqs().iter().cloned().sum();
self.skip_write.write_total_term_freq(sum_freq);
}
let mut blockwand_params = (0u8, 0u32);
if let Some(bm25_weight) = self.bm25_weight.as_ref() {
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
let docs = self.block.doc_ids().iter().cloned();
let term_freqs = self.block.term_freqs().iter().cloned();
let fieldnorms = docs.map(|doc| fieldnorm_reader.fieldnorm_id(doc));
blockwand_params = fieldnorms
.zip(term_freqs)
.max_by(
|(left_fieldnorm_id, left_term_freq),
(right_fieldnorm_id, right_term_freq)| {
let left_score =
bm25_weight.tf_factor(*left_fieldnorm_id, *left_term_freq);
let right_score =
bm25_weight.tf_factor(*right_fieldnorm_id, *right_term_freq);
left_score
.partial_cmp(&right_score)
.unwrap_or(Ordering::Equal)
},
)
.unwrap();
}
}
let (fieldnorm_id, term_freq) = blockwand_params;
self.skip_write.write_blockwand_max(fieldnorm_id, term_freq);
}
self.block.clear();
}
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) {
self.block.append_doc(doc_id, term_freq);
if self.block.is_full() {
self.write_block();
}
}
pub fn close_term(
&mut self,
doc_freq: u32,
output_write: &mut impl std::io::Write,
) -> io::Result<()> {
if !self.block.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self
.block_encoder
.compress_vint_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies
if self.term_has_freq {
let block_encoded = self
.block_encoder
.compress_vint_unsorted(self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
}
self.block.clear();
}
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(output_write)?;
output_write.write_all(skip_data)?;
}
output_write.write_all(&self.postings_write[..])?;
self.skip_write.clear();
self.postings_write.clear();
self.bm25_weight = None;
Ok(())
}
fn clear(&mut self) {
self.block.clear();
self.last_doc_id_encoded = 0;
}
}

View File

@@ -1,15 +1,14 @@
use std::collections::HashMap;
use crate::codec::{ObjectSafeCodec, SumOrDoNothingCombiner};
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::index::SegmentReader;
use crate::query::disjunction::Disjunction;
use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::weight::for_each_docset_buffered;
use crate::query::weight::{for_each_docset_buffered, for_each_scorer};
use crate::query::{
box_scorer, intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude,
Explanation, Occur, RequiredOptionalScorer, Scorer, SumCombiner, Weight,
Explanation, Occur, RequiredOptionalScorer, Scorer, Weight,
};
use crate::{DocId, Score};
@@ -38,7 +37,6 @@ fn scorer_union<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner_fn: impl Fn() -> TScoreCombiner,
num_docs: u32,
codec: &dyn ObjectSafeCodec,
) -> Box<dyn Scorer>
where
TScoreCombiner: ScoreCombiner,
@@ -46,30 +44,11 @@ where
match scorers.len() {
0 => box_scorer(EmptyScorer),
1 => scorers.into_iter().next().unwrap(),
_ => {
let combiner_opt: Option<SumOrDoNothingCombiner> = if std::any::TypeId::of::<
TScoreCombiner,
>() == std::any::TypeId::of::<
SumCombiner,
>() {
Some(SumOrDoNothingCombiner::Sum)
} else if std::any::TypeId::of::<TScoreCombiner>()
== std::any::TypeId::of::<DoNothingCombiner>()
{
Some(SumOrDoNothingCombiner::DoNothing)
} else {
None
};
if let Some(combiner) = combiner_opt {
codec.build_union_scorer_with_sum_combiner(scorers, num_docs, combiner)
} else {
box_scorer(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
num_docs,
))
}
}
_ => box_scorer(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
num_docs,
)),
}
}
@@ -243,13 +222,11 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
should_scorers,
&score_combiner_fn,
num_docs,
reader.codec(),
)),
1 => ShouldScorersCombinationMethod::Required(scorer_union(
should_scorers,
&score_combiner_fn,
num_docs,
reader.codec(),
)),
n if num_of_should_scorers == n => {
// When num_of_should_scorers equals the number of should clauses,
@@ -268,12 +245,8 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
let exclude_scorer_opt: Option<Box<dyn Scorer>> = if exclude_scorers.is_empty() {
None
} else {
let exclude_scorers_union: Box<dyn Scorer> = scorer_union(
exclude_scorers,
DoNothingCombiner::default,
num_docs,
reader.codec(),
);
let exclude_scorers_union: Box<dyn Scorer> =
scorer_union(exclude_scorers, DoNothingCombiner::default, num_docs);
Some(exclude_scorers_union)
};
@@ -421,7 +394,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let mut scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
scorer.for_each(callback);
for_each_scorer(scorer.as_mut(), callback);
Ok(())
}
@@ -453,7 +426,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
reader.codec().for_each_pruning(threshold, scorer, callback);
reader.codec.for_each_pruning(threshold, scorer, callback);
Ok(())
}
}

View File

@@ -1,6 +1,6 @@
use std::fmt;
use crate::docset::{SeekDangerResult, COLLECT_BLOCK_BUFFER_LEN};
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::fastfield::AliveBitSet;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, Term};
@@ -104,8 +104,8 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
fn seek(&mut self, target: DocId) -> DocId {
self.underlying.seek(target)
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
self.underlying.seek_danger(target)
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
self.underlying.seek_into_the_danger_zone(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {

View File

@@ -1,7 +1,6 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use crate::docset::SeekDangerResult;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ScoreCombiner, Scorer};
use crate::{DocId, DocSet, Score, TERMINATED};
@@ -68,12 +67,10 @@ impl<T: Scorer> DocSet for ScorerWrapper<T> {
self.current_doc = doc_id;
doc_id
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
let result = self.scorer.seek_danger(target);
if result == SeekDangerResult::Found {
self.current_doc = target;
}
result
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
let found = self.scorer.seek_into_the_danger_zone(target);
self.current_doc = self.scorer.doc();
found
}
fn doc(&self) -> DocId {

View File

@@ -1,5 +1,5 @@
use super::size_hint::estimate_intersection;
use crate::docset::{DocSet, SeekDangerResult, TERMINATED};
use crate::docset::{DocSet, TERMINATED};
use crate::query::term_query::TermScorer;
use crate::query::{box_scorer, EmptyScorer, Scorer};
use crate::{DocId, Score};
@@ -84,14 +84,6 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
docsets.sort_by_key(|docset| docset.cost());
go_to_first_doc(&mut docsets);
let left = docsets.remove(0);
debug_assert!({
let doc = left.doc();
if doc == TERMINATED {
true
} else {
docsets.iter().all(|docset| docset.doc() == doc)
}
});
let right = docsets.remove(0);
Intersection {
left,
@@ -116,61 +108,46 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
#[inline]
fn advance(&mut self) -> DocId {
let (left, right) = (&mut self.left, &mut self.right);
let mut candidate = left.advance();
if candidate == TERMINATED {
return TERMINATED;
}
// Invariant:
// - candidate is always <= to the next document in the intersection.
// - candidate strictly increases at every occurence of the loop.
let mut candidate = left.doc() + 1;
loop {
// In the first part we look for a document in the intersection
// of the two rarest `DocSet` in the intersection.
// Termination: candidate strictly increases.
'outer: while candidate < TERMINATED {
// As we enter the loop, we should always have candidate < next_doc.
candidate = left.seek(candidate);
// Left is positionned on `candidate`.
debug_assert_eq!(left.doc(), candidate);
if let SeekDangerResult::SeekLowerBound(seek_lower_bound) = right.seek_danger(candidate)
{
debug_assert!(
seek_lower_bound == TERMINATED || seek_lower_bound > candidate,
"seek_lower_bound {seek_lower_bound} must be greater than candidate \
{candidate}"
);
candidate = seek_lower_bound;
continue;
}
// Left and right are positionned on `candidate`.
debug_assert_eq!(right.doc(), candidate);
for other in &mut self.others {
if let SeekDangerResult::SeekLowerBound(seek_lower_bound) =
other.seek_danger(candidate)
{
// One of the scorer does not match, let's restart at the top of the loop.
debug_assert!(
seek_lower_bound == TERMINATED || seek_lower_bound > candidate,
"seek_lower_bound {seek_lower_bound} must be greater than candidate \
{candidate}"
);
candidate = seek_lower_bound;
continue 'outer;
loop {
if right.seek_into_the_danger_zone(candidate) {
break;
}
let right_doc = right.doc();
// TODO: Think about which value would make sense here
// It depends on the DocSet implementation, when a seek would outweigh an advance.
if right_doc > candidate.wrapping_add(100) {
candidate = left.seek(right_doc);
} else {
candidate = left.advance();
}
if candidate == TERMINATED {
return TERMINATED;
}
}
// At this point all scorers are in a valid state, aligned on the next document in the
// intersection.
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate;
debug_assert_eq!(left.doc(), right.doc());
// test the remaining scorers
if self
.others
.iter_mut()
.all(|docset| docset.seek_into_the_danger_zone(candidate))
{
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate;
}
candidate = left.advance();
}
// We make sure our docset is in a valid state.
// In particular, we want .doc() to return TERMINATED.
left.seek(TERMINATED);
TERMINATED
}
fn seek(&mut self, target: DocId) -> DocId {
@@ -189,19 +166,13 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
///
/// Some implementations may choose to advance past the target if beneficial for performance.
/// The return value is `true` if the target is in the docset, and `false` otherwise.
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
if let SeekDangerResult::SeekLowerBound(new_target) = self.left.seek_danger(target) {
return SeekDangerResult::SeekLowerBound(new_target);
}
if let SeekDangerResult::SeekLowerBound(new_target) = self.right.seek_danger(target) {
return SeekDangerResult::SeekLowerBound(new_target);
}
for docset in &mut self.others {
if let SeekDangerResult::SeekLowerBound(new_target) = docset.seek_danger(target) {
return SeekDangerResult::SeekLowerBound(new_target);
}
}
SeekDangerResult::Found
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
self.left.seek_into_the_danger_zone(target)
&& self.right.seek_into_the_danger_zone(target)
&& self
.others
.iter_mut()
.all(|docset| docset.seek_into_the_danger_zone(target))
}
#[inline]
@@ -244,12 +215,9 @@ mod tests {
use proptest::prelude::*;
use super::Intersection;
use crate::collector::Count;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::tests::test_skip_against_unoptimized;
use crate::query::{QueryParser, VecDocSet};
use crate::schema::{Schema, TEXT};
use crate::Index;
use crate::query::VecDocSet;
#[test]
fn test_intersection() {
@@ -336,58 +304,6 @@ mod tests {
assert_eq!(intersection.doc(), TERMINATED);
}
#[test]
fn test_intersection_abc() {
let a = VecDocSet::from(vec![2, 3, 6]);
let b = VecDocSet::from(vec![1, 3, 5]);
let c = VecDocSet::from(vec![1, 3, 5]);
let mut intersection = Intersection::new(vec![c, b, a], 10);
let mut docs = Vec::new();
use crate::DocSet;
while intersection.doc() != TERMINATED {
docs.push(intersection.doc());
intersection.advance();
}
assert_eq!(&docs, &[3]);
}
#[test]
fn test_intersection_termination() {
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{BufferedUnionScorer, ConstScorer, VecDocSet};
let a1 = ConstScorer::new(VecDocSet::from(vec![0u32, 10000]), 1.0);
let a2 = ConstScorer::new(VecDocSet::from(vec![0u32, 10000]), 1.0);
let mut b_scorers = vec![];
for _ in 0..2 {
// Union matches 0 and 10000.
b_scorers.push(ConstScorer::new(VecDocSet::from(vec![0, 10000]), 1.0));
}
// That's the union of two scores matching 0, and 10_000.
let union = BufferedUnionScorer::build(b_scorers, DoNothingCombiner::default, 30000);
// Mismatching scorer: matches 0 and 20000. We then append more docs at the end to ensure it
// is last.
let mut m_docs = vec![0, 20000];
for i in 30000..30100 {
m_docs.push(i);
}
let m = ConstScorer::new(VecDocSet::from(m_docs), 1.0);
// Costs: A1=2, A2=2, Union=4, M=102.
// Sorted: A1, A2, Union, M.
// Left=A1, Right=A2, Others=[Union, M].
let mut intersection = crate::query::intersect_scorers(
vec![Box::new(a1), Box::new(a2), Box::new(union), Box::new(m)],
40000,
);
while intersection.doc() != TERMINATED {
intersection.advance();
}
}
// Strategy to generate sorted and deduplicated vectors of u32 document IDs
fn sorted_deduped_vec(max_val: u32, max_size: usize) -> impl Strategy<Value = Vec<u32>> {
prop::collection::vec(0..max_val, 0..max_size).prop_map(|mut vec| {
@@ -419,30 +335,6 @@ mod tests {
}
assert_eq!(intersection.doc(), TERMINATED);
}
}
#[test]
fn test_bug_2811_intersection_candidate_should_increase() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer
.add_document(doc!(text_field=>"hello happy tax"))
.unwrap();
writer.add_document(doc!(text_field=>"hello")).unwrap();
writer.add_document(doc!(text_field=>"hello")).unwrap();
writer.add_document(doc!(text_field=>"happy tax")).unwrap();
writer.commit().unwrap();
let query_parser = QueryParser::for_index(&index, Vec::new());
let query = query_parser
.parse_query(r#"+text:hello +text:"happy tax""#)
.unwrap();
let searcher = index.reader().unwrap().searcher();
let c = searcher.search(&*query, &Count).unwrap();
assert_eq!(c, 1);
}
}

View File

@@ -1,4 +1,4 @@
use crate::docset::{DocSet, SeekDangerResult, TERMINATED};
use crate::docset::{DocSet, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
@@ -188,16 +188,11 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
self.advance()
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
let seek_res = self.phrase_scorer.seek_danger(target);
if seek_res != SeekDangerResult::Found {
return seek_res;
}
// The intersection matched. Now let's see if we match the prefix.
if self.matches_prefix() {
SeekDangerResult::Found
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
if self.phrase_scorer.seek_into_the_danger_zone(target) {
self.matches_prefix()
} else {
SeekDangerResult::SeekLowerBound(target + 1)
false
}
}

View File

@@ -3,9 +3,9 @@ use crate::fieldnorm::FieldNormReader;
use crate::index::SegmentReader;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
use crate::query::{box_scorer, EmptyScorer, Scorer, Weight};
use crate::query::{box_scorer, EmptyScorer, Explanation, Scorer, Weight};
use crate::schema::{IndexRecordOption, Term};
use crate::Score;
use crate::{DocId, Score};
pub struct PhrasePrefixWeight {
phrase_terms: Vec<(usize, Term)>,
@@ -102,6 +102,7 @@ impl PhrasePrefixWeight {
}
}
// TODO make this specialized.
Ok(Some(box_scorer(PhrasePrefixScorer::new(
term_postings_list,
similarity_weight_opt,
@@ -120,6 +121,26 @@ impl Weight for PhrasePrefixWeight {
Ok(box_scorer(EmptyScorer))
}
}
fn explain(&self, _reader: &SegmentReader, _doc: DocId) -> crate::Result<Explanation> {
todo!();
// let scorer_opt = self.phrase_scorer(reader, 1.0)?;
// if scorer_opt.is_none() {
// return Err(does_not_match(doc));
// }
// let mut scorer = scorer_opt.unwrap();
// if scorer.seek(doc) != doc {
// return Err(does_not_match(doc));
// }
// let fieldnorm_reader = self.fieldnorm_reader(reader)?;
// let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
// let phrase_count = scorer.phrase_count();
// let mut explanation = Explanation::new("Phrase Prefix Scorer", scorer.score());
// if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
// explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
// }
// Ok(explanation)
}
}
#[cfg(test)]

View File

@@ -1,7 +1,7 @@
use std::cmp::Ordering;
use crate::codec::standard::postings::StandardPostings;
use crate::docset::{DocSet, SeekDangerResult, TERMINATED};
use crate::docset::{DocSet, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
@@ -532,23 +532,12 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
self.advance()
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
debug_assert!(
target >= self.doc(),
"target ({}) should be greater than or equal to doc ({})",
target,
self.doc()
);
let seek_res = self.intersection_docset.seek_danger(target);
if seek_res != SeekDangerResult::Found {
return seek_res;
}
// The intersection matched. Now let's see if we match the phrase.
if self.phrase_match() {
SeekDangerResult::Found
} else {
SeekDangerResult::SeekLowerBound(target + 1)
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
debug_assert!(target >= self.doc());
if self.intersection_docset.seek_into_the_danger_zone(target) && self.phrase_match() {
return true;
}
false
}
fn doc(&self) -> DocId {

View File

@@ -67,6 +67,7 @@ impl PhraseWeight {
let mut term_infos: Vec<(usize, TermInfo)> = Vec::with_capacity(self.phrase_terms.len());
// TODO make it specialized
for &(offset, ref term) in &self.phrase_terms {
let Some(term_info) = inverted_index_reader.get_term_info(term)? else {
return Ok(None);
@@ -74,12 +75,12 @@ impl PhraseWeight {
term_infos.push((offset, term_info));
}
let scorer = reader.codec().new_phrase_scorer_type_erased(
let scorer = reader.codec.new_phrase_scorer_type_erased(
&term_infos[..],
similarity_weight_opt,
fieldnorm_reader,
self.slop,
inverted_index_reader.as_ref(),
&inverted_index_reader,
)?;
Ok(Some(scorer))

View File

@@ -86,8 +86,7 @@ impl RegexPhraseWeight {
"Phrase query exceeded max expansions {num_terms}"
)));
}
let union =
Self::get_union_from_term_infos(&term_infos, reader, inverted_index.as_ref())?;
let union = Self::get_union_from_term_infos(&term_infos, reader, &inverted_index)?;
posting_lists.push((offset, union));
}
@@ -102,7 +101,7 @@ impl RegexPhraseWeight {
/// Add all docs of the term to the docset
fn add_to_bitset(
inverted_index: &dyn InvertedIndexReader,
inverted_index: &InvertedIndexReader,
term_info: &TermInfo,
doc_bitset: &mut BitSet,
) -> crate::Result<()> {
@@ -169,7 +168,7 @@ impl RegexPhraseWeight {
pub(crate) fn get_union_from_term_infos(
term_infos: &[TermInfo],
reader: &SegmentReader,
inverted_index: &dyn InvertedIndexReader,
inverted_index: &InvertedIndexReader,
) -> crate::Result<UnionType> {
let max_doc = reader.max_doc();
@@ -305,7 +304,7 @@ mod tests {
#![proptest_config(ProptestConfig::with_cases(50))]
#[test]
fn test_phrase_regex_with_random_strings(mut random_strings in proptest::collection::vec("[c-z ]{0,10}", 1..100), num_occurrences in 1..150_usize) {
let mut rng = rand::rng();
let mut rng = rand::thread_rng();
// Insert "aaa ccc" the specified number of times into the list
for _ in 0..num_occurrences {

View File

@@ -422,7 +422,7 @@ mod tests {
docs.push(doc);
}
docs.shuffle(&mut rand::rng());
docs.shuffle(&mut rand::thread_rng());
let mut docs_it = docs.into_iter();
for doc in (&mut docs_it).take(50) {
index_writer.add_document(doc)?;

View File

@@ -492,7 +492,7 @@ mod tests {
use common::DateTime;
use proptest::prelude::*;
use rand::rngs::StdRng;
use rand::seq::IndexedRandom;
use rand::seq::SliceRandom;
use rand::SeedableRng;
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

View File

@@ -1,6 +1,6 @@
use std::marker::PhantomData;
use crate::docset::{DocSet, SeekDangerResult};
use crate::docset::DocSet;
use crate::query::score_combiner::ScoreCombiner;
use crate::query::Scorer;
use crate::{DocId, Score};
@@ -56,9 +56,9 @@ where
self.req_scorer.seek(target)
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
self.score_cache = None;
self.req_scorer.seek_danger(target)
self.req_scorer.seek_into_the_danger_zone(target)
}
fn doc(&self) -> DocId {

View File

@@ -34,15 +34,6 @@ pub trait Scorer: downcast_rs::Downcast + DocSet + 'static {
for_each_pruning_scorer_default_impl(self, threshold, callback);
}
/// Calls `callback` with all of the `(doc, score)` in the scorer.
fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) {
let mut doc = self.doc();
while doc != TERMINATED {
callback(doc, self.score());
doc = self.advance();
}
}
/// Returns an explanation for the score of the current document.
fn explain(&mut self) -> Explanation {
let score = self.score();
@@ -79,10 +70,6 @@ impl Scorer for Box<dyn Scorer> {
) {
self.deref_mut().for_each_pruning(threshold, callback);
}
fn for_each(&mut self, callback: &mut dyn FnMut(DocId, Score)) {
self.deref_mut().for_each(callback);
}
}
/// Calls `callback` with all of the `(doc, score)` for which score

View File

@@ -53,7 +53,7 @@ impl<TPostingsWithBlockMax: PostingsWithBlockMax> TermScorer<TPostingsWithBlockM
/// effective maximum score of the block.
pub(crate) fn seek_block_max(&mut self, target_doc: DocId) -> Score {
self.postings
.seek_block_max(target_doc, &self.fieldnorm_reader, &self.similarity_weight)
.seek_block_max(target_doc, &self.similarity_weight)
}
}
@@ -89,7 +89,6 @@ impl<TPostings: Postings> DocSet for TermScorer<TPostings> {
#[inline]
fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(target >= self.doc());
self.postings.seek(target)
}
@@ -122,16 +121,10 @@ impl<TPostings: Postings> Scorer for TermScorer<TPostings> {
mod tests {
use proptest::prelude::*;
use crate::index::SegmentId;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::term_query::TermScorer;
use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{
assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, Term, TERMINATED,
};
use crate::query::{Bm25Weight, Scorer};
use crate::{assert_nearly_equals, DocId, DocSet, Score, TERMINATED};
#[test]
fn test_term_scorer_max_score() -> crate::Result<()> {
@@ -219,110 +212,110 @@ mod tests {
}
}
#[test]
fn test_block_wand() {
let mut doc_tfs: Vec<(u32, u32)> = vec![];
for doc in 0u32..128u32 {
doc_tfs.push((doc, 1u32));
}
for doc in 128u32..256u32 {
doc_tfs.push((doc, if doc == 200 { 2u32 } else { 1u32 }));
}
doc_tfs.push((256, 1u32));
doc_tfs.push((257, 3u32));
doc_tfs.push((258, 1u32));
// #[test]
// fn test_block_wand() {
// let mut doc_tfs: Vec<(u32, u32)> = vec![];
// for doc in 0u32..128u32 {
// doc_tfs.push((doc, 1u32));
// }
// for doc in 128u32..256u32 {
// doc_tfs.push((doc, if doc == 200 { 2u32 } else { 1u32 }));
// }
// doc_tfs.push((256, 1u32));
// doc_tfs.push((257, 3u32));
// doc_tfs.push((258, 1u32));
let fieldnorms: Vec<u32> = std::iter::repeat_n(20u32, 300).collect();
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
assert_nearly_equals!(docs.seek_block_max(0), 2.5161593);
assert_nearly_equals!(docs.seek_block_max(135), 3.4597192);
// the block is not loaded yet.
assert_nearly_equals!(docs.seek_block_max(256), 5.2971773);
assert_eq!(256, docs.seek(256));
assert_nearly_equals!(docs.seek_block_max(256), 3.9539647);
}
// let fieldnorms: Vec<u32> = std::iter::repeat_n(20u32, 300).collect();
// let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
// let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
// assert_nearly_equals!(docs.seek_block(0), 2.5161593);
// assert_nearly_equals!(docs.seek_block(135), 3.4597192);
// // the block is not loaded yet.
// assert_nearly_equals!(docs.seek_block(256), 5.2971773);
// assert_eq!(256, docs.seek(256));
// assert_nearly_equals!(docs.seek_block(256), 3.9539647);
// }
fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) {
let term_weight = term_query
.specialized_weight(EnableScoring::enabled_from_searcher(searcher))
.unwrap();
for reader in searcher.segment_readers() {
let mut block_max_scores = vec![];
let mut block_max_scores_b = vec![];
let mut docs = vec![];
{
let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0).unwrap();
while term_scorer.doc() != TERMINATED {
let mut score = term_scorer.score();
docs.push(term_scorer.doc());
for _ in 0..128 {
score = score.max(term_scorer.score());
if term_scorer.advance() == TERMINATED {
break;
}
}
block_max_scores.push(score);
}
}
{
let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0).unwrap();
for d in docs {
let block_max_score = term_scorer.seek_block_max(d);
block_max_scores_b.push(block_max_score);
}
}
for (l, r) in block_max_scores
.iter()
.cloned()
.zip(block_max_scores_b.iter().cloned())
{
assert_nearly_equals!(l, r);
}
}
}
// fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) -> crate::Result<()> {
// let term_weight =
// term_query.specialized_weight(EnableScoring::enabled_from_searcher(searcher))?;
// for reader in searcher.segment_readers() {
// let mut block_max_scores = vec![];
// let mut block_max_scores_b = vec![];
// let mut docs = vec![];
// {
// let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap();
// while term_scorer.doc() != TERMINATED {
// let mut score = term_scorer.score();
// docs.push(term_scorer.doc());
// for _ in 0..128 {
// score = score.max(term_scorer.score());
// if term_scorer.advance() == TERMINATED {
// break;
// }
// }
// block_max_scores.push(score);
// }
// }
// {
// let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap();
// for d in docs {
// let block_max_score = term_scorer.seek_block(d);
// block_max_scores_b.push(block_max_score);
// }
// }
// for (l, r) in block_max_scores
// .iter()
// .cloned()
// .zip(block_max_scores_b.iter().cloned())
// {
// assert_nearly_equals!(l, r);
// }
// }
// Ok(())
// }
#[ignore]
#[test]
fn test_block_wand_long_test() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index
.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)
.unwrap();
use rand::Rng;
let mut rng = rand::rng();
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..3_000 {
let term_freq = rng.random_range(1..10000);
let words: Vec<&str> = std::iter::repeat_n("bbbb", term_freq).collect();
let text = words.join(" ");
writer.add_document(doc!(text_field=>text)).unwrap();
}
writer.commit().unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "bbbb"),
IndexRecordOption::WithFreqs,
);
let segment_ids: Vec<SegmentId>;
let reader = index.reader().unwrap();
{
let searcher = reader.searcher();
segment_ids = searcher
.segment_readers()
.iter()
.map(|segment| segment.segment_id())
.collect();
test_block_wand_aux(&term_query, &searcher);
}
writer.merge(&segment_ids[..]).wait().unwrap();
{
reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
test_block_wand_aux(&term_query, &searcher);
}
}
// #[ignore]
// #[test]
// fn test_block_wand_long_test() -> crate::Result<()> {
// let mut schema_builder = Schema::builder();
// let text_field = schema_builder.add_text_field("text", TEXT);
// let schema = schema_builder.build();
// let index = Index::create_in_ram(schema);
// let mut writer: IndexWriter =
// index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
// use rand::Rng;
// let mut rng = rand::thread_rng();
// writer.set_merge_policy(Box::new(NoMergePolicy));
// for _ in 0..3_000 {
// let term_freq = rng.gen_range(1..10000);
// let words: Vec<&str> = std::iter::repeat_n("bbbb", term_freq).collect();
// let text = words.join(" ");
// writer.add_document(doc!(text_field=>text))?;
// }
// writer.commit()?;
// let term_query = TermQuery::new(
// Term::from_field_text(text_field, "bbbb"),
// IndexRecordOption::WithFreqs,
// );
// let segment_ids: Vec<SegmentId>;
// let reader = index.reader()?;
// {
// let searcher = reader.searcher();
// segment_ids = searcher
// .segment_readers()
// .iter()
// .map(|segment| segment.segment_id())
// .collect();
// test_block_wand_aux(&term_query, &searcher)?;
// }
// writer.merge(&segment_ids[..]).wait().unwrap();
// {
// reader.reload()?;
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// test_block_wand_aux(&term_query, &searcher)?;
// }
// Ok(())
// }
}

View File

@@ -4,7 +4,7 @@ use crate::index::SegmentReader;
use crate::query::bm25::Bm25Weight;
use crate::query::explanation::does_not_match;
use crate::query::term_query::TermScorer;
use crate::query::weight::for_each_docset_buffered;
use crate::query::weight::{for_each_docset_buffered, for_each_scorer};
use crate::query::{box_scorer, AllScorer, AllWeight, EmptyScorer, Explanation, Scorer, Weight};
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TantivyError, Term};
@@ -73,11 +73,11 @@ impl Weight for TermWeight {
) -> crate::Result<()> {
match self.specialized_scorer(reader, 1.0)? {
TermOrEmptyOrAllScorer::TermScorer(mut term_scorer) => {
term_scorer.for_each(callback);
for_each_scorer(&mut *term_scorer, callback);
}
TermOrEmptyOrAllScorer::Empty => {}
TermOrEmptyOrAllScorer::AllMatch(mut all_scorer) => {
all_scorer.for_each(callback);
for_each_scorer(&mut all_scorer, callback);
}
}
Ok(())
@@ -125,7 +125,7 @@ impl Weight for TermWeight {
match specialized_scorer {
TermOrEmptyOrAllScorer::TermScorer(term_scorer) => {
reader
.codec()
.codec
.for_each_pruning(threshold, term_scorer, callback);
}
TermOrEmptyOrAllScorer::Empty => {}

View File

@@ -1,6 +1,6 @@
use common::TinySet;
use crate::docset::{DocSet, SeekDangerResult, TERMINATED};
use crate::docset::{DocSet, TERMINATED};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::size_hint::estimate_union;
use crate::query::Scorer;
@@ -255,47 +255,25 @@ where
}
}
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
if target >= TERMINATED {
return SeekDangerResult::SeekLowerBound(TERMINATED);
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
if self.is_in_horizon(target) {
// Our value is within the buffered horizon and the docset may already have been
// processed and removed, so we need to use seek, which uses the regular advance.
let seek_doc = self.seek(target);
if seek_doc == target {
return SeekDangerResult::Found;
} else {
return SeekDangerResult::SeekLowerBound(seek_doc);
};
}
// The docsets are not in the buffered range, so we can use seek_into_the_danger_zone
// of the underlying docsets
let mut is_hit = false;
let mut min_new_target = TERMINATED;
for docset in self.scorers.iter_mut() {
match docset.seek_danger(target) {
SeekDangerResult::Found => {
is_hit = true;
break;
}
SeekDangerResult::SeekLowerBound(new_target) => {
min_new_target = min_new_target.min(new_target);
}
}
}
// The API requires the DocSet to be in a valid state when `seek_into_the_danger_zone`
// returns Found.
if is_hit {
// The doc is found. Let's make sure we position the union on the target
// to bring it back to a valid state.
self.seek(target);
SeekDangerResult::Found
self.seek(target) == target
} else {
SeekDangerResult::SeekLowerBound(min_new_target)
// The docsets are not in the buffered range, so we can use seek_into_the_danger_zone
// of the underlying docsets
let is_hit = self
.scorers
.iter_mut()
.any(|docset| docset.seek_into_the_danger_zone(target));
// The API requires the DocSet to be in a valid state when `seek_into_the_danger_zone`
// returns true.
if is_hit {
self.seek(target);
}
is_hit
}
}
@@ -317,17 +295,21 @@ where
if self.doc == TERMINATED {
return 0;
}
let mut count = 1 + self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS]
let mut count = self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS]
.iter()
.copied()
.map(TinySet::len)
.sum::<u32>();
.map(|bitset| bitset.len())
.sum::<u32>()
+ 1;
for bitset in self.bitsets.iter_mut() {
bitset.clear();
}
while self.refill() {
count += self.bitsets.iter().copied().map(TinySet::len).sum::<u32>();
self.bitsets.fill(TinySet::empty());
count += self.bitsets.iter().map(|bitset| bitset.len()).sum::<u32>();
for bitset in self.bitsets.iter_mut() {
bitset.clear();
}
}
self.bucket_idx = HORIZON_NUM_TINYBITSETS;
self.doc = TERMINATED;
count
}
}

View File

@@ -14,7 +14,7 @@ mod tests {
use common::BitSet;
use super::{SimpleUnion, *};
use crate::docset::{DocSet, SeekDangerResult, TERMINATED};
use crate::docset::{DocSet, TERMINATED};
use crate::postings::tests::test_skip_against_unoptimized;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::union::bitset_union::BitSetPostingUnion;
@@ -254,27 +254,6 @@ mod tests {
vec![1, 2, 3, 7, 8, 9, 99, 100, 101, 500, 20000],
);
}
#[test]
fn test_buffered_union_seek_into_danger_zone_terminated() {
let scorer1 = ConstScorer::new(VecDocSet::from(vec![1, 2]), 1.0);
let scorer2 = ConstScorer::new(VecDocSet::from(vec![2, 3]), 1.0);
let mut union_scorer =
BufferedUnionScorer::build(vec![scorer1, scorer2], DoNothingCombiner::default, 100);
// Advance to end
while union_scorer.doc() != TERMINATED {
union_scorer.advance();
}
assert_eq!(union_scorer.doc(), TERMINATED);
assert_eq!(
union_scorer.seek_danger(TERMINATED),
SeekDangerResult::SeekLowerBound(TERMINATED)
);
}
}
#[cfg(all(test, feature = "unstable"))]

View File

@@ -17,9 +17,6 @@ pub struct VecDocSet {
impl From<Vec<DocId>> for VecDocSet {
fn from(doc_ids: Vec<DocId>) -> VecDocSet {
// We do not use `slice::is_sorted`, as we want to check for doc ids to be strictly
// sorted.
assert!(doc_ids.windows(2).all(|w| w[0] < w[1]));
VecDocSet { doc_ids, cursor: 0 }
}
}

View File

@@ -1,9 +1,21 @@
use super::Scorer;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::index::SegmentReader;
use crate::query::explanation::does_not_match;
use crate::query::Explanation;
use crate::{DocId, DocSet, Score};
use crate::{DocId, DocSet, Score, TERMINATED};
/// Iterates through all of the documents and scores matched by the DocSet
/// `DocSet`.
pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
scorer: &mut TScorer,
callback: &mut dyn FnMut(DocId, Score),
) {
let mut doc = scorer.doc();
while doc != TERMINATED {
callback(doc, scorer.score());
doc = scorer.advance();
}
}
/// Iterates through all of the documents matched by the DocSet
/// `DocSet`.
@@ -35,13 +47,7 @@ pub trait Weight: Send + Sync + 'static {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>>;
/// Returns an [`Explanation`] for the given document.
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.doc() > doc || scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
Ok(scorer.explain())
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation>;
/// Returns the number documents within the given [`SegmentReader`].
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
@@ -61,7 +67,7 @@ pub trait Weight: Send + Sync + 'static {
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let mut scorer = self.scorer(reader, 1.0)?;
scorer.for_each(callback);
for_each_scorer(scorer.as_mut(), callback);
Ok(())
}
@@ -89,6 +95,7 @@ pub trait Weight: Send + Sync + 'static {
///
/// More importantly, it makes it possible for scorers to implement
/// important optimization (e.g. BlockWAND for union).
// TODO remove and move to scorer?
fn for_each_pruning(
&self,
threshold: Score,

View File

@@ -124,6 +124,7 @@ impl SegmentSpaceUsage {
FieldNorms => PerField(self.fieldnorms().clone()),
Terms => PerField(self.termdict().clone()),
SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()),
Delete => Basic(self.deletes()),
}
}

View File

@@ -95,7 +95,7 @@ impl<'a> TermMerger<'a> {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::distributions::Alphanumeric;
use rand::{rng, Rng};
use rand::{thread_rng, Rng};
use test::{self, Bencher};
use super::TermMerger;
@@ -117,9 +117,9 @@ mod bench {
let buffer: Vec<u8> = {
let mut terms = vec![];
for _i in 0..num_terms {
let rand_string: String = rng()
let rand_string: String = thread_rng()
.sample_iter(&Alphanumeric)
.take(rng().random_range(30..42))
.take(thread_rng().gen_range(30..42))
.map(char::from)
.collect();
terms.push(rand_string);

View File

@@ -25,7 +25,7 @@ zstd-compression = ["zstd"]
proptest = "1"
criterion = { version = "0.5", default-features = false }
names = "0.14"
rand = "0.9"
rand = "0.8"
[[bench]]
name = "stream_bench"

View File

@@ -10,9 +10,9 @@ use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
const CHARSET: &[u8] = b"abcdefghij";
fn generate_key(rng: &mut impl Rng) -> String {
let len = rng.random_range(3..12);
let len = rng.gen_range(3..12);
std::iter::from_fn(|| {
let idx = rng.random_range(0..CHARSET.len());
let idx = rng.gen_range(0..CHARSET.len());
Some(CHARSET[idx] as char)
})
.take(len)

View File

@@ -23,12 +23,12 @@ name = "hashmap"
path = "example/hashmap.rs"
[dev-dependencies]
rand = "0.9"
rand = "0.8.5"
zipf = "7.0.0"
rustc-hash = "2.1.0"
proptest = "1.2.0"
binggan = { version = "0.14.0" }
rand_distr = "0.5"
rand_distr = "0.4.3"
[features]
compare_hash_only = ["ahash"] # Compare hash only, not the key in the Hashmap

View File

@@ -90,10 +90,10 @@ fn bench_vint() {
}
// benchmark zipfs distribution numbers
{
use rand::distr::Distribution;
use rand::distributions::Distribution;
use rand::rngs::StdRng;
let mut rng = StdRng::from_seed([3u8; 32]);
let zipf = rand_distr::Zipf::new(10_000.0f64, 1.03).unwrap();
let zipf = zipf::ZipfDistribution::new(10_000, 1.03).unwrap();
let numbers: Vec<[u8; 8]> = (0..num_numbers)
.map(|_| zipf.sample(&mut rng).to_le_bytes())
.collect();

View File

@@ -7,8 +7,8 @@ edition = "2021"
[dependencies]
ahash = "0.8.7"
rand = "0.9"
rand_distr = "0.5"
rand = "0.8.5"
rand_distr = "0.4.3"
tantivy-stacker = { version = "0.2.0", path = ".." }
[workspace]

View File

@@ -14,7 +14,7 @@ fn test_with_seed(seed: u64) {
let mut hash_map = AHashMap::new();
let mut arena_hashmap = ArenaHashMap::default();
let mut rng = StdRng::seed_from_u64(seed);
let key_count = rng.random_range(1_000..=1_000_000);
let key_count = rng.gen_range(1_000..=1_000_000);
let exp = Exp::new(0.05).unwrap();
for _ in 0..key_count {