Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-08 18:12:55 +00:00
Fix compilation errors after rebase
- Remove duplicate proptest_boolean_query module
- Remove duplicate cost() method implementations
- Fix TopDocs API usage (add .order_by_score())
- Remove duplicate imports
- Remove unused variable assignments
@@ -467,7 +467,6 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
         reader: &SegmentReader,
         callback: &mut dyn FnMut(DocId, Score),
     ) -> crate::Result<()> {
-        let num_docs = reader.num_docs();
         let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
         match scorer {
             SpecializedScorer::TermUnion(term_scorers) => {
@@ -490,7 +489,6 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
         reader: &SegmentReader,
         callback: &mut dyn FnMut(&[DocId]),
     ) -> crate::Result<()> {
-        let num_docs = reader.num_docs();
         let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?;
         let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];
 
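Both hunks above delete the same dead `let num_docs = reader.num_docs();` binding from the scoring and block-collection paths of `BooleanWeight`. For orientation, here is a minimal sketch of the loop such a method reduces to once its scorer is built, written as if inside the tantivy crate and assuming tantivy's `DocSet` convention that a freshly built scorer is already positioned on its first match; `for_each_sketch` is a hypothetical helper, not the method in the diff.

use crate::docset::{DocSet, TERMINATED};
use crate::query::Scorer;
use crate::{DocId, Score};

/// Hypothetical helper: drive `scorer` to exhaustion, handing every
/// (doc, score) pair to `callback`. A fresh DocSet already sits on its
/// first match, so we read before we advance.
fn for_each_sketch<S: Scorer>(scorer: &mut S, callback: &mut dyn FnMut(DocId, Score)) {
    let mut doc = scorer.doc();
    while doc != TERMINATED {
        callback(doc, scorer.score());
        doc = scorer.advance();
    }
}

The `SpecializedScorer::TermUnion` arm visible in the first hunk presumably exists so that unions made purely of term scorers can take a faster bulk path instead of this generic per-document loop.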
@@ -817,7 +817,6 @@ mod proptest_boolean_query {
     fn proptest_boolean_query() {
         // In the presence of optimizations around buffering, it can take large numbers of
        // documents to uncover some issues.
-        let num_docs = 10000;
         let num_fields = 8;
         let num_docs = 1 << num_fields;
         let (index, fields, range_field) =
@@ -840,135 +839,3 @@ mod proptest_boolean_query {
         });
     }
 }
-
-/// A proptest which generates arbitrary permutations of a simple boolean AST, and then matches
-/// the result against an index which contains all permutations of documents with N fields.
-#[cfg(test)]
-mod proptest_boolean_query {
-    use std::collections::{BTreeMap, HashSet};
-
-    use proptest::collection::vec;
-    use proptest::prelude::*;
-
-    use crate::collector::DocSetCollector;
-    use crate::query::{BooleanQuery, Occur, Query, TermQuery};
-    use crate::schema::{Field, OwnedValue, Schema, TEXT};
-    use crate::{DocId, Index, Term};
-
-    #[derive(Debug, Clone)]
-    enum BooleanQueryAST {
-        Leaf { field_idx: usize },
-        Union(Vec<BooleanQueryAST>),
-        Intersection(Vec<BooleanQueryAST>),
-    }
-
-    impl BooleanQueryAST {
-        fn matches(&self, doc_id: DocId) -> bool {
-            match self {
-                BooleanQueryAST::Leaf { field_idx } => Self::matches_field(doc_id, *field_idx),
-                BooleanQueryAST::Union(children) => {
-                    children.iter().any(|child| child.matches(doc_id))
-                }
-                BooleanQueryAST::Intersection(children) => {
-                    children.iter().all(|child| child.matches(doc_id))
-                }
-            }
-        }
-
-        fn matches_field(doc_id: DocId, field_idx: usize) -> bool {
-            ((doc_id as usize) >> field_idx) & 1 == 1
-        }
-
-        fn to_query(&self, fields: &[Field]) -> Box<dyn Query> {
-            match self {
-                BooleanQueryAST::Leaf { field_idx } => Box::new(TermQuery::new(
-                    Term::from_field_text(fields[*field_idx], "true"),
-                    crate::schema::IndexRecordOption::Basic,
-                )),
-                BooleanQueryAST::Union(children) => {
-                    let sub_queries = children
-                        .iter()
-                        .map(|child| (Occur::Should, child.to_query(fields)))
-                        .collect();
-                    Box::new(BooleanQuery::new(sub_queries))
-                }
-                BooleanQueryAST::Intersection(children) => {
-                    let sub_queries = children
-                        .iter()
-                        .map(|child| (Occur::Must, child.to_query(fields)))
-                        .collect();
-                    Box::new(BooleanQuery::new(sub_queries))
-                }
-            }
-        }
-    }
-
-    fn doc_ids(num_docs: usize, num_fields: usize) -> impl Iterator<Item = DocId> {
-        let permutations = 1 << num_fields;
-        let copies = (num_docs as f32 / permutations as f32).ceil() as u32;
-        (0..(permutations * copies)).into_iter()
-    }
-
-    fn create_index_with_boolean_permutations(
-        num_docs: usize,
-        num_fields: usize,
-    ) -> (Index, Vec<Field>) {
-        let mut schema_builder = Schema::builder();
-        let fields: Vec<Field> = (0..num_fields)
-            .map(|i| schema_builder.add_text_field(&format!("field_{}", i), TEXT))
-            .collect();
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        let mut writer = index.writer_for_tests().unwrap();
-
-        for doc_id in doc_ids(num_docs, num_fields) {
-            let mut doc: BTreeMap<_, OwnedValue> = BTreeMap::default();
-            for (field_idx, &field) in fields.iter().enumerate() {
-                if BooleanQueryAST::matches_field(doc_id, field_idx) {
-                    doc.insert(field, "true".into());
-                }
-            }
-            writer.add_document(doc).unwrap();
-        }
-        writer.commit().unwrap();
-        (index, fields)
-    }
-
-    fn arb_boolean_query_ast(num_fields: usize) -> impl Strategy<Value = BooleanQueryAST> {
-        let leaf = (0..num_fields).prop_map(|field_idx| BooleanQueryAST::Leaf { field_idx });
-        leaf.prop_recursive(
-            8,   // 8 levels of recursion
-            256, // 256 nodes max
-            10,  // 10 items per collection
-            |inner| {
-                prop_oneof![
-                    vec(inner.clone(), 1..10).prop_map(BooleanQueryAST::Union),
-                    vec(inner, 1..10).prop_map(BooleanQueryAST::Intersection),
-                ]
-            },
-        )
-    }
-
-    #[test]
-    fn proptest_boolean_query() {
-        let num_docs = 10000;
-        let num_fields = 8;
-        let (index, fields) = create_index_with_boolean_permutations(num_docs, num_fields);
-        let searcher = index.reader().unwrap().searcher();
-        proptest!(|(ast in arb_boolean_query_ast(num_fields))| {
-            let query = ast.to_query(&fields);
-
-            let mut matching_docs = HashSet::new();
-            for doc_id in doc_ids(num_docs, num_fields) {
-                if ast.matches(doc_id as DocId) {
-                    matching_docs.insert(doc_id as DocId);
-                }
-            }
-
-            let doc_addresses = searcher.search(&*query, &DocSetCollector).unwrap();
-            let result_docs: HashSet<DocId> =
-                doc_addresses.into_iter().map(|doc_address| doc_address.doc_id).collect();
-            prop_assert_eq!(result_docs, matching_docs);
-        });
-    }
-}
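The block removed above is the rebase-duplicated copy of `mod proptest_boolean_query`; the surviving copy (its tail is visible in the `@@ -817` hunk, with an extra `range_field`) keeps the same design. That design is compact: bit `i` of a doc id decides whether `field_i` holds the token "true", so `num_fields = 8` yields `1 << 8 = 256` distinct documents, and `doc_ids` emits `ceil(10000 / 256) = 40` copies of each permutation. A self-contained sketch of just that bit test, lifted from the deleted module:

// Bit `field_idx` of the doc id decides membership in field `field_idx`.
fn matches_field(doc_id: u32, field_idx: usize) -> bool {
    ((doc_id as usize) >> field_idx) & 1 == 1
}

fn main() {
    // Doc 5 = 0b101: field_0 and field_2 hold "true", field_1 does not.
    assert!(matches_field(5, 0));
    assert!(!matches_field(5, 1));
    assert!(matches_field(5, 2));
}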
@@ -1,6 +1,5 @@
-use super::size_hint::estimate_intersection;
 use crate::docset::{DocSet, TERMINATED};
 use crate::query::size_hint::estimate_intersection;
 use crate::query::term_query::TermScorer;
 use crate::query::{EmptyScorer, Scorer};
 use crate::{DocId, Score};
@@ -87,8 +87,9 @@ mod tests {
         let query = QueryParser::for_index(&index, vec![text_field])
             .parse_query(term)
             .unwrap();
-        let top_docs: Vec<(f32, DocAddress)> =
-            searcher.search(&query, &TopDocs::with_limit(10)).unwrap();
+        let top_docs: Vec<(f32, DocAddress)> = searcher
+            .search(&query, &TopDocs::with_limit(10).order_by_score())
+            .unwrap();
 
         top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
     };
@@ -126,8 +127,9 @@ mod tests {
         let query = QueryParser::for_index(&index, vec![text_field])
             .parse_query(term)
             .unwrap();
-        let top_docs: Vec<(f32, DocAddress)> =
-            searcher.search(&query, &TopDocs::with_limit(10)).unwrap();
+        let top_docs: Vec<(f32, DocAddress)> = searcher
+            .search(&query, &TopDocs::with_limit(10).order_by_score())
+            .unwrap();
 
         top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
     };
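The two `mod tests` hunks above are the TopDocs API fix from the commit message: on this branch, `TopDocs::with_limit(n)` evidently needs an explicit `.order_by_score()` before it acts as a score-sorted collector, whereas in released tantivy `with_limit` alone already collects by descending score. A minimal end-to-end sketch under that assumption (`.order_by_score()` is taken from the diff, not from a released API):

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, DocAddress, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(text_field => "hello happy tax payer"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![text_field]).parse_query("happy")?;

    // `.order_by_score()` as required by this branch; drop it on released tantivy.
    let top_docs: Vec<(f32, DocAddress)> =
        searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
    for (score, addr) in top_docs {
        println!("{score} {addr:?}");
    }
    Ok(())
}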
@@ -560,15 +560,6 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
         // 10 * self.num_terms.
         self.intersection_docset.size_hint() as u64 * 10 * self.num_terms as u64
     }
-
-    /// Returns a best-effort hint of the
-    /// cost to drive the docset.
-    fn cost(&self) -> u64 {
-        // Evaluating phrase matches is generally more expensive than simple term matches,
-        // as it requires loading and comparing positions. Use a conservative multiplier
-        // based on the number of terms.
-        self.intersection_docset.size_hint() as u64 * 10 * self.num_terms as u64
-    }
 }
 
 impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
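The `cost` kept above encodes the heuristic its comments state: checking positions makes a phrase candidate roughly ten times dearer than a plain term match, scaled by phrase length. A worked instance of the formula, with hypothetical numbers:

/// The heuristic kept above: intersection size_hint x 10 x number of terms.
fn phrase_cost(intersection_size_hint: u64, num_terms: u64) -> u64 {
    intersection_size_hint * 10 * num_terms
}

fn main() {
    // An intersection estimated at 1_000 docs for a 3-term phrase:
    // 1_000 * 10 * 3 = 30_000 units of work.
    assert_eq!(phrase_cost(1_000, 3), 30_000);
}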
@@ -205,14 +205,6 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
         // Ideally this would take the fast field codec into account
         (self.column.num_docs() as f64 * 0.8) as u64
     }
-
-    /// Returns a best-effort hint of the
-    /// cost to drive the docset.
-    fn cost(&self) -> u64 {
-        // Advancing the docset is relatively expensive since it scans the column.
-        // Keep cost relative to a term query driver; use num_docs as baseline.
-        self.column.num_docs() as u64
-    }
 }
 
 #[cfg(test)]
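This final hunk removes the second `cost` body the rebase left in `RangeDocSet`. The surviving pair of methods also illustrates the `size_hint`/`cost` split the doc comments describe: `size_hint` estimates how many documents will match (80% of the column here, absent codec information), while `cost` estimates the work needed to drive the docset, which for a column scan is every document. A sketch of that relationship, with the 0.8 factor taken straight from the hunk:

/// Estimated match count: the hunk assumes 80% of the column matches,
/// since the estimate does not yet consult the fast field codec.
fn range_size_hint(num_docs: u64) -> u64 {
    (num_docs as f64 * 0.8) as u64
}

/// Estimated driving cost: a range docset scans the whole column.
fn range_cost(num_docs: u64) -> u64 {
    num_docs
}

fn main() {
    let num_docs = 1_000_000;
    assert_eq!(range_size_hint(num_docs), 800_000);
    assert_eq!(range_cost(num_docs), 1_000_000); // cost >= size_hint: every doc is scanned
}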