diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 96ce36994..c46e9b0b1 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -467,7 +467,6 @@ impl Weight for BooleanWeight crate::Result<()> { - let num_docs = reader.num_docs(); let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; match scorer { SpecializedScorer::TermUnion(term_scorers) => { @@ -490,7 +489,6 @@ impl Weight for BooleanWeight crate::Result<()> { - let num_docs = reader.num_docs(); let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?; let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 87899c166..a78cbe8e9 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -817,7 +817,6 @@ mod proptest_boolean_query { fn proptest_boolean_query() { // In the presence of optimizations around buffering, it can take large numbers of // documents to uncover some issues. - let num_docs = 10000; let num_fields = 8; let num_docs = 1 << num_fields; let (index, fields, range_field) = @@ -840,135 +839,3 @@ mod proptest_boolean_query { }); } } - -/// A proptest which generates arbitrary permutations of a simple boolean AST, and then matches -/// the result against an index which contains all permutations of documents with N fields. -#[cfg(test)] -mod proptest_boolean_query { - use std::collections::{BTreeMap, HashSet}; - - use proptest::collection::vec; - use proptest::prelude::*; - - use crate::collector::DocSetCollector; - use crate::query::{BooleanQuery, Occur, Query, TermQuery}; - use crate::schema::{Field, OwnedValue, Schema, TEXT}; - use crate::{DocId, Index, Term}; - - #[derive(Debug, Clone)] - enum BooleanQueryAST { - Leaf { field_idx: usize }, - Union(Vec), - Intersection(Vec), - } - - impl BooleanQueryAST { - fn matches(&self, doc_id: DocId) -> bool { - match self { - BooleanQueryAST::Leaf { field_idx } => Self::matches_field(doc_id, *field_idx), - BooleanQueryAST::Union(children) => { - children.iter().any(|child| child.matches(doc_id)) - } - BooleanQueryAST::Intersection(children) => { - children.iter().all(|child| child.matches(doc_id)) - } - } - } - - fn matches_field(doc_id: DocId, field_idx: usize) -> bool { - ((doc_id as usize) >> field_idx) & 1 == 1 - } - - fn to_query(&self, fields: &[Field]) -> Box { - match self { - BooleanQueryAST::Leaf { field_idx } => Box::new(TermQuery::new( - Term::from_field_text(fields[*field_idx], "true"), - crate::schema::IndexRecordOption::Basic, - )), - BooleanQueryAST::Union(children) => { - let sub_queries = children - .iter() - .map(|child| (Occur::Should, child.to_query(fields))) - .collect(); - Box::new(BooleanQuery::new(sub_queries)) - } - BooleanQueryAST::Intersection(children) => { - let sub_queries = children - .iter() - .map(|child| (Occur::Must, child.to_query(fields))) - .collect(); - Box::new(BooleanQuery::new(sub_queries)) - } - } - } - } - - fn doc_ids(num_docs: usize, num_fields: usize) -> impl Iterator { - let permutations = 1 << num_fields; - let copies = (num_docs as f32 / permutations as f32).ceil() as u32; - (0..(permutations * copies)).into_iter() - } - - fn create_index_with_boolean_permutations( - num_docs: usize, - num_fields: usize, - ) -> (Index, Vec) { - let mut schema_builder = Schema::builder(); - let fields: Vec = (0..num_fields) - .map(|i| schema_builder.add_text_field(&format!("field_{}", i), TEXT)) - .collect(); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut writer = index.writer_for_tests().unwrap(); - - for doc_id in doc_ids(num_docs, num_fields) { - let mut doc: BTreeMap<_, OwnedValue> = BTreeMap::default(); - for (field_idx, &field) in fields.iter().enumerate() { - if BooleanQueryAST::matches_field(doc_id, field_idx) { - doc.insert(field, "true".into()); - } - } - writer.add_document(doc).unwrap(); - } - writer.commit().unwrap(); - (index, fields) - } - - fn arb_boolean_query_ast(num_fields: usize) -> impl Strategy { - let leaf = (0..num_fields).prop_map(|field_idx| BooleanQueryAST::Leaf { field_idx }); - leaf.prop_recursive( - 8, // 8 levels of recursion - 256, // 256 nodes max - 10, // 10 items per collection - |inner| { - prop_oneof![ - vec(inner.clone(), 1..10).prop_map(BooleanQueryAST::Union), - vec(inner, 1..10).prop_map(BooleanQueryAST::Intersection), - ] - }, - ) - } - - #[test] - fn proptest_boolean_query() { - let num_docs = 10000; - let num_fields = 8; - let (index, fields) = create_index_with_boolean_permutations(num_docs, num_fields); - let searcher = index.reader().unwrap().searcher(); - proptest!(|(ast in arb_boolean_query_ast(num_fields))| { - let query = ast.to_query(&fields); - - let mut matching_docs = HashSet::new(); - for doc_id in doc_ids(num_docs, num_fields) { - if ast.matches(doc_id as DocId) { - matching_docs.insert(doc_id as DocId); - } - } - - let doc_addresses = searcher.search(&*query, &DocSetCollector).unwrap(); - let result_docs: HashSet = - doc_addresses.into_iter().map(|doc_address| doc_address.doc_id).collect(); - prop_assert_eq!(result_docs, matching_docs); - }); - } -} diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 78c03163d..3e8677d98 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -1,6 +1,5 @@ use super::size_hint::estimate_intersection; use crate::docset::{DocSet, TERMINATED}; -use crate::query::size_hint::estimate_intersection; use crate::query::term_query::TermScorer; use crate::query::{EmptyScorer, Scorer}; use crate::{DocId, Score}; diff --git a/src/query/mod.rs b/src/query/mod.rs index 478f26efb..0bc865921 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -87,8 +87,9 @@ mod tests { let query = QueryParser::for_index(&index, vec![text_field]) .parse_query(term) .unwrap(); - let top_docs: Vec<(f32, DocAddress)> = - searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + let top_docs: Vec<(f32, DocAddress)> = searcher + .search(&query, &TopDocs::with_limit(10).order_by_score()) + .unwrap(); top_docs.iter().map(|el| el.1.doc_id).collect::>() }; @@ -126,8 +127,9 @@ mod tests { let query = QueryParser::for_index(&index, vec![text_field]) .parse_query(term) .unwrap(); - let top_docs: Vec<(f32, DocAddress)> = - searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + let top_docs: Vec<(f32, DocAddress)> = searcher + .search(&query, &TopDocs::with_limit(10).order_by_score()) + .unwrap(); top_docs.iter().map(|el| el.1.doc_id).collect::>() }; diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 886acf489..4f8541cd2 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -560,15 +560,6 @@ impl DocSet for PhraseScorer { // 10 * self.num_terms. self.intersection_docset.size_hint() as u64 * 10 * self.num_terms as u64 } - - /// Returns a best-effort hint of the - /// cost to drive the docset. - fn cost(&self) -> u64 { - // Evaluating phrase matches is generally more expensive than simple term matches, - // as it requires loading and comparing positions. Use a conservative multiplier - // based on the number of terms. - self.intersection_docset.size_hint() as u64 * 10 * self.num_terms as u64 - } } impl Scorer for PhraseScorer { diff --git a/src/query/range_query/fast_field_range_doc_set.rs b/src/query/range_query/fast_field_range_doc_set.rs index c736f7b08..24d2b1fe3 100644 --- a/src/query/range_query/fast_field_range_doc_set.rs +++ b/src/query/range_query/fast_field_range_doc_set.rs @@ -205,14 +205,6 @@ impl DocSet for RangeDocSe // Ideally this would take the fast field codec into account (self.column.num_docs() as f64 * 0.8) as u64 } - - /// Returns a best-effort hint of the - /// cost to drive the docset. - fn cost(&self) -> u64 { - // Advancing the docset is relatively expensive since it scans the column. - // Keep cost relative to a term query driver; use num_docs as baseline. - self.column.num_docs() as u64 - } } #[cfg(test)]