From 6e4b61154f501b0fee16b61042d4f45dbf4a96ec Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Thu, 3 Jun 2021 22:33:20 +0900
Subject: [PATCH] Issue/1070 (#1071)

Add a boolean flag to Query::query_terms indicating whether position
information is required for each term.

Closes #1070
---
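Usage note (not part of the commit itself): with this patch, `query_terms`
fills a `BTreeMap<Term, bool>` instead of a `BTreeSet<Term>`. A minimal
sketch of a caller, assuming tantivy with this patch applied; the helper
name `positions_needed` is illustrative only:

    use std::collections::BTreeMap;
    use tantivy::query::Query;
    use tantivy::Term;

    /// Collects the terms of `query`, each mapped to a boolean telling
    /// whether the query needs positional information for that term
    /// (true for phrase terms, false for plain term queries).
    fn positions_needed(query: &dyn Query) -> BTreeMap<Term, bool> {
        let mut terms: BTreeMap<Term, bool> = BTreeMap::new();
        query.query_terms(&mut terms);
        terms
    }
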
 CHANGELOG.md                               | 12 ++++---
 src/indexer/merger.rs                      | 42 +++++++++++-----------
 src/query/boolean_query/boolean_query.rs   |  6 ++--
 src/query/boost_query.rs                   |  6 ++--
 src/query/mod.rs                           | 42 +++++++++++-----------
 src/query/more_like_this/more_like_this.rs | 25 ++++++-------
 src/query/phrase_query/phrase_query.rs     |  9 ++---
 src/query/query.rs                         | 11 +++---
 src/query/term_query/term_query.rs         |  6 ++--
 src/snippet/mod.rs                         |  5 ++-
 10 files changed, 83 insertions(+), 81 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 30f495beb..56cf0f1ad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,12 +13,14 @@ Tantivy 0.15.0
 - Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
 - Added support for more-like-this query in tantivy (@evanxg852000) #1011
 - Added support for sorting an index, e.g presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations)(@PSeitz). #1026
-- Add iterator over documents in doc store (@PSeitz). #1044
-- Fix log merge policy (@PSeitz). #1043
-- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
-- Make doc store compression dynamic (@PSeitz). #1060
-- Switch to json for footer version handling (@PSeitz). #1060
+- Add iterator over documents in doc store (@PSeitz). #1044
+- Fix log merge policy (@PSeitz). #1043
+- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
+- Make doc store compression dynamic (@PSeitz). #1060
+- Switch to json for footer version handling (@PSeitz). #1060
 - Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
+- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070
+
 
 Tantivy 0.14.0
 =========================
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 04a656afc..3bcc07a05 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -412,9 +412,9 @@ impl IndexMerger {
         Ok(everything_is_in_order)
     }
 
-    pub(crate) fn get_sort_field_accessor<'b>(
+    pub(crate) fn get_sort_field_accessor(
         reader: &SegmentReader,
-        sort_by_field: &'b IndexSortByField,
+        sort_by_field: &IndexSortByField,
     ) -> crate::Result> {
         let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
         let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
@@ -456,22 +456,22 @@ impl IndexMerger {
         // Loading the field accessor on demand causes a 15x regression
         // create iterators over segment/sort_accessor/doc_id tuple
-        let doc_id_reader_pair = reader_and_field_accessors
-            .iter()
-            .map(|reader_and_field_accessor| {
-                reader_and_field_accessor
-                    .0
-                    .reader
-                    .doc_ids_alive()
-                    .map(move |doc_id| {
-                        (
-                            doc_id,
-                            reader_and_field_accessor.0,
-                            &reader_and_field_accessor.1,
-                        )
-                    })
-            })
-            .collect::<Vec<_>>();
+        let doc_id_reader_pair =
+            reader_and_field_accessors
+                .iter()
+                .map(|reader_and_field_accessor| {
+                    reader_and_field_accessor
+                        .0
+                        .reader
+                        .doc_ids_alive()
+                        .map(move |doc_id| {
+                            (
+                                doc_id,
+                                reader_and_field_accessor.0,
+                                &reader_and_field_accessor.1,
+                            )
+                        })
+                });
 
         // create iterator tuple of (old doc_id, reader) in order of the new doc_ids
         let sorted_doc_ids: Vec<(DocId, SegmentReaderWithOrdinal)> = doc_id_reader_pair
@@ -1017,12 +1017,12 @@ impl IndexMerger {
             if reader.num_deleted_docs() > 0
                 // If there is not enough data in the store, we avoid stacking in order to
                 // avoid creating many small blocks in the doc store. Once we have 5 full blocks,
-                // we start stacking. In the worst case 2/7 of the blocks would be very small.
-                // [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
+                // we start stacking. In the worst case 2/7 of the blocks would be very small.
+                // [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
                 // => 5 * full blocks, 2 * 1 document blocks
                 //
                 // In a more realistic scenario the segments are of the same size, so 1/6 of
-                // the doc stores would be on average half full, given total randomness (which
+                // the doc stores would be on average half full, given total randomness (which
                 // is not the case here, but not sure how it behaves exactly).
                 //
                 // https://github.com/tantivy-search/tantivy/issues/1053
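Side note on the doc-store comment above: the condition itself is not visible
in this hunk. The heuristic it describes could be sketched roughly as below;
the function and parameter names are hypothetical, not tantivy's actual code:

    /// Only stack an incoming doc store once it already holds about five full
    /// blocks, so that at most 2 of the ~7 resulting blocks can be undersized.
    fn worth_stacking(num_bytes_in_store: u64, full_block_size: u64) -> bool {
        num_bytes_in_store >= 5 * full_block_size
    }
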
diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs
index 95f167f6d..513c81027 100644
--- a/src/query/boolean_query/boolean_query.rs
+++ b/src/query/boolean_query/boolean_query.rs
@@ -6,7 +6,7 @@ use crate::query::Weight;
 use crate::schema::IndexRecordOption;
 use crate::schema::Term;
 use crate::Searcher;
-use std::collections::BTreeSet;
+use std::collections::BTreeMap;
 
 /// The boolean query returns a set of documents
 /// that matches the Boolean combination of constituent subqueries.
@@ -159,9 +159,9 @@ impl Query for BooleanQuery {
         Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
     }
 
-    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
+    fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
         for (_occur, subquery) in &self.subqueries {
-            subquery.query_terms(term_set);
+            subquery.query_terms(terms);
         }
     }
 }
diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs
index ffa19c7ea..41c94e0f8 100644
--- a/src/query/boost_query.rs
+++ b/src/query/boost_query.rs
@@ -2,7 +2,7 @@ use crate::fastfield::DeleteBitSet;
 use crate::query::explanation::does_not_match;
 use crate::query::{Explanation, Query, Scorer, Weight};
 use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
-use std::collections::BTreeSet;
+use std::collections::BTreeMap;
 use std::fmt;
 
 /// `BoostQuery` is a wrapper over a query used to boost its score.
@@ -48,8 +48,8 @@ impl Query for BoostQuery {
         Ok(boosted_weight)
     }
 
-    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
-        self.query.query_terms(term_set)
+    fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
+        self.query.query_terms(terms)
     }
 }
diff --git a/src/query/mod.rs b/src/query/mod.rs
index 502820934..c43ccb87d 100644
--- a/src/query/mod.rs
+++ b/src/query/mod.rs
@@ -66,7 +66,7 @@ mod tests {
     use crate::schema::{Schema, TEXT};
     use crate::Index;
     use crate::Term;
-    use std::collections::BTreeSet;
+    use std::collections::BTreeMap;
 
     #[test]
     fn test_query_terms() {
@@ -78,49 +78,49 @@ mod tests {
         let term_a = Term::from_field_text(text_field, "a");
         let term_b = Term::from_field_text(text_field, "b");
         {
-            let mut terms_set: BTreeSet<Term> = BTreeSet::new();
+            let mut terms: BTreeMap<Term, bool> = Default::default();
             query_parser
                 .parse_query("a")
                 .unwrap()
-                .query_terms(&mut terms_set);
-            let terms: Vec<&Term> = terms_set.iter().collect();
-            assert_eq!(vec![&term_a], terms);
+                .query_terms(&mut terms);
+            let terms: Vec<(&Term, &bool)> = terms.iter().collect();
+            assert_eq!(vec![(&term_a, &false)], terms);
         }
         {
-            let mut terms_set: BTreeSet<Term> = BTreeSet::new();
+            let mut terms: BTreeMap<Term, bool> = Default::default();
             query_parser
                 .parse_query("a b")
                 .unwrap()
-                .query_terms(&mut terms_set);
-            let terms: Vec<&Term> = terms_set.iter().collect();
-            assert_eq!(vec![&term_a, &term_b], terms);
+                .query_terms(&mut terms);
+            let terms: Vec<(&Term, &bool)> = terms.iter().collect();
+            assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
         }
         {
-            let mut terms_set: BTreeSet<Term> = BTreeSet::new();
+            let mut terms: BTreeMap<Term, bool> = Default::default();
             query_parser
                 .parse_query("\"a b\"")
                 .unwrap()
-                .query_terms(&mut terms_set);
-            let terms: Vec<&Term> = terms_set.iter().collect();
-            assert_eq!(vec![&term_a, &term_b], terms);
+                .query_terms(&mut terms);
+            let terms: Vec<(&Term, &bool)> = terms.iter().collect();
+            assert_eq!(vec![(&term_a, &true), (&term_b, &true)], terms);
         }
         {
-            let mut terms_set: BTreeSet<Term> = BTreeSet::new();
+            let mut terms: BTreeMap<Term, bool> = Default::default();
             query_parser
                 .parse_query("a a a a a")
                 .unwrap()
-                .query_terms(&mut terms_set);
-            let terms: Vec<&Term> = terms_set.iter().collect();
-            assert_eq!(vec![&term_a], terms);
+                .query_terms(&mut terms);
+            let terms: Vec<(&Term, &bool)> = terms.iter().collect();
+            assert_eq!(vec![(&term_a, &false)], terms);
        }
        {
-            let mut terms_set: BTreeSet<Term> = BTreeSet::new();
+            let mut terms: BTreeMap<Term, bool> = Default::default();
             query_parser
                 .parse_query("a -b")
                 .unwrap()
-                .query_terms(&mut terms_set);
-            let terms: Vec<&Term> = terms_set.iter().collect();
-            assert_eq!(vec![&term_a, &term_b], terms);
+                .query_terms(&mut terms);
+            let terms: Vec<(&Term, &bool)> = terms.iter().collect();
+            assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
         }
     }
 }
diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs
index 477f4da0b..fe5636646 100644
--- a/src/query/more_like_this/more_like_this.rs
+++ b/src/query/more_like_this/more_like_this.rs
@@ -233,10 +233,9 @@ impl MoreLikeThis {
             }
             FieldType::U64(_) => {
                 for field_value in field_values {
-                    let val = field_value
-                        .value()
-                        .u64_value()
-                        .ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
+                    let val = field_value.value().u64_value().ok_or_else(|| {
+                        TantivyError::InvalidArgument("invalid value".to_string())
+                    })?;
                     if !self.is_noise_word(val.to_string()) {
                         let term = Term::from_field_u64(field, val);
                         *term_frequencies.entry(term).or_insert(0) += 1;
@@ -249,7 +248,7 @@ impl MoreLikeThis {
                     let val = field_value
                         .value()
                         .date_value()
-                        .ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?
+                        .ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
                         .timestamp();
                     if !self.is_noise_word(val.to_string()) {
                         let term = Term::from_field_i64(field, val);
@@ -259,10 +258,9 @@ impl MoreLikeThis {
             }
             FieldType::I64(_) => {
                 for field_value in field_values {
-                    let val = field_value
-                        .value()
-                        .i64_value()
-                        .ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
+                    let val = field_value.value().i64_value().ok_or_else(|| {
+                        TantivyError::InvalidArgument("invalid value".to_string())
+                    })?;
                     if !self.is_noise_word(val.to_string()) {
                         let term = Term::from_field_i64(field, val);
                         *term_frequencies.entry(term).or_insert(0) += 1;
@@ -271,10 +269,9 @@ impl MoreLikeThis {
             }
             FieldType::F64(_) => {
                 for field_value in field_values {
-                    let val = field_value
-                        .value()
-                        .f64_value()
-                        .ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
+                    let val = field_value.value().f64_value().ok_or_else(|| {
+                        TantivyError::InvalidArgument("invalid value".to_string())
+                    })?;
                     if !self.is_noise_word(val.to_string()) {
                         let term = Term::from_field_f64(field, val);
                         *term_frequencies.entry(term).or_insert(0) += 1;
@@ -306,7 +303,7 @@ impl MoreLikeThis {
         {
             return true;
         }
-        return self.stop_words.contains(&word);
+        self.stop_words.contains(&word)
     }
 
     /// Couputes the score for each term while ignoring not useful terms
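For context on the `ok_or` to `ok_or_else` changes above: `Option::ok_or`
builds its error value eagerly even when the value is present, while
`ok_or_else` only invokes the closure on `None`, so the error `String` is no
longer allocated on the happy path. A minimal, stand-alone illustration (not
taken from the patch):

    /// Returns the first character, allocating the error message only when
    /// the input is actually empty.
    fn first_char(s: &str) -> Result<char, String> {
        s.chars()
            .next()
            .ok_or_else(|| format!("empty input: {:?}", s))
    }
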
diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs
index b1e6c8344..4580a9f1f 100644
--- a/src/query/phrase_query/phrase_query.rs
+++ b/src/query/phrase_query/phrase_query.rs
@@ -1,3 +1,5 @@
+use std::collections::BTreeMap;
+
 use super::PhraseWeight;
 use crate::core::searcher::Searcher;
 use crate::query::bm25::Bm25Weight;
@@ -5,7 +7,6 @@ use crate::query::Query;
 use crate::query::Weight;
 use crate::schema::IndexRecordOption;
 use crate::schema::{Field, Term};
-use std::collections::BTreeSet;
 
 /// `PhraseQuery` matches a specific sequence of words.
 ///
@@ -113,9 +114,9 @@ impl Query for PhraseQuery {
         Ok(Box::new(phrase_weight))
     }
 
-    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
-        for (_, query_term) in &self.phrase_terms {
-            term_set.insert(query_term.clone());
+    fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
+        for (_, term) in &self.phrase_terms {
+            terms.insert(term.clone(), true);
         }
     }
 }
diff --git a/src/query/query.rs b/src/query/query.rs
index 6bd63f094..03451be75 100644
--- a/src/query/query.rs
+++ b/src/query/query.rs
@@ -4,7 +4,7 @@ use crate::query::Explanation;
 use crate::DocAddress;
 use crate::Term;
 use downcast_rs::impl_downcast;
-use std::collections::BTreeSet;
+use std::collections::BTreeMap;
 use std::fmt;
 
 /// The `Query` trait defines a set of documents and a scoring method
@@ -68,7 +68,10 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
 
     /// Extract all of the terms associated to the query and insert them in the
     /// term set given in arguments.
-    fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
+    ///
+    /// Each term is associated with a boolean indicating whether
+    /// positions are required or not.
+    fn query_terms(&self, _term_set: &mut BTreeMap<Term, bool>) {}
 }
 
 /// Implements `box_clone`.
@@ -95,8 +98,8 @@ impl Query for Box {
         self.as_ref().count(searcher)
     }
 
-    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
-        self.as_ref().query_terms(term_set);
+    fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
+        self.as_ref().query_terms(terms);
     }
 }
diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs
index 6c064806e..b9e38959f 100644
--- a/src/query/term_query/term_query.rs
+++ b/src/query/term_query/term_query.rs
@@ -5,7 +5,7 @@ use crate::query::{Explanation, Query};
 use crate::schema::IndexRecordOption;
 use crate::Searcher;
 use crate::Term;
-use std::collections::BTreeSet;
+use std::collections::BTreeMap;
 use std::fmt;
 
 /// A Term query matches all of the documents
@@ -127,7 +127,7 @@ impl Query for TermQuery {
             self.specialized_weight(searcher, scoring_enabled)?,
         ))
     }
-    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
-        term_set.insert(self.term.clone());
+    fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
+        terms.insert(self.term.clone(), false);
     }
 }
diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index a2df79488..708cc82ae 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -7,7 +7,6 @@ use crate::{Document, Score};
 use htmlescape::encode_minimal;
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
-use std::collections::BTreeSet;
 use std::ops::Range;
 
 const DEFAULT_MAX_NUM_CHARS: usize = 150;
@@ -239,10 +238,10 @@ impl SnippetGenerator {
         query: &dyn Query,
         field: Field,
     ) -> crate::Result<SnippetGenerator> {
-        let mut terms = BTreeSet::new();
+        let mut terms = BTreeMap::new();
         query.query_terms(&mut terms);
         let mut terms_text: BTreeMap<String, Score> = Default::default();
-        for term in terms {
+        for (term, _) in terms {
             if term.field() != field {
                 continue;
             }
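Taken together, the convention introduced by this patch is: PhraseQuery
inserts its terms with `true` (positions required), TermQuery with `false`,
and wrapper queries (BooleanQuery, BoostQuery, boxed queries) delegate to
their children. Consumers that do not care about positions, like
SnippetGenerator above, simply ignore the flag. An illustrative sketch of a
consumer that does use it; the helper name is not part of tantivy:

    use std::collections::BTreeMap;
    use tantivy::Term;

    /// Returns the subset of extracted terms for which the query requires
    /// positional (phrase) information.
    fn terms_requiring_positions(terms: &BTreeMap<Term, bool>) -> Vec<Term> {
        terms
            .iter()
            .filter(|(_term, requires_positions)| **requires_positions)
            .map(|(term, _)| term.clone())
            .collect()
    }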