diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs
index b384c7a5c..f4f974388 100644
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -173,4 +173,40 @@ mod tests {
         assert_eq!(test_query(vec!["a", "b"]), vec![1]);
         assert_eq!(test_query(vec!["b", "a"]), vec![2]);
     }
+
+    #[test] // motivated by #234
+    pub fn test_phrase_query_non_trivial_offsets() {
+        let mut schema_builder = SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
+            assert!(index_writer.commit().is_ok());
+        }
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        let test_query = |texts: Vec<(usize, &str)>| {
+            let mut test_collector = TestCollector::default();
+            let terms: Vec<(usize, Term)> = texts
+                .iter()
+                .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text)))
+                .collect();
+            let phrase_query = PhraseQuery::new_with_offset(terms);
+            searcher
+                .search(&phrase_query, &mut test_collector)
+                .expect("search should succeed");
+            test_collector.docs()
+        };
+        assert_eq!(test_query(vec![(0, "a"), (1, "b")]), vec![0]);
+        assert_eq!(test_query(vec![(1, "b"), (0, "a")]), vec![0]);
+        assert!(test_query(vec![(0, "a"), (2, "b")]).is_empty());
+        assert_eq!(test_query(vec![(0, "a"), (2, "c")]), vec![0]);
+        assert_eq!(test_query(vec![(0, "a"), (2, "c"), (3, "d")]), vec![0]);
+        assert_eq!(test_query(vec![(0, "a"), (2, "c"), (4, "e")]), vec![0]);
+        assert_eq!(test_query(vec![(4, "e"), (0, "a"), (2, "c")]), vec![0]);
+        assert!(test_query(vec![(0, "a"), (2, "d")]).is_empty());
+        assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]);
+    }
 }
diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs
index 0bb74b912..9cabe8cc4 100644
--- a/src/query/phrase_query/phrase_query.rs
+++ b/src/query/phrase_query/phrase_query.rs
@@ -24,7 +24,7 @@ use Result;
 #[derive(Clone, Debug)]
 pub struct PhraseQuery {
     field: Field,
-    phrase_terms: Vec<Term>,
+    phrase_terms: Vec<(usize, Term)>,
 }
 
 impl PhraseQuery {
@@ -32,14 +32,25 @@ impl PhraseQuery {
     ///
     /// There must be at least two terms, and all terms
     /// must belong to the same field.
+    /// The offset of each term will be the same as its index in the `Vec`.
     pub fn new(terms: Vec<Term>) -> PhraseQuery {
+        let terms_with_offset = terms.into_iter().enumerate().collect();
+        PhraseQuery::new_with_offset(terms_with_offset)
+    }
+
+
+    /// Creates a new `PhraseQuery` given a list of terms and their offsets.
+    ///
+    /// Can be used to provide a custom offset for each term.
+    pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery {
         assert!(
             terms.len() > 1,
             "A phrase query is required to have strictly more than one term."
         );
-        let field = terms[0].field();
+        terms.sort_by_key(|&(offset, _)| offset);
+        let field = terms[0].1.field();
         assert!(
-            terms[1..].iter().all(|term| term.field() == field),
+            terms[1..].iter().all(|term| term.1.field() == field),
             "All terms from a phrase query must belong to the same field"
         );
         PhraseQuery {
@@ -53,10 +64,11 @@ impl PhraseQuery {
         self.field
     }
 
-    /// The `Term`s in the phrase making up this `PhraseQuery`.
-    pub fn phrase_terms(&self) -> &[Term] {
-        &self.phrase_terms[..]
-    }
+    /// The `Term`s in the phrase, without their associated offsets.
+    pub fn phrase_terms(&self) -> Vec<Term> {
+        self.phrase_terms.iter().map(|(_, term)| term.clone()).collect::<Vec<Term>>()
+    }
+
 }
 
 impl Query for PhraseQuery {
@@ -78,13 +90,13 @@ impl Query for PhraseQuery {
                 field_name
             )))
         }
-        let terms = self.phrase_terms.clone();
         if scoring_enabled {
+            let terms = self.phrase_terms();
             let bm25_weight = BM25Weight::for_terms(searcher, &terms);
-            Ok(Box::new(PhraseWeight::new(terms, bm25_weight, true)))
+            Ok(Box::new(PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, true)))
         } else {
             Ok(Box::new(PhraseWeight::new(
-                terms,
+                self.phrase_terms.clone(),
                 BM25Weight::null(),
                 false,
             )))
diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs
index 90590a0b8..6e04291c6 100644
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -119,17 +119,20 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {
 impl<TPostings: Postings> PhraseScorer<TPostings> {
     pub fn new(
-        term_postings: Vec<TPostings>,
+        term_postings: Vec<(usize, TPostings)>,
         similarity_weight: BM25Weight,
         fieldnorm_reader: FieldNormReader,
         score_needed: bool,
     ) -> PhraseScorer<TPostings> {
+        let max_offset = term_postings.iter()
+            .map(|&(offset, _)| offset)
+            .max()
+            .unwrap_or(0);
         let num_docsets = term_postings.len();
         let postings_with_offsets = term_postings
             .into_iter()
-            .enumerate()
             .map(|(offset, postings)| {
-                PostingsWithOffset::new(postings, (num_docsets - offset) as u32)
+                PostingsWithOffset::new(postings, (max_offset - offset) as u32)
             })
             .collect::<Vec<_>>();
 
         PhraseScorer {
diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs
index 9f5a5c243..de8eeb0d2 100644
--- a/src/query/phrase_query/phrase_weight.rs
+++ b/src/query/phrase_query/phrase_weight.rs
@@ -9,7 +9,7 @@ use schema::Term;
 use Result;
 
 pub struct PhraseWeight {
-    phrase_terms: Vec<Term>,
+    phrase_terms: Vec<(usize, Term)>,
     similarity_weight: BM25Weight,
     score_needed: bool,
 }
@@ -17,7 +17,7 @@ pub struct PhraseWeight {
 impl PhraseWeight {
     /// Creates a new phrase weight.
     pub fn new(
-        phrase_terms: Vec<Term>,
+        phrase_terms: Vec<(usize, Term)>,
         similarity_weight: BM25Weight,
         score_needed: bool,
     ) -> PhraseWeight {
@@ -32,16 +32,16 @@ impl Weight for PhraseWeight {
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let similarity_weight = self.similarity_weight.clone();
-        let field = self.phrase_terms[0].field();
+        let field = self.phrase_terms[0].1.field();
         let fieldnorm_reader = reader.get_fieldnorms_reader(field);
         if reader.has_deletes() {
             let mut term_postings_list = Vec::new();
-            for term in &self.phrase_terms {
+            for &(offset, ref term) in &self.phrase_terms {
                 if let Some(postings) = reader
                     .inverted_index(term.field())
-                    .read_postings(term, IndexRecordOption::WithFreqsAndPositions)
+                    .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
                 {
-                    term_postings_list.push(postings);
+                    term_postings_list.push((offset, postings));
                 } else {
                     return Ok(Box::new(EmptyScorer));
                 }
@@ -54,12 +54,12 @@ impl Weight for PhraseWeight {
             )))
         } else {
             let mut term_postings_list = Vec::new();
-            for term in &self.phrase_terms {
+            for &(offset, ref term) in &self.phrase_terms {
                 if let Some(postings) = reader
                     .inverted_index(term.field())
-                    .read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)
+                    .read_postings_no_deletes(&term, IndexRecordOption::WithFreqsAndPositions)
                 {
-                    term_postings_list.push(postings);
+                    term_postings_list.push((offset, postings));
                 } else {
                     return Ok(Box::new(EmptyScorer));
                 }
diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs
index eefb9f7c5..608831578 100644
--- a/src/query/query_parser/logical_ast.rs
+++ b/src/query/query_parser/logical_ast.rs
@@ -8,7 +8,7 @@ use std::ops::Bound;
 #[derive(Clone)]
 pub enum LogicalLiteral {
     Term(Term),
-    Phrase(Vec<Term>),
+    Phrase(Vec<(usize, Term)>),
     Range {
         field: Field,
         value_type: Type,
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index fb3258b17..0b413345b 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -180,7 +180,7 @@ impl QueryParser {
         &self,
         field: Field,
         phrase: &str,
-    ) -> Result<Vec<Term>, QueryParserError> {
+    ) -> Result<Vec<(usize, Term)>, QueryParserError> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
         if !field_type.is_indexed() {
@@ -191,12 +191,12 @@ impl QueryParser {
             FieldType::I64(_) => {
                 let val: i64 = i64::from_str(phrase)?;
                 let term = Term::from_field_i64(field, val);
-                Ok(vec![term])
+                Ok(vec![(0, term)])
             }
             FieldType::U64(_) => {
                 let val: u64 = u64::from_str(phrase)?;
                 let term = Term::from_field_u64(field, val);
-                Ok(vec![term])
+                Ok(vec![(0, term)])
             }
             FieldType::Str(ref str_options) => {
                 if let Some(option) = str_options.get_indexing_options() {
@@ -208,11 +208,11 @@ impl QueryParser {
                            )
                         },
                     )?;
-                    let mut terms: Vec<Term> = Vec::new();
+                    let mut terms: Vec<(usize, Term)> = Vec::new();
                     let mut token_stream = tokenizer.token_stream(phrase);
                     token_stream.process(&mut |token| {
                         let term = Term::from_field_text(field, &token.text);
-                        terms.push(term);
+                        terms.push((token.position, term));
                     });
                     if terms.is_empty() {
                         Ok(vec![])
@@ -242,7 +242,7 @@ impl QueryParser {
                     ))
                 }
            }
-            FieldType::HierarchicalFacet => Ok(vec![Term::from_field_text(field, phrase)]),
+            FieldType::HierarchicalFacet => Ok(vec![(0, Term::from_field_text(field, phrase))]),
            FieldType::Bytes => {
                let field_name = self.schema.get_field_name(field).to_string();
                Err(QueryParserError::FieldNotIndexed(field_name))
@@ -256,12 +256,13 @@ impl QueryParser {
         phrase: &str,
     ) -> Result<Option<LogicalLiteral>, QueryParserError> {
         let terms = self.compute_terms_for_string(field, phrase)?;
-        match terms.len() {
-            0 => Ok(None),
-            1 => Ok(Some(LogicalLiteral::Term(
-                terms.into_iter().next().unwrap(),
-            ))),
-            _ => Ok(Some(LogicalLiteral::Phrase(terms))),
+        match &terms[..] {
+            [] =>
+                Ok(None),
+            [(_, term)] =>
+                Ok(Some(LogicalLiteral::Term(term.clone()))),
+            _ =>
+                Ok(Some(LogicalLiteral::Phrase(terms.clone()))),
         }
     }
 
@@ -281,7 +282,7 @@ impl QueryParser {
         if terms.len() != 1 {
             return Err(QueryParserError::RangeMustNotHavePhrase);
         }
-        let term = terms.into_iter().next().unwrap();
+        let (_, term) = terms.into_iter().next().unwrap();
         match *bound {
             UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
             UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
@@ -423,7 +424,7 @@ fn compose_occur(left: Occur, right: Occur) -> Occur {
 fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
     match logical_literal {
         LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
-        LogicalLiteral::Phrase(terms) => Box::new(PhraseQuery::new(terms)),
+        LogicalLiteral::Phrase(term_with_offsets) => Box::new(PhraseQuery::new_with_offset(term_with_offsets)),
         LogicalLiteral::Range {
             field,
             value_type,
@@ -611,8 +612,8 @@ mod test {
         );
         test_parse_query_to_logical_ast_helper(
             "title:\"a b\"",
-            "\"[Term([0, 0, 0, 0, 97]), \
-             Term([0, 0, 0, 0, 98])]\"",
+            "\"[(0, Term([0, 0, 0, 0, 97])), \
+             (1, Term([0, 0, 0, 0, 98]))]\"",
             false,
         );
         test_parse_query_to_logical_ast_helper(
@@ -757,8 +758,8 @@ mod test {
         );
         test_parse_query_to_logical_ast_helper(
             "title:\"a b\"",
-            "\"[Term([0, 0, 0, 0, 97]), \
-             Term([0, 0, 0, 0, 98])]\"",
+            "\"[(0, Term([0, 0, 0, 0, 97])), \
+             (1, Term([0, 0, 0, 0, 98]))]\"",
             true,
         );
     }
diff --git a/src/tokenizer/japanese_tokenizer.rs b/src/tokenizer/japanese_tokenizer.rs
index 7b4d19ef6..5b072e380 100644
--- a/src/tokenizer/japanese_tokenizer.rs
+++ b/src/tokenizer/japanese_tokenizer.rs
@@ -33,6 +33,7 @@ impl<'a> Tokenizer<'a> for JapaneseTokenizer {
                 offset_to,
                 position: pos,
                 text: term,
+                position_length: 1,
             });
         }
     }
diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs
index 3ec714e37..12a5af82c 100644
--- a/src/tokenizer/raw_tokenizer.rs
+++ b/src/tokenizer/raw_tokenizer.rs
@@ -18,6 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
             offset_to: text.len(),
             position: 0,
             text: text.to_string(),
+            position_length: 1,
         };
         RawTokenStream {
             token,
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index e92a49d06..fcdf8f21b 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -17,6 +17,8 @@ pub struct Token {
     pub position: usize,
     /// Actual text content of the token.
     pub text: String,
+    /// Length of this token, expressed as a number of original token positions.
+    pub position_length: usize,
 }
 
 impl Default for Token {
@@ -26,6 +28,7 @@ impl Default for Token {
             offset_to: 0,
             position: usize::max_value(),
             text: String::with_capacity(200),
+            position_length: 1,
         }
     }
 }
@@ -273,6 +276,7 @@ mod test {
             offset_from: 2,
             offset_to: 3,
             text: "abc".to_string(),
+            position_length: 1,
         };
         let t2 = t1.clone();
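
Usage sketch for reviewers, assuming the schema, `text_field`, and searcher set up in `test_phrase_query_non_trivial_offsets` above: the offsets passed to `PhraseQuery::new_with_offset` are relative token positions, so terms may skip positions and may be supplied in any order. The constructor sorts the pairs by offset, and `PhraseScorer` aligns each postings list by `(max_offset - offset)` before intersecting positions.

    // "a" at offset 0 and "c" at offset 2 match the pattern "a ? c":
    // whatever token sits at offset 1 ("b" in the test document
    // "a b c d e f g h") is skipped, so this matches doc 0,
    // exactly like the assert_eq! cases in the new test.
    let terms = vec![
        (0, Term::from_field_text(text_field, "a")),
        (2, Term::from_field_text(text_field, "c")),
    ];
    let phrase_query = PhraseQuery::new_with_offset(terms);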