add position_length to Token (#337)

* add position_length to Token

refs #291

* Add term offset to `PhraseQuery`

refs #291

* Add new constructor for `PhraseQuery` that allows custom offset

* Fix the method name as per PR review comment

* Closes #291

Added a unit test.
The QueryParser now uses the token positions reported by the analyzer.
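For illustration, a minimal sketch of the new constructor, modelled on the unit test added below (the schema setup is assumed; only the relative spacing of the offsets matters):

```rust
use tantivy::query::PhraseQuery;
use tantivy::schema::{SchemaBuilder, Term, TEXT};

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let _schema = schema_builder.build();

    // "a" at offset 0 and "c" at offset 2: matches "a b c" but not "a c".
    // `PhraseQuery::new` would have assigned the consecutive offsets 0, 1.
    let _query = PhraseQuery::new_with_offset(vec![
        (0, Term::from_field_text(text_field, "a")),
        (2, Term::from_field_text(text_field, "c")),
    ]);
}
```

Note that the order of the terms in the `Vec` does not matter: `new_with_offset` sorts them by offset before building the query.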
Vignesh Sarma K
2018-08-13 06:44:50 +05:30
committed by Paul Masurel
parent 290620fdee
commit 09e00f1d42
9 changed files with 99 additions and 41 deletions

View File

@@ -173,4 +173,40 @@ mod tests {
         assert_eq!(test_query(vec!["a", "b"]), vec![1]);
         assert_eq!(test_query(vec!["b", "a"]), vec![2]);
     }
+
+    #[test] // motivated by #234
+    pub fn test_phrase_query_non_trivial_offsets() {
+        let mut schema_builder = SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
+            assert!(index_writer.commit().is_ok());
+        }
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        let test_query = |texts: Vec<(usize, &str)>| {
+            let mut test_collector = TestCollector::default();
+            let terms: Vec<(usize, Term)> = texts
+                .iter()
+                .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text)))
+                .collect();
+            let phrase_query = PhraseQuery::new_with_offset(terms);
+            searcher
+                .search(&phrase_query, &mut test_collector)
+                .expect("search should succeed");
+            test_collector.docs()
+        };
+        assert_eq!(test_query(vec![(0, "a"), (1, "b")]), vec![0]);
+        assert_eq!(test_query(vec![(1, "b"), (0, "a")]), vec![0]);
+        assert!(test_query(vec![(0, "a"), (2, "b")]).is_empty());
+        assert_eq!(test_query(vec![(0, "a"), (2, "c")]), vec![0]);
+        assert_eq!(test_query(vec![(0, "a"), (2, "c"), (3, "d")]), vec![0]);
+        assert_eq!(test_query(vec![(0, "a"), (2, "c"), (4, "e")]), vec![0]);
+        assert_eq!(test_query(vec![(4, "e"), (0, "a"), (2, "c")]), vec![0]);
+        assert!(test_query(vec![(0, "a"), (2, "d")]).is_empty());
+        assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]);
+    }
 }

View File

@@ -24,7 +24,7 @@ use Result;
 #[derive(Clone, Debug)]
 pub struct PhraseQuery {
     field: Field,
-    phrase_terms: Vec<Term>,
+    phrase_terms: Vec<(usize, Term)>,
 }
 
 impl PhraseQuery {
@@ -32,14 +32,25 @@ impl PhraseQuery {
     ///
     /// There must be at least two terms, and all terms
     /// must belong to the same field.
+    /// The offset of each term defaults to its index in the `Vec`.
     pub fn new(terms: Vec<Term>) -> PhraseQuery {
+        let terms_with_offset = terms.into_iter().enumerate().collect();
+        PhraseQuery::new_with_offset(terms_with_offset)
+    }
+
+    /// Creates a new `PhraseQuery` given a list of terms and their offsets.
+    ///
+    /// Can be used to provide a custom offset for each term.
+    pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery {
         assert!(
             terms.len() > 1,
             "A phrase query is required to have strictly more than one term."
         );
-        let field = terms[0].field();
+        terms.sort_by_key(|&(offset, _)| offset);
+        let field = terms[0].1.field();
         assert!(
-            terms[1..].iter().all(|term| term.field() == field),
+            terms[1..].iter().all(|term| term.1.field() == field),
             "All terms from a phrase query must belong to the same field"
         );
         PhraseQuery {
@@ -53,10 +64,11 @@ impl PhraseQuery {
         self.field
     }
 
-    /// The `Term`s in the phrase making up this `PhraseQuery`.
-    pub fn phrase_terms(&self) -> &[Term] {
-        &self.phrase_terms[..]
-    }
+    /// The `Term`s in the phrase, without their associated offsets.
+    pub fn phrase_terms(&self) -> Vec<Term> {
+        self.phrase_terms.iter().map(|(_, term)| term.clone()).collect::<Vec<Term>>()
+    }
 }
 
 impl Query for PhraseQuery {
@@ -78,13 +90,13 @@ impl Query for PhraseQuery {
                 field_name
             )))
         }
-        let terms = self.phrase_terms.clone();
         if scoring_enabled {
+            let terms = self.phrase_terms();
             let bm25_weight = BM25Weight::for_terms(searcher, &terms);
-            Ok(Box::new(PhraseWeight::new(terms, bm25_weight, true)))
+            Ok(Box::new(PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, true)))
         } else {
             Ok(Box::new(PhraseWeight::new(
-                terms,
+                self.phrase_terms.clone(),
                 BM25Weight::null(),
                 false,
             )))

View File

@@ -119,17 +119,20 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {
 impl<TPostings: Postings> PhraseScorer<TPostings> {
     pub fn new(
-        term_postings: Vec<TPostings>,
+        term_postings: Vec<(usize, TPostings)>,
         similarity_weight: BM25Weight,
         fieldnorm_reader: FieldNormReader,
         score_needed: bool,
     ) -> PhraseScorer<TPostings> {
+        let max_offset = term_postings.iter()
+            .map(|&(offset, _)| offset)
+            .max()
+            .unwrap_or(0);
         let num_docsets = term_postings.len();
         let postings_with_offsets = term_postings
             .into_iter()
-            .enumerate()
             .map(|(offset, postings)| {
-                PostingsWithOffset::new(postings, (num_docsets - offset) as u32)
+                PostingsWithOffset::new(postings, (max_offset - offset) as u32)
             })
             .collect::<Vec<_>>();
         PhraseScorer {
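
This constructor is where custom offsets are made to work: each term's postings are shifted by `max_offset - offset`, so every term of a genuine phrase occurrence lands on the same shifted position, and phrase matching reduces to intersecting the shifted position lists. (Shifting all terms by a common constant changes nothing, which is why the old `num_docsets - offset` shift also worked for the consecutive offsets produced by `enumerate()`; with arbitrary offsets it could underflow.) A self-contained sketch of the arithmetic, using a hypothetical `aligns` helper that is not part of the codebase:

```rust
/// Hypothetical helper mirroring the scorer's trick: a phrase occurrence
/// exists iff, after shifting each term's document position by
/// `max_offset - term_offset`, all terms agree on one shifted position.
fn aligns(occurrences: &[(usize, u32)]) -> bool {
    // `occurrences`: (offset of the term within the phrase,
    //                 position of one of its occurrences in the document)
    let max_offset = occurrences.iter().map(|&(o, _)| o).max().unwrap_or(0);
    let shifted: Vec<u32> = occurrences
        .iter()
        .map(|&(offset, pos)| pos + (max_offset - offset) as u32)
        .collect();
    shifted.iter().all(|&p| p == shifted[0])
}

fn main() {
    // Document "a b c d": "a" at position 0, "c" at position 2.
    assert!(aligns(&[(0, 0), (2, 2)]));  // phrase (0, "a"), (2, "c") matches
    assert!(!aligns(&[(0, 0), (2, 3)])); // "c" one token too far: no match
}
```

This also shows why only relative offsets matter: the test's `(1, "a"), (3, "c")` query behaves exactly like `(0, "a"), (2, "c")`.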

View File

@@ -9,7 +9,7 @@ use schema::Term;
 use Result;
 
 pub struct PhraseWeight {
-    phrase_terms: Vec<Term>,
+    phrase_terms: Vec<(usize, Term)>,
     similarity_weight: BM25Weight,
     score_needed: bool,
 }
@@ -17,7 +17,7 @@ pub struct PhraseWeight {
 impl PhraseWeight {
     /// Creates a new phrase weight.
     pub fn new(
-        phrase_terms: Vec<Term>,
+        phrase_terms: Vec<(usize, Term)>,
         similarity_weight: BM25Weight,
         score_needed: bool,
     ) -> PhraseWeight {
@@ -32,16 +32,16 @@ impl PhraseWeight {
 impl Weight for PhraseWeight {
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let similarity_weight = self.similarity_weight.clone();
-        let field = self.phrase_terms[0].field();
+        let field = self.phrase_terms[0].1.field();
         let fieldnorm_reader = reader.get_fieldnorms_reader(field);
         if reader.has_deletes() {
             let mut term_postings_list = Vec::new();
-            for term in &self.phrase_terms {
+            for &(offset, ref term) in &self.phrase_terms {
                 if let Some(postings) = reader
                     .inverted_index(term.field())
-                    .read_postings(term, IndexRecordOption::WithFreqsAndPositions)
+                    .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
                 {
-                    term_postings_list.push(postings);
+                    term_postings_list.push((offset, postings));
                 } else {
                     return Ok(Box::new(EmptyScorer));
                 }
@@ -54,12 +54,12 @@ impl Weight for PhraseWeight {
             )))
         } else {
             let mut term_postings_list = Vec::new();
-            for term in &self.phrase_terms {
+            for &(offset, ref term) in &self.phrase_terms {
                 if let Some(postings) = reader
                     .inverted_index(term.field())
-                    .read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)
+                    .read_postings_no_deletes(&term, IndexRecordOption::WithFreqsAndPositions)
                 {
-                    term_postings_list.push(postings);
+                    term_postings_list.push((offset, postings));
                 } else {
                     return Ok(Box::new(EmptyScorer));
                 }

View File

@@ -8,7 +8,7 @@ use std::ops::Bound;
 #[derive(Clone)]
 pub enum LogicalLiteral {
     Term(Term),
-    Phrase(Vec<Term>),
+    Phrase(Vec<(usize, Term)>),
     Range {
         field: Field,
         value_type: Type,

View File

@@ -180,7 +180,7 @@ impl QueryParser {
         &self,
         field: Field,
         phrase: &str,
-    ) -> Result<Vec<Term>, QueryParserError> {
+    ) -> Result<Vec<(usize, Term)>, QueryParserError> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
         if !field_type.is_indexed() {
@@ -191,12 +191,12 @@ impl QueryParser {
             FieldType::I64(_) => {
                 let val: i64 = i64::from_str(phrase)?;
                 let term = Term::from_field_i64(field, val);
-                Ok(vec![term])
+                Ok(vec![(0, term)])
             }
             FieldType::U64(_) => {
                 let val: u64 = u64::from_str(phrase)?;
                 let term = Term::from_field_u64(field, val);
-                Ok(vec![term])
+                Ok(vec![(0, term)])
             }
             FieldType::Str(ref str_options) => {
                 if let Some(option) = str_options.get_indexing_options() {
@@ -208,11 +208,11 @@ impl QueryParser {
                         )
                     },
                 )?;
-                let mut terms: Vec<Term> = Vec::new();
+                let mut terms: Vec<(usize, Term)> = Vec::new();
                 let mut token_stream = tokenizer.token_stream(phrase);
                 token_stream.process(&mut |token| {
                     let term = Term::from_field_text(field, &token.text);
-                    terms.push(term);
+                    terms.push((token.position, term));
                 });
                 if terms.is_empty() {
                     Ok(vec![])
@@ -242,7 +242,7 @@ impl QueryParser {
                     ))
                 }
             }
-            FieldType::HierarchicalFacet => Ok(vec![Term::from_field_text(field, phrase)]),
+            FieldType::HierarchicalFacet => Ok(vec![(0, Term::from_field_text(field, phrase))]),
             FieldType::Bytes => {
                 let field_name = self.schema.get_field_name(field).to_string();
                 Err(QueryParserError::FieldNotIndexed(field_name))
@@ -256,12 +256,13 @@ impl QueryParser {
         phrase: &str,
     ) -> Result<Option<LogicalLiteral>, QueryParserError> {
         let terms = self.compute_terms_for_string(field, phrase)?;
-        match terms.len() {
-            0 => Ok(None),
-            1 => Ok(Some(LogicalLiteral::Term(
-                terms.into_iter().next().unwrap(),
-            ))),
-            _ => Ok(Some(LogicalLiteral::Phrase(terms))),
+        match &terms[..] {
+            [] => Ok(None),
+            [(_, term)] => Ok(Some(LogicalLiteral::Term(term.clone()))),
+            _ => Ok(Some(LogicalLiteral::Phrase(terms.clone()))),
         }
     }
@@ -281,7 +282,7 @@ impl QueryParser {
         if terms.len() != 1 {
             return Err(QueryParserError::RangeMustNotHavePhrase);
         }
-        let term = terms.into_iter().next().unwrap();
+        let (_, term) = terms.into_iter().next().unwrap();
         match *bound {
             UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
             UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
@@ -423,7 +424,7 @@ fn compose_occur(left: Occur, right: Occur) -> Occur {
 fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
     match logical_literal {
         LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
-        LogicalLiteral::Phrase(terms) => Box::new(PhraseQuery::new(terms)),
+        LogicalLiteral::Phrase(terms_with_offsets) => Box::new(PhraseQuery::new_with_offset(terms_with_offsets)),
         LogicalLiteral::Range {
             field,
             value_type,
@@ -611,8 +612,8 @@ mod test {
         );
         test_parse_query_to_logical_ast_helper(
             "title:\"a b\"",
-            "\"[Term([0, 0, 0, 0, 97]), \
-             Term([0, 0, 0, 0, 98])]\"",
+            "\"[(0, Term([0, 0, 0, 0, 97])), \
+             (1, Term([0, 0, 0, 0, 98]))]\"",
             false,
         );
         test_parse_query_to_logical_ast_helper(
@@ -757,8 +758,8 @@ mod test {
         );
         test_parse_query_to_logical_ast_helper(
             "title:\"a b\"",
-            "\"[Term([0, 0, 0, 0, 97]), \
-             Term([0, 0, 0, 0, 98])]\"",
+            "\"[(0, Term([0, 0, 0, 0, 97])), \
+             (1, Term([0, 0, 0, 0, 98]))]\"",
             true,
        );
    }
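
The `terms.push((token.position, term))` change above is the heart of the fix for #291: when the field's analyzer drops tokens, the surviving tokens keep their original positions, and the resulting phrase query preserves those gaps. A hedged, self-contained sketch of what the loop collects, assuming the `StopWordFilter` from tantivy's tokenizer module (the analyzer and input are illustrative, not part of this diff):

```rust
use tantivy::schema::{SchemaBuilder, Term, TEXT};
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, TokenStream, Tokenizer};

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let field = schema_builder.add_text_field("text", TEXT);
    let _schema = schema_builder.build();

    // Hypothetical analyzer: lowercase, then drop the stop word "the".
    let tokenizer = SimpleTokenizer
        .filter(LowerCaser)
        .filter(StopWordFilter::remove(vec!["the".to_string()]));

    let mut terms: Vec<(usize, Term)> = Vec::new();
    let mut token_stream = tokenizer.token_stream("the old man");
    token_stream.process(&mut |token| {
        let term = Term::from_field_text(field, &token.text);
        // "the" is filtered out, but the survivors keep their positions:
        // this collects [(1, "old"), (2, "man")] rather than the old
        // [(0, "old"), (1, "man")], preserving the original spacing.
        terms.push((token.position, term));
    });
    assert_eq!(terms.len(), 2);
}
```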

View File

@@ -33,6 +33,7 @@ impl<'a> Tokenizer<'a> for JapaneseTokenizer {
                 offset_to,
                 position: pos,
                 text: term,
+                position_length: 1,
             });
         }
     }

View File

@@ -18,6 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
             offset_to: text.len(),
             position: 0,
             text: text.to_string(),
+            position_length: 1,
         };
         RawTokenStream {
             token,

View File

@@ -17,6 +17,8 @@ pub struct Token {
     pub position: usize,
     /// Actual text content of the token.
     pub text: String,
+    /// Length of the token, expressed in number of original tokens.
+    pub position_length: usize,
 }
 
 impl Default for Token {
@@ -26,6 +28,7 @@ impl Default for Token {
             offset_to: 0,
             position: usize::max_value(),
             text: String::with_capacity(200),
+            position_length: 1,
         }
     }
 }
@@ -273,6 +276,7 @@ mod test {
             offset_from: 2,
             offset_to: 3,
             text: "abc".to_string(),
+            position_length: 1,
         };
         let t2 = t1.clone();
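
All tokenizers in this commit set `position_length: 1`. The field is there for token filters that emit a token spanning several positions of the original text, e.g. a multi-word synonym, much like Lucene's position-length attribute. A hypothetical example, not taken from this diff:

```rust
use tantivy::tokenizer::Token;

fn main() {
    // A synonym filter could replace "new york" by the single token "ny"
    // while recording that it covers two original token positions.
    let token = Token {
        offset_from: 0,
        offset_to: 8, // byte range of "new york" in the source text
        position: 0,
        text: "ny".to_string(),
        position_length: 2, // spans two original token positions
    };
    assert_eq!(token.position_length, 2);
}
```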