add position_length to Token (#337)
* Add `position_length` to `Token` (refs #291)
* Add term offsets to `PhraseQuery` (refs #291)
* Add a new constructor for `PhraseQuery` that allows custom offsets
* Fix the method name as per PR comment
* Closes #291: added a unit test; the `QueryParser` now uses offsets from the analyzer
committed by Paul Masurel
parent 290620fdee
commit 09e00f1d42
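To make the API change concrete: `PhraseQuery::new` keeps its old signature and assigns each term an offset equal to its index in the `Vec`, while the new `new_with_offset` constructor takes explicit `(offset, Term)` pairs. A minimal sketch of the intended usage, modeled on the test added below (the index setup is illustrative):

```rust
use tantivy::query::PhraseQuery;
use tantivy::schema::{SchemaBuilder, Term, TEXT};

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let _schema = schema_builder.build();

    // Old behavior, unchanged: offsets are the indices in the Vec,
    // so this matches "a" immediately followed by "b".
    let _contiguous = PhraseQuery::new(vec![
        Term::from_field_text(text_field, "a"),
        Term::from_field_text(text_field, "b"),
    ]);

    // New constructor: explicit offsets. "a" at 0 and "c" at 2 match
    // documents where exactly one term sits between "a" and "c",
    // e.g. "a b c d e f g h" from the test below.
    let _with_gap = PhraseQuery::new_with_offset(vec![
        (0, Term::from_field_text(text_field, "a")),
        (2, Term::from_field_text(text_field, "c")),
    ]);
}
```

Existing callers are unaffected: `new` simply delegates to `new_with_offset` with enumerated offsets.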
@@ -173,4 +173,40 @@ mod tests {
         assert_eq!(test_query(vec!["a", "b"]), vec![1]);
         assert_eq!(test_query(vec!["b", "a"]), vec![2]);
     }
+
+    #[test] // motivated by #234
+    pub fn test_phrase_query_non_trivial_offsets() {
+        let mut schema_builder = SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
+            assert!(index_writer.commit().is_ok());
+        }
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        let test_query = |texts: Vec<(usize, &str)>| {
+            let mut test_collector = TestCollector::default();
+            let terms: Vec<(usize, Term)> = texts
+                .iter()
+                .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text)))
+                .collect();
+            let phrase_query = PhraseQuery::new_with_offset(terms);
+            searcher
+                .search(&phrase_query, &mut test_collector)
+                .expect("search should succeed");
+            test_collector.docs()
+        };
+        assert_eq!(test_query(vec![(0, "a"), (1, "b")]), vec![0]);
+        assert_eq!(test_query(vec![(1, "b"), (0, "a")]), vec![0]);
+        assert!(test_query(vec![(0, "a"), (2, "b")]).is_empty());
+        assert_eq!(test_query(vec![(0, "a"), (2, "c")]), vec![0]);
+        assert_eq!(test_query(vec![(0, "a"), (2, "c"), (3, "d")]), vec![0]);
+        assert_eq!(test_query(vec![(0, "a"), (2, "c"), (4, "e")]), vec![0]);
+        assert_eq!(test_query(vec![(4, "e"), (0, "a"), (2, "c")]), vec![0]);
+        assert!(test_query(vec![(0, "a"), (2, "d")]).is_empty());
+        assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]);
+    }
 }
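Note what the assertions pin down: only relative offsets matter (`(1, "a"), (3, "c")` matches just like `(0, "a"), (2, "c")`), the order of the `(offset, term)` pairs is irrelevant because the constructor sorts them by offset, and a wrong distance (`(0, "a"), (2, "b")`) yields no match.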
@@ -24,7 +24,7 @@ use Result;
 #[derive(Clone, Debug)]
 pub struct PhraseQuery {
     field: Field,
-    phrase_terms: Vec<Term>,
+    phrase_terms: Vec<(usize, Term)>,
 }

 impl PhraseQuery {
@@ -32,14 +32,25 @@ impl PhraseQuery {
     ///
     /// There must be at least two terms, and all terms
     /// must belong to the same field.
+    /// The offset of each term is its index in the `Vec`.
     pub fn new(terms: Vec<Term>) -> PhraseQuery {
+        let terms_with_offset = terms.into_iter().enumerate().collect();
+        PhraseQuery::new_with_offset(terms_with_offset)
+    }
+
+    /// Creates a new `PhraseQuery` given a list of terms and their offsets.
+    ///
+    /// Can be used to provide custom offsets for each term.
+    pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery {
         assert!(
             terms.len() > 1,
             "A phrase query is required to have strictly more than one term."
         );
-        let field = terms[0].field();
+        terms.sort_by_key(|&(offset, _)| offset);
+        let field = terms[0].1.field();
         assert!(
-            terms[1..].iter().all(|term| term.field() == field),
+            terms[1..].iter().all(|term| term.1.field() == field),
             "All terms from a phrase query must belong to the same field"
         );
         PhraseQuery {
@@ -53,10 +64,11 @@ impl PhraseQuery {
         self.field
     }

-    /// The `Term`s in the phrase making up this `PhraseQuery`.
-    pub fn phrase_terms(&self) -> &[Term] {
-        &self.phrase_terms[..]
-    }
+    /// The `Term`s in the phrase, without the associated offsets.
+    pub fn phrase_terms(&self) -> Vec<Term> {
+        self.phrase_terms.iter().map(|(_, term)| term.clone()).collect::<Vec<Term>>()
+    }
+
 }

 impl Query for PhraseQuery {
@@ -78,13 +90,13 @@ impl Query for PhraseQuery {
                 field_name
             )))
         }
-        let terms = self.phrase_terms.clone();
         if scoring_enabled {
+            let terms = self.phrase_terms();
             let bm25_weight = BM25Weight::for_terms(searcher, &terms);
-            Ok(Box::new(PhraseWeight::new(terms, bm25_weight, true)))
+            Ok(Box::new(PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, true)))
         } else {
             Ok(Box::new(PhraseWeight::new(
-                terms,
+                self.phrase_terms.clone(),
                 BM25Weight::null(),
                 false,
             )))
@@ -119,17 +119,20 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {

 impl<TPostings: Postings> PhraseScorer<TPostings> {
     pub fn new(
-        term_postings: Vec<TPostings>,
+        term_postings: Vec<(usize, TPostings)>,
         similarity_weight: BM25Weight,
         fieldnorm_reader: FieldNormReader,
         score_needed: bool,
     ) -> PhraseScorer<TPostings> {
+        let max_offset = term_postings.iter()
+            .map(|&(offset, _)| offset)
+            .max()
+            .unwrap_or(0);
         let num_docsets = term_postings.len();
         let postings_with_offsets = term_postings
             .into_iter()
-            .enumerate()
             .map(|(offset, postings)| {
-                PostingsWithOffset::new(postings, (num_docsets - offset) as u32)
+                PostingsWithOffset::new(postings, (max_offset - offset) as u32)
             })
             .collect::<Vec<_>>();
         PhraseScorer {
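The key change here is the shift passed to `PostingsWithOffset`: each term's positions are shifted by `(max_offset - offset)` so that, in a matching document, every term of the phrase lands on the same shifted position; the previous `(num_docsets - offset)` only lined up correctly when offsets were the enumeration indices. A standalone sketch of that alignment arithmetic (just the idea, not tantivy internals):

```rust
// Each (query_offset, positions) pair describes where one term of the
// phrase occurs in a document. The phrase matches if, after shifting every
// position by (max_offset - query_offset), all lists share a position.
fn aligned(doc_positions: &[(usize, Vec<u32>)]) -> bool {
    let max_offset = doc_positions.iter().map(|&(o, _)| o).max().unwrap_or(0);
    let shifted: Vec<Vec<u32>> = doc_positions
        .iter()
        .map(|&(o, ref ps)| ps.iter().map(|&p| p + (max_offset - o) as u32).collect())
        .collect();
    shifted[0]
        .iter()
        .any(|p| shifted[1..].iter().all(|ps| ps.contains(p)))
}

fn main() {
    // Document "a b c ...": "a" at position 0, "c" at position 2.
    // Query offsets 0 and 2 line up: 0 + (2 - 0) == 2 + (2 - 2).
    assert!(aligned(&[(0, vec![0]), (2, vec![2])]));
    // "b" at position 1 with query offset 2 does not line up with "a" at 0.
    assert!(!aligned(&[(0, vec![0]), (2, vec![1])]));
}
```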
@@ -9,7 +9,7 @@ use schema::Term;
 use Result;

 pub struct PhraseWeight {
-    phrase_terms: Vec<Term>,
+    phrase_terms: Vec<(usize, Term)>,
     similarity_weight: BM25Weight,
     score_needed: bool,
 }
@@ -17,7 +17,7 @@ pub struct PhraseWeight {
 impl PhraseWeight {
     /// Creates a new phrase weight.
     pub fn new(
-        phrase_terms: Vec<Term>,
+        phrase_terms: Vec<(usize, Term)>,
         similarity_weight: BM25Weight,
         score_needed: bool,
     ) -> PhraseWeight {
@@ -32,16 +32,16 @@ impl PhraseWeight {
 impl Weight for PhraseWeight {
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let similarity_weight = self.similarity_weight.clone();
-        let field = self.phrase_terms[0].field();
+        let field = self.phrase_terms[0].1.field();
         let fieldnorm_reader = reader.get_fieldnorms_reader(field);
         if reader.has_deletes() {
             let mut term_postings_list = Vec::new();
-            for term in &self.phrase_terms {
+            for &(offset, ref term) in &self.phrase_terms {
                 if let Some(postings) = reader
                     .inverted_index(term.field())
-                    .read_postings(term, IndexRecordOption::WithFreqsAndPositions)
+                    .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
                 {
-                    term_postings_list.push(postings);
+                    term_postings_list.push((offset, postings));
                 } else {
                     return Ok(Box::new(EmptyScorer));
                 }
@@ -54,12 +54,12 @@ impl Weight for PhraseWeight {
             )))
         } else {
             let mut term_postings_list = Vec::new();
-            for term in &self.phrase_terms {
+            for &(offset, ref term) in &self.phrase_terms {
                 if let Some(postings) = reader
                     .inverted_index(term.field())
-                    .read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)
+                    .read_postings_no_deletes(&term, IndexRecordOption::WithFreqsAndPositions)
                 {
-                    term_postings_list.push(postings);
+                    term_postings_list.push((offset, postings));
                 } else {
                     return Ok(Box::new(EmptyScorer));
                 }
@@ -8,7 +8,7 @@ use std::ops::Bound;
 #[derive(Clone)]
 pub enum LogicalLiteral {
     Term(Term),
-    Phrase(Vec<Term>),
+    Phrase(Vec<(usize, Term)>),
     Range {
         field: Field,
         value_type: Type,
@@ -180,7 +180,7 @@ impl QueryParser {
         &self,
         field: Field,
         phrase: &str,
-    ) -> Result<Vec<Term>, QueryParserError> {
+    ) -> Result<Vec<(usize, Term)>, QueryParserError> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
         if !field_type.is_indexed() {
@@ -191,12 +191,12 @@ impl QueryParser {
             FieldType::I64(_) => {
                 let val: i64 = i64::from_str(phrase)?;
                 let term = Term::from_field_i64(field, val);
-                Ok(vec![term])
+                Ok(vec![(0, term)])
             }
             FieldType::U64(_) => {
                 let val: u64 = u64::from_str(phrase)?;
                 let term = Term::from_field_u64(field, val);
-                Ok(vec![term])
+                Ok(vec![(0, term)])
             }
             FieldType::Str(ref str_options) => {
                 if let Some(option) = str_options.get_indexing_options() {
@@ -208,11 +208,11 @@ impl QueryParser {
                         )
                     },
                 )?;
-                let mut terms: Vec<Term> = Vec::new();
+                let mut terms: Vec<(usize, Term)> = Vec::new();
                 let mut token_stream = tokenizer.token_stream(phrase);
                 token_stream.process(&mut |token| {
                     let term = Term::from_field_text(field, &token.text);
-                    terms.push(term);
+                    terms.push((token.position, term));
                 });
                 if terms.is_empty() {
                     Ok(vec![])
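Using `token.position` instead of the running index means an analyzer that drops tokens (a stop-word filter, for instance) produces non-contiguous offsets, so the resulting `PhraseQuery` preserves the original distances between the surviving tokens. A hypothetical illustration (the field, terms, and stop-word behavior are made up for the example):

```rust
use tantivy::query::PhraseQuery;
use tantivy::schema::{SchemaBuilder, Term, TEXT};

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let text_field = schema_builder.add_text_field("text", TEXT);

    // Hypothetical analyzer output for "the quick fox", where a stop-word
    // filter removed "the" (position 0): the surviving tokens keep their
    // original positions 1 and 2.
    let terms: Vec<(usize, Term)> = vec![
        (1, Term::from_field_text(text_field, "quick")),
        (2, Term::from_field_text(text_field, "fox")),
    ];
    // Offsets 1 and 2 are still adjacent, so the parsed phrase query
    // keeps meaning "quick" immediately followed by "fox".
    let _query = PhraseQuery::new_with_offset(terms);
}
```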
@@ -242,7 +242,7 @@ impl QueryParser {
                     ))
                 }
             }
-            FieldType::HierarchicalFacet => Ok(vec![Term::from_field_text(field, phrase)]),
+            FieldType::HierarchicalFacet => Ok(vec![(0, Term::from_field_text(field, phrase))]),
             FieldType::Bytes => {
                 let field_name = self.schema.get_field_name(field).to_string();
                 Err(QueryParserError::FieldNotIndexed(field_name))
@@ -256,12 +256,13 @@ impl QueryParser {
         phrase: &str,
     ) -> Result<Option<LogicalLiteral>, QueryParserError> {
         let terms = self.compute_terms_for_string(field, phrase)?;
-        match terms.len() {
-            0 => Ok(None),
-            1 => Ok(Some(LogicalLiteral::Term(
-                terms.into_iter().next().unwrap(),
-            ))),
-            _ => Ok(Some(LogicalLiteral::Phrase(terms))),
+        match &terms[..] {
+            [] => Ok(None),
+            [(_, term)] => Ok(Some(LogicalLiteral::Term(term.clone()))),
+            _ => Ok(Some(LogicalLiteral::Phrase(terms.clone()))),
         }
     }
@@ -281,7 +282,7 @@ impl QueryParser {
         if terms.len() != 1 {
             return Err(QueryParserError::RangeMustNotHavePhrase);
         }
-        let term = terms.into_iter().next().unwrap();
+        let (_, term) = terms.into_iter().next().unwrap();
         match *bound {
             UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
             UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
@@ -423,7 +424,7 @@ fn compose_occur(left: Occur, right: Occur) -> Occur {
 fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
     match logical_literal {
         LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
-        LogicalLiteral::Phrase(terms) => Box::new(PhraseQuery::new(terms)),
+        LogicalLiteral::Phrase(term_with_offsets) => Box::new(PhraseQuery::new_with_offset(term_with_offsets)),
         LogicalLiteral::Range {
             field,
             value_type,
@@ -611,8 +612,8 @@ mod test {
         );
         test_parse_query_to_logical_ast_helper(
             "title:\"a b\"",
-            "\"[Term([0, 0, 0, 0, 97]), \
-             Term([0, 0, 0, 0, 98])]\"",
+            "\"[(0, Term([0, 0, 0, 0, 97])), \
+             (1, Term([0, 0, 0, 0, 98]))]\"",
             false,
         );
         test_parse_query_to_logical_ast_helper(
@@ -757,8 +758,8 @@ mod test {
         );
         test_parse_query_to_logical_ast_helper(
             "title:\"a b\"",
-            "\"[Term([0, 0, 0, 0, 97]), \
-             Term([0, 0, 0, 0, 98])]\"",
+            "\"[(0, Term([0, 0, 0, 0, 97])), \
+             (1, Term([0, 0, 0, 0, 98]))]\"",
             true,
         );
     }
@@ -33,6 +33,7 @@ impl<'a> Tokenizer<'a> for JapaneseTokenizer {
                     offset_to,
                     position: pos,
                     text: term,
+                    position_length: 1
                 });
             }
         }
@@ -18,6 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
             offset_to: text.len(),
             position: 0,
             text: text.to_string(),
+            position_length: 1
         };
         RawTokenStream {
             token,
@@ -17,6 +17,8 @@ pub struct Token {
     pub position: usize,
     /// Actual text content of the token.
    pub text: String,
+    /// Length of the token, expressed as a number of original tokens.
+    pub position_length: usize,
 }

 impl Default for Token {
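`position_length` is the number of positions the token occupies, which only matters for tokens that stand in for several original tokens (multi-word synonyms, compounds); it mirrors Lucene's position-length attribute. Every tokenizer touched by this commit emits 1. A hypothetical token that would use a larger value:

```rust
use tantivy::tokenizer::Token;

fn main() {
    // Hypothetical: a synonym filter emitting "new york" as one token that
    // spans two original positions would set position_length to 2. Nothing
    // in this commit produces such tokens; 1 is the default.
    let token = Token {
        offset_from: 0,
        offset_to: 8,
        position: 0,
        text: "new york".to_string(),
        position_length: 2,
    };
    assert_eq!(token.position_length, 2);
}
```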
@@ -26,6 +28,7 @@ impl Default for Token {
             offset_to: 0,
             position: usize::max_value(),
             text: String::with_capacity(200),
+            position_length: 1,
         }
     }
 }
@@ -273,6 +276,7 @@ mod test {
             offset_from: 2,
             offset_to: 3,
             text: "abc".to_string(),
+            position_length: 1
         };
         let t2 = t1.clone();