mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 18:12:55 +00:00
Added unit test for long tokens (#1635)
* Bugfix on long tokens and multivalue text fields.
  Fixes a minor bug for the strong edge case in which a tokenizer would emit tokens where the last token does not cover the last position.
  More importantly, this adds unit tests.
  Closes #1634
* Update src/indexer/segment_writer.rs
Co-authored-by: PSeitz <PSeitz@users.noreply.github.com>
Co-authored-by: PSeitz <PSeitz@users.noreply.github.com>
This commit is contained in:
@@ -785,4 +785,87 @@ mod tests {
|
||||
// On release this was [2, 1]. (< note the decreasing values)
|
||||
assert_eq!(positions, &[2, 5]);
|
||||
}
|
||||
|
||||
#[test]
fn test_multiple_field_value_and_long_tokens() {
    // Schema with a single text field named "text".
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();

    // A somewhat contrived input: "roller-coaster" is pre-tokenized into a
    // single token that declares a position length of 2.
    let long_token = Token {
        offset_from: 0,
        offset_to: 14,
        position: 0,
        text: "rollercoaster".to_string(),
        position_length: 2,
    };
    let pre_tokenized = PreTokenizedString {
        text: "roller-coaster".to_string(),
        tokens: vec![long_token],
    };

    // The same pre-tokenized value is added twice to the same field, so the
    // document carries two values for "text".
    let mut doc = Document::default();
    doc.add_pre_tokenized_text(text, pre_tokenized.clone());
    doc.add_pre_tokenized_text(text, pre_tokenized);

    // Index the document in RAM and commit it.
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_for_tests().unwrap();
    index_writer.add_document(doc).unwrap();
    index_writer.commit().unwrap();

    // Read back the postings (with positions) for the term "rollercoaster".
    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    let inv_index = searcher.segment_reader(0).inverted_index(text).unwrap();
    let term = Term::from_field_text(text, "rollercoaster");
    let mut postings = inv_index
        .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
        .unwrap()
        .unwrap();
    assert_eq!(postings.doc(), 0u32);

    let mut positions = Vec::new();
    postings.positions(&mut positions);
    // We would get [0, 2] instead if the token had a position length of 1.
    assert_eq!(positions, &[0, 3]);
}
|
||||
|
||||
#[test]
fn test_last_token_not_ending_last() {
    // Schema with a single text field named "text".
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();

    // A contrived corner case (no real-life use case comes to mind): the
    // first token starts at position 0 with a position length of 3, so it
    // ends after the last token, which sits at position 1 with length 1.
    let long_token = Token {
        offset_from: 0,
        offset_to: 14,
        position: 0,
        text: "long_token".to_string(),
        position_length: 3,
    };
    let short_token = Token {
        offset_from: 0,
        offset_to: 14,
        position: 1,
        text: "short".to_string(),
        position_length: 1,
    };
    let pre_tokenized = PreTokenizedString {
        text: "contrived-example".to_string(),
        tokens: vec![long_token, short_token],
    };

    // First value: the pre-tokenized text. Second value: a plain string whose
    // position is the one checked at the end of the test.
    let mut doc = Document::default();
    doc.add_pre_tokenized_text(text, pre_tokenized);
    doc.add_text(text, "hello");

    // Index the document in RAM and commit it.
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_for_tests().unwrap();
    index_writer.add_document(doc).unwrap();
    index_writer.commit().unwrap();

    // Read back the postings (with positions) for the term "hello".
    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    let inv_index = searcher.segment_reader(0).inverted_index(text).unwrap();
    let term = Term::from_field_text(text, "hello");
    let mut postings = inv_index
        .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
        .unwrap()
        .unwrap();
    assert_eq!(postings.doc(), 0u32);

    let mut positions = Vec::new();
    postings.positions(&mut positions);
    // We would get 3 instead if the long token had a position length of 1.
    assert_eq!(positions, &[4]);
}
|
||||
}
|
||||
|
||||
@@ -170,7 +170,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
|
||||
term_buffer.truncate_value_bytes(end_of_path_idx);
|
||||
term_buffer.append_bytes(token.text.as_bytes());
|
||||
let start_position = indexing_position.end_position + token.position as u32;
|
||||
end_position = start_position + token.position_length as u32;
|
||||
end_position = end_position.max(start_position + token.position_length as u32);
|
||||
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
|
||||
term_id_fast_field_writer.add_val(unordered_term_id);
|
||||
|
||||
Reference in New Issue
Block a user