Added unit test for long tokens (#1635)

* Bugfix on long tokens and multivalue text fields.

Fixes a minor bug in a rare corner case
in which a tokenizer emits a token stream
whose last token does not cover the stream's
last position (i.e., an earlier token has a
larger `position + position_length`).

More importantly, this adds unit tests.

Closes #1634

* Update src/indexer/segment_writer.rs

Co-authored-by: PSeitz <PSeitz@users.noreply.github.com>

Author: Paul Masurel
Date: 2022-10-20 15:05:37 +09:00
Committed by: GitHub
Parent: 8de7fa9d95
Commit: 483b1d13d4
2 changed files with 84 additions and 1 deletion
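
For context, a token's position_length lets one token span several positions
(e.g., compound or synonym tokens matched by phrase queries). Below is a
minimal sketch of the offending shape, mirroring the tokens of
test_last_token_not_ending_last further down (the concrete offsets and strings
are illustrative, not part of the fix):

use tantivy::tokenizer::Token;

fn main() {
    // The first token covers positions [0, 3), while the *last* token only
    // covers [1, 2): the stream ends at position 3 even though its last
    // token ends at position 2.
    let stream = vec![
        Token {
            offset_from: 0,
            offset_to: 14,
            position: 0,
            text: "long_token".to_string(),
            position_length: 3,
        },
        Token {
            offset_from: 0,
            offset_to: 14,
            position: 1,
            text: "short".to_string(),
            position_length: 1,
        },
    ];
    // The end of the stream is the max over all tokens,
    // not the end of the last token.
    assert_eq!(
        stream
            .iter()
            .map(|t| t.position + t.position_length)
            .max(),
        Some(3)
    );
}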

@@ -785,4 +785,87 @@ mod tests {
        // On release this was [2, 1]. (< note the decreasing values)
        assert_eq!(positions, &[2, 5]);
    }

    #[test]
    fn test_multiple_field_value_and_long_tokens() {
        let mut schema_builder = Schema::builder();
        let text = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let mut doc = Document::default();
        // This is a bit of a contrived example.
        let tokens = PreTokenizedString {
            text: "roller-coaster".to_string(),
            tokens: vec![Token {
                offset_from: 0,
                offset_to: 14,
                position: 0,
                text: "rollercoaster".to_string(),
                position_length: 2,
            }],
        };
        doc.add_pre_tokenized_text(text, tokens.clone());
        doc.add_pre_tokenized_text(text, tokens);
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc).unwrap();
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let seg_reader = searcher.segment_reader(0);
        let inv_index = seg_reader.inverted_index(text).unwrap();
        let term = Term::from_field_text(text, "rollercoaster");
        let mut postings = inv_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();
        assert_eq!(postings.doc(), 0u32);
        let mut positions = Vec::new();
        postings.positions(&mut positions);
        assert_eq!(positions, &[0, 3]); //< as opposed to [0, 2] if the token had a position length of 1.
    }
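    // Why `[0, 3]` above: the single token spans positions [0, 2) in the first
    // field value, so its end_position is 2. Assuming a gap of one position
    // between consecutive values of a multivalued field (an assumption that is
    // consistent with the assertions in these tests), the second value starts
    // at position 3, and its token (relative position 0) lands at 3. With a
    // position_length of 1, the second occurrence would land at 2 instead.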

    #[test]
    fn test_last_token_not_ending_last() {
        let mut schema_builder = Schema::builder();
        let text = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let mut doc = Document::default();
        // This is a bit of a contrived example.
        let tokens = PreTokenizedString {
            text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
            tokens: vec![
                // Not the last token, yet it ends after the last token.
                Token {
                    offset_from: 0,
                    offset_to: 14,
                    position: 0,
                    text: "long_token".to_string(),
                    position_length: 3,
                },
                Token {
                    offset_from: 0,
                    offset_to: 14,
                    position: 1,
                    text: "short".to_string(),
                    position_length: 1,
                },
            ],
        };
        doc.add_pre_tokenized_text(text, tokens);
        doc.add_text(text, "hello");
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc).unwrap();
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let seg_reader = searcher.segment_reader(0);
        let inv_index = seg_reader.inverted_index(text).unwrap();
        let term = Term::from_field_text(text, "hello");
        let mut postings = inv_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();
        assert_eq!(postings.doc(), 0u32);
        let mut positions = Vec::new();
        postings.positions(&mut positions);
        assert_eq!(positions, &[4]); //< as opposed to 3 if the first token had a position length of 1.
    }
}

@@ -170,7 +170,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
         term_buffer.truncate_value_bytes(end_of_path_idx);
         term_buffer.append_bytes(token.text.as_bytes());
         let start_position = indexing_position.end_position + token.position as u32;
-        end_position = start_position + token.position_length as u32;
+        end_position = end_position.max(start_position + token.position_length as u32);
         let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
         if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
             term_id_fast_field_writer.add_val(unordered_term_id);
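
A worked sketch of why the max matters, using the tokens from
test_last_token_not_ending_last (the inter-value gap of one position is an
assumption consistent with the assertions above, not a documented guarantee):

fn main() {
    // Token A: position 0, position_length 3 -> start 0, end 3.
    // Token B: position 1, position_length 1 -> start 1, end 2.
    //
    // Before the fix, end_position was overwritten by every token and
    // finished at 2 (token B), so "hello" in the next field value landed
    // at position 3, inside token A's span. With the max, end_position
    // finishes at 3 and "hello" lands at 4, matching
    // assert_eq!(positions, &[4]) in the test.
    let mut end_position = 0u32;
    for (position, position_length) in [(0u32, 3u32), (1, 1)] {
        let start_position = position; // indexing_position.end_position is 0 here
        end_position = end_position.max(start_position + position_length);
    }
    assert_eq!(end_position, 3);
    assert_eq!(end_position + 1, 4); // assumed gap of 1 before the next value
}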