mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
@@ -7,7 +7,7 @@ Minor
|
||||
- Small simplification of the code.
|
||||
Calling .freq() or .doc() when .advance() has never been called
|
||||
on segment postings should panic from now on.
|
||||
|
||||
- Tokens exceeding `u16::max_value() - 4` bytes are discarded silently instead of panicking.
|
||||
|
||||
|
||||
Tantivy 0.9.0
|
||||
|
||||
@@ -55,13 +55,15 @@ pub mod tests {
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
use indexer::SegmentWriter;
|
||||
use merge_policy::NoMergePolicy;
|
||||
use query::Scorer;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use schema::Field;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
|
||||
use schema::{Field, TextOptions};
|
||||
use schema::{IndexRecordOption, TextFieldIndexing};
|
||||
use std::iter;
|
||||
use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
@@ -160,6 +162,52 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that a token longer than `MAX_TOKEN_LEN` is dropped at indexing time,
// while a token of exactly `MAX_TOKEN_LEN` bytes is indexed normally.
#[test]
|
||||
pub fn test_drop_token_that_are_too_long() {
|
||||
// A run of exactly `MAX_TOKEN_LEN` 'A's: the second half of the test expects it to be indexed.
let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
|
||||
// One 'A' more than `MAX_TOKEN_LEN`, followed by the short word `hello`.
let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
|
||||
exceeding_token_text.push_str(" hello");
|
||||
let mut schema_builder = Schema::builder();
|
||||
// The field refers to its tokenizer by name; that name is registered further down.
let text_options = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
|
||||
.set_tokenizer("simple_no_truncation"),
|
||||
);
|
||||
let text_field = schema_builder.add_text_field("text", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
// Register `SimpleTokenizer` under the name used by the schema above.
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
let reader = index.reader().unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
// NOTE(review): presumably disables merging so that each commit below stays in its
// own segment (ordinals 0 and 1 are addressed directly) -- confirm.
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
{
|
||||
// First commit: the document holds the over-long token plus `hello`.
index_writer.add_document(doc!(text_field=>exceeding_token_text));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field);
|
||||
// Only one term is expected: the over-long token must not appear in the dictionary.
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
|
||||
assert_eq!(&bytes, b"hello");
|
||||
}
|
||||
{
|
||||
// Second commit: a token of exactly `MAX_TOKEN_LEN` bytes must be kept as is.
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
// The second document is looked up in the segment with ordinal 1.
let segment_reader = searcher.segment_reader(1u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field);
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
|
||||
assert_eq!(&bytes[..], ok_token_text.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm1() {
|
||||
let mut positions = Vec::new();
|
||||
|
||||
@@ -12,8 +12,8 @@ use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use termdict::TermOrdinal;
|
||||
use tokenizer::Token;
|
||||
use tokenizer::TokenStream;
|
||||
use tokenizer::{Token, MAX_TOKEN_LEN};
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
@@ -210,8 +210,11 @@ pub trait PostingsWriter {
|
||||
) -> u32 {
|
||||
let mut term = Term::for_field(field);
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.text.as_str());
|
||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
// We skip all tokens with a len (in bytes) greater than `MAX_TOKEN_LEN`.
|
||||
if token.text.len() <= MAX_TOKEN_LEN {
|
||||
term.set_text(token.text.as_str());
|
||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
}
|
||||
};
|
||||
token_stream.process(&mut sink)
|
||||
}
|
||||
|
||||
@@ -97,6 +97,8 @@
|
||||
//! If you built your schema programmatically, a complete example
|
||||
//! could look like this for instance.
|
||||
//!
|
||||
//! Note that tokens with a len strictly greater than [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html) are silently discarded.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```
|
||||
@@ -157,6 +159,13 @@ pub use self::tokenizer::BoxedTokenizer;
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
/// Maximum authorized len (in bytes) for a token.
|
||||
///
|
||||
/// Tokenizers are in charge of not emitting tokens larger than this value.
|
||||
/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
|
||||
/// `2^16 - 1 - 4`, the token will simply be ignored downstream.
|
||||
pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use super::{
|
||||
|
||||
Reference in New Issue
Block a user