add separate tokenizer manager for fast fields (#2019)

* add separate tokenizer manager for fast fields
* rename
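In practice, the new API looks like this — a minimal sketch distilled from the test added in this commit (the tokenizer name `custom_lowercase` is just an example):

```rust
use tantivy::schema::{Schema, TextOptions};
use tantivy::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use tantivy::Index;

fn main() {
    // Declare a fast text field that normalizes through "custom_lowercase".
    let mut schema_builder = Schema::builder();
    let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
    schema_builder.add_text_field("text", opt);
    let schema = schema_builder.build();

    // Register the tokenizer on a manager dedicated to fast fields,
    // separate from the manager used for the inverted index.
    let ff_tokenizer_manager = TokenizerManager::default();
    ff_tokenizer_manager.register(
        "custom_lowercase",
        TextAnalyzer::builder(RawTokenizer).filter(LowerCaser).build(),
    );

    let mut index = Index::create_in_ram(schema);
    index.set_fast_field_tokenizers(ff_tokenizer_manager);
}
```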
```diff
@@ -282,6 +282,7 @@ pub struct Index {
     settings: IndexSettings,
     executor: Arc<Executor>,
     tokenizers: TokenizerManager,
+    fast_field_tokenizers: TokenizerManager,
     inventory: SegmentMetaInventory,
 }
```
```diff
@@ -394,6 +395,7 @@ impl Index {
             directory,
             schema,
             tokenizers: TokenizerManager::default(),
+            fast_field_tokenizers: TokenizerManager::default(),
             executor: Arc::new(Executor::single_thread()),
             inventory,
         }
```
```diff
@@ -409,6 +411,16 @@ impl Index {
         &self.tokenizers
     }

+    /// Setter for the fast field tokenizer manager.
+    pub fn set_fast_field_tokenizers(&mut self, tokenizers: TokenizerManager) {
+        self.fast_field_tokenizers = tokenizers;
+    }
+
+    /// Accessor for the fast field tokenizer manager.
+    pub fn fast_field_tokenizer(&self) -> &TokenizerManager {
+        &self.fast_field_tokenizers
+    }
+
     /// Get the tokenizer associated with a specific field.
     pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> {
         let field_entry = self.schema.get_field_entry(field);
```
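A registered tokenizer can then be looked up again through the new accessor; `TokenizerManager::get` returns an `Option<TextAnalyzer>` (a sketch, continuing the example above):

```rust
// Look up the fast field tokenizer by name through the new accessor.
// Returns None if "custom_lowercase" was never registered.
let analyzer = index.fast_field_tokenizer().get("custom_lowercase");
assert!(analyzer.is_some());
```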
```diff
@@ -90,10 +90,11 @@ mod tests {
     use crate::directory::{Directory, RamDirectory, WritePtr};
     use crate::merge_policy::NoMergePolicy;
     use crate::schema::{
-        Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, FAST,
-        INDEXED, STORED, STRING, TEXT,
+        Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
+        TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
     };
     use crate::time::OffsetDateTime;
+    use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
     use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};

     pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
```
```diff
@@ -1173,6 +1174,35 @@ mod tests {
         assert_eq!(&vals, &[33]);
     }

+    #[test]
+    fn test_fast_field_tokenizer() {
+        let mut schema_builder = Schema::builder();
+        let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
+        let text_field = schema_builder.add_text_field("text", opt);
+        let schema = schema_builder.build();
+        let ff_tokenizer_manager = TokenizerManager::default();
+        ff_tokenizer_manager.register(
+            "custom_lowercase",
+            TextAnalyzer::builder(RawTokenizer)
+                .filter(LowerCaser)
+                .build(),
+        );
+
+        let mut index = Index::create_in_ram(schema);
+        index.set_fast_field_tokenizers(ff_tokenizer_manager);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        index_writer
+            .add_document(doc!(text_field => "Test1 test2"))
+            .unwrap();
+        index_writer.commit().unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let fast_field_reader = searcher.segment_reader(0u32).fast_fields();
+        let column = fast_field_reader.str("text").unwrap().unwrap();
+        let mut out = String::new();
+        column.ord_to_str(0u64, &mut out).unwrap();
+        assert_eq!(&out, "test1 test2");
+    }
+
     #[test]
     fn test_text_fast_field_tokenizer() {
         let mut schema_builder = Schema::builder();
```
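Why the test expects `"test1 test2"`: `RawTokenizer` emits the whole input as a single token, and `LowerCaser` then lowercases it, so the fast field column stores exactly one ordinal whose value is the lowercased string. A minimal standalone sketch of that pipeline:

```rust
use tantivy::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenStream};

fn main() {
    let mut analyzer = TextAnalyzer::builder(RawTokenizer).filter(LowerCaser).build();
    let mut stream = analyzer.token_stream("Test1 test2");
    while stream.advance() {
        // Prints a single token: "test1 test2"
        println!("{}", stream.token().text);
    }
}
```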
```diff
@@ -84,6 +84,7 @@ impl SegmentWriter {
     ) -> crate::Result<SegmentWriter> {
         let schema = segment.schema();
         let tokenizer_manager = segment.index().tokenizers().clone();
+        let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
         let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
         let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
         let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
```
```diff
@@ -113,7 +114,7 @@ impl SegmentWriter {
             segment_serializer,
             fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
                 &schema,
-                tokenizer_manager,
+                tokenizer_manager_fast_field,
             )?,
             doc_opstamps: Vec::with_capacity(1_000),
             per_field_text_analyzers,
```
```diff
@@ -25,8 +25,12 @@ pub struct TextOptions {

 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 #[serde(untagged)]
 /// Enum to control the fast field setting of a text field.
 enum FastFieldOptions {
     /// Flag to enable/disable the fast field.
     IsEnabled(bool),
+    /// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer
+    /// manager, see `Index::fast_field_tokenizer`.
+    EnabledWithTokenizer { with_tokenizer: TokenizerName },
 }
```
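Because the enum is `#[serde(untagged)]`, the `fast` setting in a serialized schema can take either a bare bool or a map form. A standalone sketch of that pattern, mirroring the names from the diff but using `String` in place of `TokenizerName` (this is an illustration, not tantivy's own code):

```rust
use serde::{Deserialize, Serialize};

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
enum FastFieldOptions {
    IsEnabled(bool),
    EnabledWithTokenizer { with_tokenizer: String },
}

fn main() {
    // With `untagged`, serde tries each variant in order, so a bare
    // bool and a map both deserialize.
    let a: FastFieldOptions = serde_json::from_str("true").unwrap();
    assert_eq!(a, FastFieldOptions::IsEnabled(true));

    let b: FastFieldOptions =
        serde_json::from_str(r#"{ "with_tokenizer": "custom_lowercase" }"#).unwrap();
    assert_eq!(
        b,
        FastFieldOptions::EnabledWithTokenizer { with_tokenizer: "custom_lowercase".into() }
    );
}
```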
```diff
@@ -111,6 +115,8 @@ impl TextOptions {
     /// The effective cardinality depends on the tokenizer. Without a tokenizer, the text will be
     /// stored as is, which is equivalent to the "raw" tokenizer. The tokenizer can be used to
     /// apply normalization like lower case.
+    /// The passed tokenizer_name must be available on the fast field tokenizer manager,
+    /// see `Index::fast_field_tokenizer`.
     ///
     /// The original text can be retrieved via
     /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
```
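A sketch of the documented behavior on the schema side: a fast text field stored as-is versus one normalized through a tokenizer (assuming `custom_lowercase` is registered on the fast field tokenizer manager via `Index::set_fast_field_tokenizers`, as in the test above):

```rust
use tantivy::schema::{Schema, TextOptions};

fn main() {
    let mut schema_builder = Schema::builder();
    // No tokenizer: the text is stored as-is (the "raw" tokenizer).
    schema_builder.add_text_field("id", TextOptions::default().set_fast(None));
    // With a tokenizer name resolved against the fast field tokenizer manager.
    schema_builder
        .add_text_field("city", TextOptions::default().set_fast(Some("custom_lowercase")));
    let _schema = schema_builder.build();
}
```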