add separate tokenizer manager for fast fields (#2019)

* add separate tokenizer manager for fast fields

* rename
This commit is contained in:
PSeitz
2023-05-08 17:22:31 +08:00
committed by GitHub
parent 45ff0e3c5c
commit 4ee1b5cda0
4 changed files with 52 additions and 3 deletions

View File

@@ -282,6 +282,7 @@ pub struct Index {
settings: IndexSettings,
executor: Arc<Executor>,
tokenizers: TokenizerManager,
fast_field_tokenizers: TokenizerManager,
inventory: SegmentMetaInventory,
}
@@ -394,6 +395,7 @@ impl Index {
directory,
schema,
tokenizers: TokenizerManager::default(),
fast_field_tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
inventory,
}
@@ -409,6 +411,16 @@ impl Index {
&self.tokenizers
}
/// Setter for the fast field tokenizer manager.
///
/// Replaces the `TokenizerManager` used to normalize text values before they
/// are written into fast field columns; `SegmentWriter` clones this manager
/// when constructing its `FastFieldsWriter`. It is independent of the regular
/// indexing tokenizer manager (`Index::tokenizers`).
pub fn set_fast_field_tokenizers(&mut self, tokenizers: TokenizerManager) {
self.fast_field_tokenizers = tokenizers;
}
/// Accessor for the fast field tokenizer manager.
///
/// Returns the manager set via `set_fast_field_tokenizers` (defaults to
/// `TokenizerManager::default()` at index creation).
///
/// NOTE(review): the getter name is singular (`fast_field_tokenizer`) while
/// the field and setter are plural — kept as-is because callers (e.g.
/// `SegmentWriter::new`) already depend on this name.
pub fn fast_field_tokenizer(&self) -> &TokenizerManager {
&self.fast_field_tokenizers
}
/// Get the tokenizer associated with a specific field.
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> {
let field_entry = self.schema.get_field_entry(field);

View File

@@ -90,10 +90,11 @@ mod tests {
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{
Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, FAST,
INDEXED, STORED, STRING, TEXT,
Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::time::OffsetDateTime;
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
@@ -1173,6 +1174,35 @@ mod tests {
assert_eq!(&vals, &[33]);
}
#[test]
fn test_fast_field_tokenizer() {
    // Schema with a single text field whose fast-field representation is
    // produced by a tokenizer registered under "custom_lowercase".
    let mut builder = Schema::builder();
    let options = TextOptions::default().set_fast(Some("custom_lowercase"));
    let text_field = builder.add_text_field("text", options);
    let schema = builder.build();

    // Register a tokenizer that keeps the text as a single token and
    // lowercases it, on a manager dedicated to fast fields.
    let fast_field_tokenizers = TokenizerManager::default();
    let lowercase_analyzer = TextAnalyzer::builder(RawTokenizer)
        .filter(LowerCaser)
        .build();
    fast_field_tokenizers.register("custom_lowercase", lowercase_analyzer);

    let mut index = Index::create_in_ram(schema);
    index.set_fast_field_tokenizers(fast_field_tokenizers);

    // Index one document and commit so a segment exists to read back from.
    let mut writer = index.writer_for_tests().unwrap();
    writer
        .add_document(doc!(text_field => "Test1 test2"))
        .unwrap();
    writer.commit().unwrap();

    // The stored fast-field value must have gone through the custom
    // lowercasing analyzer, not the regular indexing tokenizer.
    let searcher = index.reader().unwrap().searcher();
    let column = searcher
        .segment_reader(0u32)
        .fast_fields()
        .str("text")
        .unwrap()
        .unwrap();
    let mut value = String::new();
    column.ord_to_str(0u64, &mut value).unwrap();
    assert_eq!(&value, "test1 test2");
}
#[test]
fn test_text_fast_field_tokenizer() {
let mut schema_builder = Schema::builder();

View File

@@ -84,6 +84,7 @@ impl SegmentWriter {
) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
@@ -113,7 +114,7 @@ impl SegmentWriter {
segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
&schema,
tokenizer_manager,
tokenizer_manager_fast_field,
)?,
doc_opstamps: Vec::with_capacity(1_000),
per_field_text_analyzers,

View File

@@ -25,8 +25,12 @@ pub struct TextOptions {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
/// Controls the fast field setting of a text field.
///
/// Serialized untagged, so a plain bool and the `{ with_tokenizer: … }` map
/// form both deserialize into this enum.
enum FastFieldOptions {
/// Flag to enable/disable the fast field.
IsEnabled(bool),
/// Enable the fast field with a tokenizer. The tokenizer must be registered
/// on the fast field tokenizer manager (see `Index::fast_field_tokenizer`).
EnabledWithTokenizer { with_tokenizer: TokenizerName },
}
@@ -111,6 +115,8 @@ impl TextOptions {
/// The effective cardinality depends on the tokenizer. Without a tokenizer, the text will be
/// stored as is, which equals to the "raw" tokenizer. The tokenizer can be used to apply
/// normalization like lower case.
/// The passed `tokenizer_name` must be registered on the fast field tokenizer
/// manager (see `Index::fast_field_tokenizer`).
///
/// The original text can be retrieved via
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)