tokenizer option on text fastfield (#1945)

* tokenizer option on text fastfield

allow to set tokenizer option on text fastfield (fixes #1901)
handle PreTokenized strings in fast field

* change visibility

* remove custom de/serialization
This commit is contained in:
PSeitz
2023-03-31 16:03:38 +08:00
committed by GitHub
parent 4cf93dab7d
commit 5c4ea6a708
6 changed files with 231 additions and 41 deletions

View File

@@ -42,7 +42,7 @@ fn main() -> tantivy::Result<()> {
.set_index_option(IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
)
.set_fast()
.set_fast(None)
.set_stored();
schema_builder.add_text_field("category", text_fieldtype);
schema_builder.add_f64_field("stock", FAST);

View File

@@ -445,7 +445,7 @@ mod tests {
.set_index_option(IndexRecordOption::Basic)
.set_fieldnorms(false),
)
.set_fast()
.set_fast(None)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
@@ -500,7 +500,7 @@ mod tests {
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast()
.set_fast(None)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", FAST);

View File

@@ -115,7 +115,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap();
fast_field_writers
.add_document(&doc!(*FIELD=>13u64))
.unwrap();
@@ -148,7 +148,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap();
fast_field_writers
.add_document(&doc!(*FIELD=>4u64))
.unwrap();
@@ -203,7 +203,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap();
for _ in 0..10_000 {
fast_field_writers
.add_document(&doc!(*FIELD=>100_000u64))
@@ -231,7 +231,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap();
// forcing the amplitude to be high
fast_field_writers
.add_document(&doc!(*FIELD=>0u64))
@@ -268,7 +268,7 @@ mod tests {
let schema = schema_builder.build();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
for i in -100i64..10_000i64 {
let mut doc = Document::default();
doc.add_i64(i64_field, i);
@@ -310,7 +310,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
@@ -343,7 +343,7 @@ mod tests {
let schema = schema_builder.build();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
@@ -379,7 +379,7 @@ mod tests {
let directory = RamDirectory::create();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA).unwrap();
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap();
}
@@ -759,7 +759,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
fast_field_writers.add_document(&doc!(field=>true)).unwrap();
fast_field_writers
.add_document(&doc!(field=>false))
@@ -793,7 +793,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
for _ in 0..50 {
fast_field_writers.add_document(&doc!(field=>true)).unwrap();
fast_field_writers
@@ -822,7 +822,7 @@ mod tests {
let schema = schema_builder.build();
{
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
@@ -849,7 +849,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(schema);
let mut fast_field_writers = FastFieldsWriter::from_schema(schema).unwrap();
for doc in docs {
fast_field_writers.add_document(doc).unwrap();
}
@@ -1173,6 +1173,45 @@ mod tests {
assert_eq!(&vals, &[33]);
}
#[test]
fn test_text_fast_field_tokenizer() {
// Checks that a tokenizer configured on a text fast field is applied to the values
// recorded in the fast field, independently of the indexing tokenizer.
let mut schema_builder = Schema::builder();
// The inverted index uses the "raw" tokenizer, while the fast field is given the
// "default" tokenizer.
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
crate::schema::TextFieldIndexing::default()
.set_index_option(crate::schema::IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
)
.set_fast(Some("default"))
.set_stored();
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// The two documents differ only by letter case.
index_writer
.add_document(doc!(log_field => "info"))
.unwrap();
index_writer
.add_document(doc!(log_field => "INFO"))
.unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fast_field_reader = searcher.segment_reader(0u32).fast_fields();
let text_fast_field = fast_field_reader.str("log_level").unwrap().unwrap();
let mut buffer = String::new();
// Ordinal 0 resolves to the lower-cased term "info" ...
assert!(text_fast_field.ord_to_str(0, &mut buffer).unwrap());
assert_eq!(buffer, "info");
// ... and the lookup of ordinal 1 returns false, i.e. "info" is the only term stored
// (presumably because the "default" tokenizer lower-cases "INFO" -- confirm against the
// tokenizer's definition).
assert!(!text_fast_field.ord_to_str(1, &mut buffer).unwrap());
// Both documents (ids 0 and 1, assuming the argument is a doc id) map to ordinal 0.
assert!(text_fast_field.term_ords(0).eq([0].into_iter()));
assert!(text_fast_field.term_ords(1).eq([0].into_iter()));
assert!(text_fast_field.ords().values_for_doc(0u32).eq([0]));
assert!(text_fast_field.ords().values_for_doc(1u32).eq([0]));
}
#[test]
fn test_shadowing_fast_field_with_expand_dots() {
let mut schema_builder = Schema::builder();

View File

@@ -2,11 +2,13 @@ use std::io;
use columnar::{ColumnarWriter, NumericalValue};
use common::replace_in_place;
use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{value_type_to_column_type, Document, FieldType, Schema, Type, Value};
use crate::{DatePrecision, DocId};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DatePrecision, DocId, TantivyError};
/// Only index JSON down to a depth of 20.
/// This is mostly to guard us from a stack overflow triggered by malicious input.
@@ -15,7 +17,8 @@ const JSON_DEPTH_LIMIT: usize = 20;
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
columnar_writer: ColumnarWriter,
fast_field_names: Vec<Option<String>>, //< TODO see if we can cash the field name hash too.
fast_field_names: Vec<Option<String>>, //< TODO see if we can cache the field name hash too.
per_field_tokenizer: Vec<Option<TextAnalyzer>>,
date_precisions: Vec<DatePrecision>,
expand_dots: Vec<bool>,
num_docs: DocId,
@@ -25,14 +28,25 @@ pub struct FastFieldsWriter {
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
#[cfg(test)]
pub fn from_schema(schema: &Schema) -> crate::Result<FastFieldsWriter> {
FastFieldsWriter::from_schema_and_tokenizer_manager(&schema, TokenizerManager::new())
}
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema_and_tokenizer_manager(
schema: &Schema,
tokenizer_manager: TokenizerManager,
) -> crate::Result<FastFieldsWriter> {
let mut columnar_writer = ColumnarWriter::default();
let mut fast_field_names: Vec<Option<String>> = vec![None; schema.num_fields()];
let mut date_precisions: Vec<DatePrecision> =
std::iter::repeat_with(DatePrecision::default)
.take(schema.num_fields())
.collect();
let mut expand_dots = vec![false; schema.num_fields()];
let mut per_field_tokenizer = vec![None; schema.num_fields()];
// TODO see other types
for (field_id, field_entry) in schema.fields() {
if !field_entry.field_type().is_fast() {
@@ -47,6 +61,18 @@ impl FastFieldsWriter {
expand_dots[field_id.field_id() as usize] =
json_object_options.is_expand_dots_enabled();
}
if let FieldType::Str(text_options) = field_entry.field_type() {
if let Some(tokenizer_name) = text_options.get_fast_field_tokenizer_name() {
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"Tokenizer {:?} not found",
tokenizer_name
))
})?;
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
}
}
let sort_values_within_row = value_type == Type::Facet;
if let Some(column_type) = value_type_to_column_type(value_type) {
columnar_writer.record_column_type(
@@ -56,14 +82,15 @@ impl FastFieldsWriter {
);
}
}
FastFieldsWriter {
Ok(FastFieldsWriter {
columnar_writer,
fast_field_names,
per_field_tokenizer,
num_docs: 0u32,
date_precisions,
expand_dots,
json_path_buffer: String::new(),
}
})
}
/// The memory used (including children)
@@ -111,14 +138,35 @@ impl FastFieldsWriter {
);
}
Value::Str(text_val) => {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
if let Some(text_analyzer) =
&self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = text_analyzer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
})
} else {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
}
Value::Bytes(bytes_val) => {
self.columnar_writer
.record_bytes(doc_id, field_name.as_str(), bytes_val);
}
Value::PreTokStr(_) => todo!(),
Value::PreTokStr(pre_tok) => {
for token in &pre_tok.tokens {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
}
}
Value::Bool(bool_val) => {
self.columnar_writer
.record_bool(doc_id, field_name.as_str(), *bool_val);

View File

@@ -111,7 +111,10 @@ impl SegmentWriter {
per_field_postings_writers,
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(&schema),
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
&schema,
tokenizer_manager,
)?,
doc_opstamps: Vec::with_capacity(1_000),
per_field_text_analyzers,
term_buffer: Term::with_capacity(16),

View File

@@ -16,13 +16,53 @@ pub struct TextOptions {
#[serde(default)]
stored: bool,
#[serde(default)]
fast: bool,
fast: FastFieldOptions,
#[serde(default)]
#[serde(skip_serializing_if = "is_false")]
/// coerce values if they are not of type string
/// coerce values into string if they are not of type string
coerce: bool,
}
/// Fast field setting of a text field.
///
/// Because of `#[serde(untagged)]`, the setting is (de)serialized either as a plain
/// boolean (`"fast": true`) or as an object naming a tokenizer
/// (`"fast": { "with_tokenizer": "default" }`).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
enum FastFieldOptions {
/// The fast field is switched on (`true`) or off (`false`); no tokenizer is configured.
IsEnabled(bool),
/// The fast field is switched on and a tokenizer name is configured for it.
EnabledWithTokenizer { with_tokenizer: TokenizerName },
}
impl Default for FastFieldOptions {
fn default() -> Self {
FastFieldOptions::IsEnabled(false)
}
}
impl BitOr<FastFieldOptions> for FastFieldOptions {
    type Output = FastFieldOptions;

    /// Combines two fast field settings.
    ///
    /// A setting carrying a tokenizer wins over a plain flag; if both sides carry a
    /// tokenizer, the left-hand one is kept. Two plain flags are OR-ed together.
    fn bitor(self, other: FastFieldOptions) -> FastFieldOptions {
        match (self, other) {
            // Left-hand tokenizer takes precedence over anything on the right.
            (tokenized @ FastFieldOptions::EnabledWithTokenizer { .. }, _) => tokenized,
            (_, tokenized @ FastFieldOptions::EnabledWithTokenizer { .. }) => tokenized,
            (FastFieldOptions::IsEnabled(lhs), FastFieldOptions::IsEnabled(rhs)) => {
                FastFieldOptions::IsEnabled(lhs || rhs)
            }
        }
    }
}
/// Returns `true` when `val` is `false`.
///
/// Takes a reference because it is used as a serde `skip_serializing_if` predicate.
fn is_false(val: &bool) -> bool {
    !*val
}
@@ -40,7 +80,21 @@ impl TextOptions {
/// Returns true if and only if the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast
matches!(self.fast, FastFieldOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
}
/// Returns the name of the tokenizer configured for the fast field, if any.
///
/// Returns `None` when the fast field is disabled, or when it is enabled without a
/// tokenizer.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
    match &self.fast {
        // A plain on/off flag never carries a tokenizer.
        FastFieldOptions::IsEnabled(_) => None,
        FastFieldOptions::EnabledWithTokenizer { with_tokenizer } => {
            Some(with_tokenizer.name())
        }
    }
}
/// Returns true if values should be coerced to strings (numbers, null).
@@ -53,19 +107,24 @@ impl TextOptions {
/// Fast fields are designed for random access.
/// Access times are similar to a random lookup in an array.
/// Text fast fields will have the term ids stored in the fast field.
/// The fast field will be a multivalued fast field.
///
/// The effective cardinality depends on the tokenizer. When creating fast fields on text
/// fields it is recommended to use the "raw" tokenizer, since it will store the original text
/// unchanged. The "default" tokenizer will store the terms as lower case and this will be
/// reflected in the dictionary.
/// The effective cardinality depends on the tokenizer. Without a tokenizer, the text is
/// stored as is, which is equivalent to using the "raw" tokenizer. A tokenizer can be used
/// to apply normalization such as lowercasing.
///
/// The original text can be retrieved via
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use]
pub fn set_fast(mut self) -> TextOptions {
self.fast = true;
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldOptions::IsEnabled(true);
}
self
}
@@ -92,7 +151,7 @@ impl TextOptions {
}
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
struct TokenizerName(Cow<'static, str>);
pub(crate) struct TokenizerName(Cow<'static, str>);
const DEFAULT_TOKENIZER_NAME: &str = "default";
@@ -105,7 +164,7 @@ impl Default for TokenizerName {
}
impl TokenizerName {
const fn from_static(name: &'static str) -> Self {
pub const fn from_static(name: &'static str) -> Self {
TokenizerName(Cow::Borrowed(name))
}
fn from_name(name: &str) -> Self {
@@ -199,7 +258,7 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic,
}),
stored: false,
fast: false,
fast: FastFieldOptions::IsEnabled(false),
coerce: false,
};
@@ -212,7 +271,7 @@ pub const TEXT: TextOptions = TextOptions {
}),
stored: false,
coerce: false,
fast: false,
fast: FastFieldOptions::IsEnabled(false),
};
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -240,7 +299,7 @@ impl From<StoredFlag> for TextOptions {
TextOptions {
indexing: None,
stored: true,
fast: false,
fast: FastFieldOptions::IsEnabled(false),
coerce: false,
}
}
@@ -251,7 +310,7 @@ impl From<CoerceFlag> for TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: false,
fast: FastFieldOptions::IsEnabled(false),
coerce: true,
}
}
@@ -262,7 +321,7 @@ impl From<FastFlag> for TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: true,
fast: FastFieldOptions::IsEnabled(true),
coerce: false,
}
}
@@ -281,6 +340,7 @@ where
#[cfg(test)]
mod tests {
use crate::schema::text_options::{FastFieldOptions, TokenizerName};
use crate::schema::*;
#[test]
@@ -323,4 +383,44 @@ mod tests {
let options3: TextOptions = serde_json::from_str("{}").unwrap();
assert_eq!(options3.indexing, None);
}
#[test]
fn serde_fast_field_tokenizer() {
    // Deserializes `json`, checks the resulting `fast` setting, then checks that the
    // setting is unchanged after a serialize/deserialize round trip.
    fn check_fast_setting(json: &str, expected: FastFieldOptions) {
        let parsed: TextOptions = serde_json::from_str(json).unwrap();
        assert_eq!(parsed.fast, expected);
        let reserialized = serde_json::to_string(&parsed).unwrap();
        let reparsed: TextOptions = serde_json::from_str(&reserialized).unwrap();
        assert_eq!(reparsed.fast, expected);
    }

    // Object form: fast field with an explicit tokenizer.
    check_fast_setting(
        r#" {
"fast": { "with_tokenizer": "default" }
} "#,
        FastFieldOptions::EnabledWithTokenizer {
            with_tokenizer: TokenizerName::from_static("default"),
        },
    );

    // Boolean form: fast field enabled without a tokenizer.
    check_fast_setting(
        r#" {
"fast": true
} "#,
        FastFieldOptions::IsEnabled(true),
    );

    // Boolean form: fast field disabled.
    check_fast_setting(
        r#" {
"fast": false
} "#,
        FastFieldOptions::IsEnabled(false),
    );
}
}