Compare commits

1 Commit

Author: Pascal Seitz
SHA1: 18de6f477b
Message: add failing test check_num_columnar_fields
Date: 2023-07-13 18:19:02 +09:00

24 changed files with 359 additions and 309 deletions

View File

@@ -53,7 +53,7 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]

View File

@@ -25,7 +25,9 @@ aho-corasick = "1.0"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
@@ -105,7 +107,9 @@ default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]

View File

@@ -44,7 +44,7 @@ Details about the benchmark can be found at this [repository](https://github.com
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None)
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
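The document-store codec is chosen per index at creation time. Below is a minimal sketch of opting into one of the re-added codecs; it assumes the `brotli-compression` Cargo feature is enabled and uses the `Index::builder()` / `IndexSettings` API as it appears in this version.

use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::store::Compressor;
use tantivy::{Index, IndexSettings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();
    // Requires the `brotli-compression` feature; without it, compressing
    // a store block panics (see the Compressor::compress match further down).
    let settings = IndexSettings {
        docstore_compression: Compressor::Brotli,
        ..IndexSettings::default()
    };
    let _index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    Ok(())
}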

View File

@@ -37,7 +37,7 @@ fn main() -> tantivy::Result<()> {
.set_index_option(IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
)
.set_fast("default")
.set_fast(None)
.set_stored();
schema_builder.add_text_field("category", text_fieldtype);
schema_builder.add_f64_field("stock", FAST);

View File

@@ -1293,13 +1293,13 @@ mod tests {
// searching for terma, but min_doc_count will return all terms
let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;
assert_eq!(res["my_texts"]["buckets"][0]["key"], "a");
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
assert_eq!(
res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "b");
assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
assert_eq!(
res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],
@@ -1421,10 +1421,10 @@ mod tests {
let res = exec_request_with_query(agg_req, &index, None).unwrap();
println!("{}", serde_json::to_string_pretty(&res).unwrap());
assert_eq!(res["my_texts"]["buckets"][0]["key"], "hallo hallo");
assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "hello hello");
assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
Ok(())

View File

@@ -411,7 +411,7 @@ mod tests {
.set_index_option(IndexRecordOption::Basic)
.set_fieldnorms(false),
)
.set_fast("default")
.set_fast(None)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
@@ -466,7 +466,7 @@ mod tests {
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast("default")
.set_fast(None)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", FAST);

View File

@@ -120,8 +120,8 @@ impl IndexBuilder {
Self {
schema: None,
index_settings: IndexSettings::default(),
tokenizer_manager: TokenizerManager::default_for_indexing(),
fast_field_tokenizer_manager: TokenizerManager::default_for_fast_fields(),
tokenizer_manager: TokenizerManager::default(),
fast_field_tokenizer_manager: TokenizerManager::default(),
}
}
@@ -400,8 +400,8 @@ impl Index {
settings: metas.index_settings.clone(),
directory,
schema,
tokenizers: TokenizerManager::default_for_indexing(),
fast_field_tokenizers: TokenizerManager::default_for_fast_fields(),
tokenizers: TokenizerManager::default(),
fast_field_tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
inventory,
}

View File

@@ -410,9 +410,7 @@ mod tests {
use super::IndexMeta;
use crate::core::index_meta::UntrackedIndexMeta;
use crate::schema::{Schema, TEXT};
use crate::store::Compressor;
#[cfg(feature = "zstd-compression")]
use crate::store::ZstdCompressor;
use crate::store::{Compressor, ZstdCompressor};
use crate::{IndexSettings, IndexSortByField, Order};
#[test]
@@ -448,7 +446,6 @@ mod tests {
}
#[test]
#[cfg(feature = "zstd-compression")]
fn test_serialize_metas_zstd_compressor() {
let schema = {
let mut schema_builder = Schema::builder();
@@ -485,14 +482,13 @@ mod tests {
}
#[test]
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
fn test_serialize_metas_invalid_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
"unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \
`zstd(compression_level=5)` at line 1 column 96"
.to_string()
);
@@ -506,20 +502,6 @@ mod tests {
);
}
#[test]
#[cfg(not(feature = "zstd-compression"))]
fn test_serialize_metas_unsupported_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
line 1 column 95"
.to_string()
);
}
#[test]
#[cfg(feature = "lz4-compression")]
fn test_index_settings_default() {

View File

@@ -446,8 +446,7 @@ mod tests {
#[test]
fn test_text_fastfield() {
let mut schema_builder = Schema::builder();
let text_options: TextOptions = TextOptions::from(TEXT).set_fast("raw");
let text_field = schema_builder.add_text_field("text", text_options);
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1083,7 +1082,7 @@ mod tests {
#[test]
fn test_fast_field_in_json_field_expand_dots_disabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast("default");
let json_option = JsonObjectOptions::default().set_fast(None);
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1109,7 +1108,7 @@ mod tests {
#[test]
fn test_fast_field_in_json_field_with_tokenizer() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast("default");
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1135,7 +1134,7 @@ mod tests {
fn test_fast_field_in_json_field_expand_dots_enabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast("default")
.set_fast(None)
.set_expand_dots_enabled();
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
@@ -1203,10 +1202,10 @@ mod tests {
#[test]
fn test_fast_field_tokenizer() {
let mut schema_builder = Schema::builder();
let opt = TextOptions::default().set_fast("custom_lowercase");
let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
let text_field = schema_builder.add_text_field("text", opt);
let schema = schema_builder.build();
let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
let ff_tokenizer_manager = TokenizerManager::default();
ff_tokenizer_manager.register(
"custom_lowercase",
TextAnalyzer::builder(RawTokenizer::default())
@@ -1239,7 +1238,7 @@ mod tests {
.set_index_option(crate::schema::IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
)
.set_fast("default")
.set_fast(Some("default"))
.set_stored();
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
@@ -1272,7 +1271,7 @@ mod tests {
fn test_shadowing_fast_field_with_expand_dots() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast("default")
.set_fast(None)
.set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
@@ -1292,4 +1291,28 @@ mod tests {
let vals: Vec<i64> = column.values_for_doc(0u32).collect();
assert_eq!(&vals, &[33]);
}
#[test]
fn check_num_columnar_fields() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_text_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
id_field => 1u64,
))?;
index_writer.commit()?;
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let ff_reader = searcher.segment_reader(0).fast_fields();
let fields = ff_reader.u64_lenient_for_type_all(None, "id").unwrap();
assert_eq!(fields.len(), 1);
Ok(())
}
}

View File

@@ -349,7 +349,7 @@ mod tests {
schema_builder.add_json_field(
"json_expand_dots_enabled",
JsonObjectOptions::default()
.set_fast("default")
.set_fast(None)
.set_expand_dots_enabled(),
);
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);

View File

@@ -18,8 +18,6 @@ const JSON_DEPTH_LIMIT: usize = 20;
pub struct FastFieldsWriter {
columnar_writer: ColumnarWriter,
fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
// Field -> Fast field tokenizer mapping.
// All text fast fields should have a tokenizer.
per_field_tokenizer: Vec<Option<TextAnalyzer>>,
date_precisions: Vec<DateTimePrecision>,
expand_dots: Vec<bool>,
@@ -63,7 +61,7 @@ impl FastFieldsWriter {
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"Tokenizer `{tokenizer_name}` not found"
"Tokenizer {tokenizer_name:?} not found"
))
})?;
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
@@ -159,6 +157,9 @@ impl FastFieldsWriter {
&token.text,
);
})
} else {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
}
Value::Bytes(bytes_val) => {
@@ -200,20 +201,18 @@ impl FastFieldsWriter {
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
let text_analyzer_opt =
let text_analyzer =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
if let Some(text_analyzer) = text_analyzer_opt {
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
Value::IpAddr(ip_addr) => {
self.columnar_writer
@@ -264,7 +263,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
text_analyzer: &mut TextAnalyzer,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
@@ -289,7 +288,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit,
json_path_buffer,
columnar_writer,
text_analyzer,
tokenizer,
);
// popping our sub path.
json_path_buffer.truncate(len_path);
@@ -303,7 +302,7 @@ fn record_json_value_to_columnar_writer(
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
text_analyzer: &mut TextAnalyzer,
tokenizer: &mut Option<TextAnalyzer>,
) {
if remaining_depth_limit == 0 {
return;
@@ -322,10 +321,14 @@ fn record_json_value_to_columnar_writer(
}
}
serde_json::Value::String(text) => {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
});
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
}
}
serde_json::Value::Array(arr) => {
for el in arr {
@@ -336,7 +339,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
text_analyzer,
tokenizer,
);
}
}
@@ -348,7 +351,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
text_analyzer,
tokenizer,
);
}
}
@@ -368,9 +371,6 @@ mod tests {
) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default();
let mut json_path = String::new();
let mut text_analyzer = crate::tokenizer::TokenizerManager::default_for_fast_fields()
.get(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER)
.unwrap();
for (doc, json_doc) in json_docs.iter().enumerate() {
record_json_value_to_columnar_writer(
doc as u32,
@@ -379,7 +379,7 @@ mod tests {
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,
&mut text_analyzer,
&mut None,
);
}
let mut buffer = Vec::new();
@@ -399,7 +399,6 @@ mod tests {
});
let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
let columns = columnar_reader.list_columns().unwrap();
assert_eq!(columns.len(), 5);
{
assert_eq!(columns[0].0, "arr");
let column_arr_opt: Option<StrColumn> = columns[0].1.open().unwrap().into();
@@ -435,9 +434,7 @@ mod tests {
{
assert_eq!(columns[4].0, "text");
let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
let column_text = column_text_opt.unwrap();
let term_ords: Vec<u64> = column_text.term_ords(0).collect();
assert_eq!(&term_ords[..], &[0]);
assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
}
}

View File

@@ -191,7 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
#[cfg(all(feature = "mmap", unix))]
#[cfg(unix)]
pub use memmap2::Advice;
/// Structure version for the index.

View File

@@ -956,7 +956,7 @@ mod test {
.iter()
.flat_map(|field_name| schema.get_field(field_name))
.collect();
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::builder(SimpleTokenizer::default())
@@ -1447,7 +1447,7 @@ mod test {
let title = schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let default_fields = vec![title];
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
assert_matches!(
@@ -1622,8 +1622,7 @@ mod test {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field(r#"a\.b"#, STRING);
let schema = schema_builder.build();
let query_parser =
QueryParser::new(schema, Vec::new(), TokenizerManager::default_for_indexing());
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
assert_eq!(
format!("{query:?}"),
@@ -1640,11 +1639,8 @@ mod test {
schema_builder.add_text_field("first.toto.titi", STRING);
schema_builder.add_text_field("third.a.b.c", STRING);
let schema = schema_builder.build();
let query_parser = QueryParser::new(
schema.clone(),
Vec::new(),
TokenizerManager::default_for_indexing(),
);
let query_parser =
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
assert_eq!(
query_parser.split_full_path("first.toto"),
Some((schema.get_field("first.toto").unwrap(), ""))

View File

@@ -72,14 +72,6 @@ impl Query for TermSetQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(self.specialized_weight(enable_scoring.schema())?))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
for terms in self.terms_map.values() {
for term in terms {
visitor(term, false);
}
}
}
}
struct SetDfaWrapper(Map<Vec<u8>>);

View File

@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
use super::text_options::{FastFieldTextOptions, TokenizerName};
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
use crate::schema::{TextFieldIndexing, TextOptions, DEFAULT_FAST_FIELD_TOKENIZER};
use crate::schema::{TextFieldIndexing, TextOptions};
/// The `JsonObjectOptions` makes it possible to
/// configure how a json object field should be indexed and stored.
@@ -58,19 +58,20 @@ impl JsonObjectOptions {
/// Returns true if and only if the json object fields are
/// to be treated as fast fields.
pub fn is_fast(&self) -> bool {
match self.fast {
FastFieldTextOptions::Disabled => false,
FastFieldTextOptions::Enabled { .. } => true,
}
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
}
/// Returns the tokenizer name used for the fast field, if one is set.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
FastFieldTextOptions::Disabled => None,
FastFieldTextOptions::Enabled {
tokenizer: with_tokenizer,
} => Some(with_tokenizer.name()),
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
}
}
@@ -129,11 +130,15 @@ impl JsonObjectOptions {
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use]
pub fn set_fast(mut self, tokenizer_name: &str) -> Self {
let with_tokenizer = TokenizerName::from_name(tokenizer_name);
self.fast = FastFieldTextOptions::Enabled {
tokenizer: with_tokenizer,
};
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldTextOptions::IsEnabled(true);
}
self
}
@@ -161,9 +166,7 @@ impl From<FastFlag> for JsonObjectOptions {
JsonObjectOptions {
stored: false,
indexing: None,
fast: FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER),
},
fast: FastFieldTextOptions::IsEnabled(true),
expand_dots_enabled: false,
}
}
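In practice the new `Option<&str>` signature gives schema authors two spellings: `set_fast(None)` records values as-is, while `set_fast(Some(name))` routes text values through a tokenizer looked up on the fast-field tokenizer manager. A short sketch of both:

fn main() {
    use tantivy::schema::{JsonObjectOptions, Schema};

    let mut schema_builder = Schema::builder();
    // Fast field, values recorded without normalization.
    let plain = JsonObjectOptions::default().set_fast(None);
    // Fast field, text values passed through the registered `default`
    // tokenizer before being recorded.
    let tokenized = JsonObjectOptions::default().set_fast(Some("default"));
    schema_builder.add_json_field("plain", plain);
    schema_builder.add_json_field("attributes", tokenized);
    let _schema = schema_builder.build();
}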

View File

@@ -1,6 +1,6 @@
//! Schema definition for tantivy's indices.
//! # Setting your schema in Tantivy
//!
//! # Setting your schema in Tantivy
//!
//! Tantivy has a very strict schema.
//! The schema defines information about the fields your index contains, that is, for each field:
@@ -153,8 +153,6 @@ pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
pub use self::value::Value;
pub(crate) const DEFAULT_FAST_FIELD_TOKENIZER: &str = "default";
/// Validator for a potential `field_name`.
/// Returns true if the name can be used as a field name.
///

View File

@@ -24,68 +24,19 @@ pub struct TextOptions {
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(
into = "FastFieldTextOptionsForSerialization",
from = "FastFieldTextOptionsForSerialization"
)]
#[serde(untagged)]
/// Enum to control the fast field setting of a text field.
#[derive(Default)]
pub(crate) enum FastFieldTextOptions {
/// Fastfield disabled
#[default]
Disabled,
/// Flag to enable/disable
IsEnabled(bool),
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
/// `Index::fast_field_tokenizer`.
Enabled { tokenizer: TokenizerName },
EnabledWithTokenizer { with_tokenizer: TokenizerName },
}
/// Enum used to control the way we serialize fast field text options.
///
/// For backward compatibility reasons, we follow the format introduced in tantivy 0.19.
/// `false` -> Disabled
/// `true` -> Enabled with default tokenizer
/// `{ tokenizer: "something" }` -> Enabled with a specific tokenizer.
#[derive(Serialize, Deserialize)]
#[serde(untagged)]
enum FastFieldTextOptionsForSerialization {
IsEnabled(bool),
EnabledWithTokenizer {
#[serde(alias = "with_tokenizer")]
tokenizer: TokenizerName,
},
}
impl From<FastFieldTextOptionsForSerialization> for FastFieldTextOptions {
fn from(value: FastFieldTextOptionsForSerialization) -> Self {
match value {
FastFieldTextOptionsForSerialization::IsEnabled(enabled) => {
if enabled {
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(
crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
),
}
} else {
FastFieldTextOptions::Disabled
}
}
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer } => {
FastFieldTextOptions::Enabled { tokenizer }
}
}
}
}
impl From<FastFieldTextOptions> for FastFieldTextOptionsForSerialization {
fn from(value: FastFieldTextOptions) -> Self {
match value {
FastFieldTextOptions::Disabled => {
FastFieldTextOptionsForSerialization::IsEnabled(false)
}
FastFieldTextOptions::Enabled { tokenizer } => {
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer }
}
}
impl Default for FastFieldTextOptions {
fn default() -> Self {
FastFieldTextOptions::IsEnabled(false)
}
}
@@ -94,13 +45,23 @@ impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
match (self, other) {
(FastFieldTextOptions::Enabled { tokenizer }, _)
| (_, FastFieldTextOptions::Enabled { tokenizer }) => {
FastFieldTextOptions::Enabled { tokenizer }
}
(FastFieldTextOptions::Disabled, FastFieldTextOptions::Disabled) => {
FastFieldTextOptions::Disabled
}
(
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
_,
)
| (
_,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
) => FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
(FastFieldTextOptions::IsEnabled(true), _)
| (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
(_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
}
}
}
@@ -122,17 +83,20 @@ impl TextOptions {
/// Returns true if and only if the value is a fast field.
pub fn is_fast(&self) -> bool {
match &self.fast {
FastFieldTextOptions::Disabled => false,
FastFieldTextOptions::Enabled { .. } => true,
}
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
}
/// Returns the tokenizer name used for the fast field, if one is set.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
FastFieldTextOptions::Disabled => None,
FastFieldTextOptions::Enabled { tokenizer } => Some(tokenizer.name()),
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
}
}
@@ -157,9 +121,15 @@ impl TextOptions {
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use]
pub fn set_fast(mut self, tokenizer_name: &str) -> TextOptions {
let tokenizer = TokenizerName::from_name(tokenizer_name);
self.fast = FastFieldTextOptions::Enabled { tokenizer };
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldTextOptions::IsEnabled(true);
}
self
}
@@ -293,7 +263,7 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic,
}),
stored: false,
fast: FastFieldTextOptions::Disabled,
fast: FastFieldTextOptions::IsEnabled(false),
coerce: false,
};
@@ -306,7 +276,7 @@ pub const TEXT: TextOptions = TextOptions {
}),
stored: false,
coerce: false,
fast: FastFieldTextOptions::Disabled,
fast: FastFieldTextOptions::IsEnabled(false),
};
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -356,9 +326,7 @@ impl From<FastFlag> for TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER),
},
fast: FastFieldTextOptions::IsEnabled(true),
coerce: false,
}
}
@@ -424,21 +392,21 @@ mod tests {
#[test]
fn serde_fast_field_tokenizer() {
let json = r#" {
"fast": { "tokenizer": "default" }
"fast": { "with_tokenizer": "default" }
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static("default")
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default")
}
);
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static("default")
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default")
}
);
@@ -446,28 +414,18 @@ mod tests {
"fast": true
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
}
);
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
}
);
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
let json = r#" {
"fast": false
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
}
}
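A consequence of the `BitOr` impl above: an explicit tokenizer always wins over the plain boolean flag when options are combined. A small sketch, assuming `TextOptions`' own `BitOr` merges its `fast` field using this impl:

fn main() {
    use tantivy::schema::{TextOptions, FAST, TEXT};

    // TEXT | FAST enables the fast field without a tokenizer
    // (FastFieldTextOptions::IsEnabled(true) internally).
    let opts: TextOptions = TEXT | FAST;
    assert!(opts.is_fast());
    assert_eq!(opts.get_fast_field_tokenizer_name(), None);

    // An explicit tokenizer survives the merge with a boolean flag.
    let with_tok = TextOptions::default().set_fast(Some("lower")) | FAST;
    assert_eq!(with_tok.get_fast_field_tokenizer_name(), Some("lower"));
}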

View File

@@ -0,0 +1,19 @@
use std::io;
#[inline]
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
let params = brotli::enc::BrotliEncoderParams {
quality: 5,
..Default::default()
};
compressed.clear();
brotli::BrotliCompress(&mut uncompressed, compressed, &params)?;
Ok(())
}
#[inline]
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
brotli::BrotliDecompress(&mut compressed, decompressed)?;
Ok(())
}
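This module and the snappy one below share the same `compress`/`decompress` contract (output buffer cleared, then filled), so a round trip is the natural smoke test. A unit-test sketch that could sit beside these helpers:

#[cfg(test)]
mod tests {
    use super::{compress, decompress};

    #[test]
    fn test_brotli_round_trip() -> std::io::Result<()> {
        let payload = b"the quick brown fox jumps over the lazy dog ".repeat(64);
        let mut compressed = Vec::new();
        let mut restored = Vec::new();
        compress(&payload, &mut compressed)?;
        // Highly repetitive input should shrink substantially.
        assert!(compressed.len() < payload.len());
        decompress(&compressed, &mut restored)?;
        assert_eq!(restored, payload);
        Ok(())
    }
}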

View File

@@ -0,0 +1,17 @@
use std::io::{self, Read, Write};
#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::write::FrameEncoder::new(compressed);
encoder.write_all(uncompressed)?;
encoder.flush()?;
Ok(())
}
#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;
Ok(())
}

View File

@@ -17,10 +17,12 @@ pub enum Compressor {
/// No compression
None,
/// Use the lz4 compressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli compressor
Brotli,
/// Use the snap compressor
Snappy,
/// Use the zstd compressor
#[cfg(feature = "zstd-compression")]
Zstd(ZstdCompressor),
}
@@ -29,9 +31,9 @@ impl Serialize for Compressor {
where S: serde::Serializer {
match *self {
Compressor::None => serializer.serialize_str("none"),
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => serializer.serialize_str("lz4"),
#[cfg(feature = "zstd-compression")]
Compressor::Brotli => serializer.serialize_str("brotli"),
Compressor::Snappy => serializer.serialize_str("snappy"),
Compressor::Zstd(zstd) => serializer.serialize_str(&zstd.ser_to_string()),
}
}
@@ -43,38 +45,27 @@ impl<'de> Deserialize<'de> for Compressor {
let buf = String::deserialize(deserializer)?;
let compressor = match buf.as_str() {
"none" => Compressor::None,
#[cfg(feature = "lz4-compression")]
"lz4" => Compressor::Lz4,
#[cfg(not(feature = "lz4-compression"))]
"lz4" => {
return Err(serde::de::Error::custom(
"unsupported variant `lz4`, please enable Tantivy's `lz4-compression` feature",
))
}
#[cfg(feature = "zstd-compression")]
_ if buf.starts_with("zstd") => Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
),
#[cfg(not(feature = "zstd-compression"))]
_ if buf.starts_with("zstd") => {
return Err(serde::de::Error::custom(
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` \
feature",
))
}
"brotli" => Compressor::Brotli,
"snappy" => Compressor::Snappy,
_ => {
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
#[cfg(feature = "lz4-compression")]
"lz4",
#[cfg(feature = "zstd-compression")]
"zstd",
#[cfg(feature = "zstd-compression")]
"zstd(compression_level=5)",
],
));
if buf.starts_with("zstd") {
Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
)
} else {
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
"lz4",
"brotli",
"snappy",
"zstd",
"zstd(compression_level=5)",
],
));
}
}
};
@@ -136,15 +127,18 @@ impl ZstdCompressor {
}
impl Default for Compressor {
#[allow(unreachable_code)]
fn default() -> Self {
#[cfg(feature = "lz4-compression")]
return Compressor::Lz4;
#[cfg(feature = "zstd-compression")]
return Compressor::Zstd(ZstdCompressor::default());
Compressor::None
if cfg!(feature = "lz4-compression") {
Compressor::Lz4
} else if cfg!(feature = "brotli-compression") {
Compressor::Brotli
} else if cfg!(feature = "snappy-compression") {
Compressor::Snappy
} else if cfg!(feature = "zstd-compression") {
Compressor::Zstd(ZstdCompressor::default())
} else {
Compressor::None
}
}
}
@@ -161,14 +155,50 @@ impl Compressor {
compressed.extend_from_slice(uncompressed);
Ok(())
}
#[cfg(feature = "lz4-compression")]
Self::Lz4 => super::compression_lz4_block::compress(uncompressed, compressed),
#[cfg(feature = "zstd-compression")]
Self::Zstd(_zstd_compressor) => super::compression_zstd_block::compress(
uncompressed,
compressed,
_zstd_compressor.compression_level,
),
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::compress(uncompressed, compressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::compress(uncompressed, compressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::compress(uncompressed, compressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd(_zstd_compressor) => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::compress(
uncompressed,
compressed,
_zstd_compressor.compression_level,
)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
}
}
}
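Note the design shift here: the enum variants are no longer `#[cfg]`-gated, so index metadata naming any codec always deserializes, and a codec whose feature is disabled fails loudly at compression time instead of at parse time. A standalone sketch of the pattern (the `snappy-compression` feature name mirrors the crate's; the snippet is illustrative only):

#[allow(dead_code)]
enum Codec {
    None,
    Snappy,
}

fn compress_block(codec: &Codec, input: &[u8], out: &mut Vec<u8>) {
    out.clear();
    match codec {
        Codec::None => out.extend_from_slice(input),
        Codec::Snappy => {
            // The variant always exists; only its body is feature-gated.
            #[cfg(feature = "snappy-compression")]
            {
                use std::io::Write;
                let mut encoder = snap::write::FrameEncoder::new(out);
                encoder.write_all(input).expect("in-memory write cannot fail");
                encoder.flush().expect("in-memory flush cannot fail");
            }
            #[cfg(not(feature = "snappy-compression"))]
            {
                panic!("snappy-compression feature flag not activated");
            }
        }
    }
}

fn main() {
    let mut out = Vec::new();
    compress_block(&Codec::None, b"payload", &mut out);
    assert_eq!(out, b"payload");
}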

View File

@@ -16,10 +16,12 @@ pub enum Decompressor {
/// No compression
None,
/// Use the lz4 decompressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli decompressor
Brotli,
/// Use the snap decompressor
Snappy,
/// Use the zstd decompressor
#[cfg(feature = "zstd-compression")]
Zstd,
}
@@ -27,9 +29,9 @@ impl From<Compressor> for Decompressor {
fn from(compressor: Compressor) -> Self {
match compressor {
Compressor::None => Decompressor::None,
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => Decompressor::Lz4,
#[cfg(feature = "zstd-compression")]
Compressor::Brotli => Decompressor::Brotli,
Compressor::Snappy => Decompressor::Snappy,
Compressor::Zstd(_) => Decompressor::Zstd,
}
}
@@ -39,9 +41,9 @@ impl Decompressor {
pub(crate) fn from_id(id: u8) -> Decompressor {
match id {
0 => Decompressor::None,
#[cfg(feature = "lz4-compression")]
1 => Decompressor::Lz4,
#[cfg(feature = "zstd-compression")]
2 => Decompressor::Brotli,
3 => Decompressor::Snappy,
4 => Decompressor::Zstd,
_ => panic!("unknown compressor id {id:?}"),
}
@@ -50,9 +52,9 @@ impl Decompressor {
pub(crate) fn get_id(&self) -> u8 {
match self {
Self::None => 0,
#[cfg(feature = "lz4-compression")]
Self::Lz4 => 1,
#[cfg(feature = "zstd-compression")]
Self::Brotli => 2,
Self::Snappy => 3,
Self::Zstd => 4,
}
}
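`from_id` and `get_id` form a pair over a byte that is persisted with the store, so the two match tables must stay mutually consistent as codecs are re-added. A guard-test sketch (both functions are crate-private, so it would live in this module):

#[test]
fn test_decompressor_id_round_trip() {
    for id in [0u8, 1, 2, 3, 4] {
        assert_eq!(Decompressor::from_id(id).get_id(), id);
    }
}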
@@ -75,10 +77,46 @@ impl Decompressor {
decompressed.extend_from_slice(compressed);
Ok(())
}
#[cfg(feature = "lz4-compression")]
Self::Lz4 => super::compression_lz4_block::decompress(compressed, decompressed),
#[cfg(feature = "zstd-compression")]
Self::Zstd => super::compression_zstd_block::decompress(compressed, decompressed),
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::decompress(compressed, decompressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::decompress(compressed, decompressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
}
}
}
@@ -91,9 +129,9 @@ mod tests {
#[test]
fn compressor_decompressor_id_test() {
assert_eq!(Decompressor::from(Compressor::None), Decompressor::None);
#[cfg(feature = "lz4-compression")]
assert_eq!(Decompressor::from(Compressor::Lz4), Decompressor::Lz4);
#[cfg(feature = "zstd-compression")]
assert_eq!(Decompressor::from(Compressor::Brotli), Decompressor::Brotli);
assert_eq!(Decompressor::from(Compressor::Snappy), Decompressor::Snappy);
assert_eq!(
Decompressor::from(Compressor::Zstd(Default::default())),
Decompressor::Zstd

View File

@@ -4,8 +4,8 @@
//! order to be handled in the `Store`.
//!
//! Internally, documents (or rather their stored fields) are serialized to a buffer.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed
//! using LZ4 or Zstd and the resulting block is written to disk.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
//! `LZ4`, `snappy`, or `zstd` and the resulting block is written to disk.
//!
//! One can then request a specific `DocId`.
//! A skip list helps navigate to the right block,
@@ -48,6 +48,12 @@ pub(crate) const DOC_STORE_VERSION: u32 = 1;
#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;
#[cfg(feature = "brotli-compression")]
mod compression_brotli;
#[cfg(feature = "snappy-compression")]
mod compression_snap;
#[cfg(feature = "zstd-compression")]
mod compression_zstd_block;
@@ -194,6 +200,16 @@ pub mod tests {
fn test_store_lz4_block() -> crate::Result<()> {
test_store(Compressor::Lz4, BLOCK_SIZE, true)
}
#[cfg(feature = "snappy-compression")]
#[test]
fn test_store_snap() -> crate::Result<()> {
test_store(Compressor::Snappy, BLOCK_SIZE, true)
}
#[cfg(feature = "brotli-compression")]
#[test]
fn test_store_brotli() -> crate::Result<()> {
test_store(Compressor::Brotli, BLOCK_SIZE, true)
}
#[cfg(feature = "zstd-compression")]
#[test]
@@ -245,8 +261,8 @@ pub mod tests {
Ok(())
}
#[cfg(feature = "snappy-compression")]
#[cfg(feature = "lz4-compression")]
#[cfg(feature = "zstd-compression")]
#[test]
fn test_merge_with_changed_compressor() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
@@ -278,7 +294,7 @@ pub mod tests {
);
// Change compressor, this disables stacking on merging
let index_settings = index.settings_mut();
index_settings.docstore_compression = Compressor::Zstd(Default::default());
index_settings.docstore_compression = Compressor::Snappy;
// Merging the segments
{
let segment_ids = index
@@ -300,7 +316,7 @@ pub mod tests {
LOREM.to_string()
);
}
assert_eq!(store.decompressor(), Decompressor::Zstd);
assert_eq!(store.decompressor(), Decompressor::Snappy);
Ok(())
}
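Putting the module docs above into user terms: both the codec and the block size are plain index settings. A minimal end-to-end sketch (assuming this version's `Index::builder()` and `Searcher::doc` APIs):

use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::store::Compressor;
use tantivy::{doc, DocAddress, Index, IndexSettings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();
    let settings = IndexSettings {
        docstore_compression: Compressor::Snappy,
        docstore_blocksize: 16_384, // flush a compressed block every ~16 KiB
        ..IndexSettings::default()
    };
    let index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    let mut writer = index.writer(20_000_000)?;
    writer.add_document(doc!(body => "hello block store"))?;
    writer.commit()?;
    // Retrieval walks the skip list to the right block, decompresses
    // only that block, and deserializes the document from it.
    let searcher = index.reader()?.searcher();
    let _doc = searcher.doc(DocAddress::new(0, 0))?;
    Ok(())
}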

View File

@@ -189,7 +189,7 @@ pub mod tests {
#[test]
fn test_raw_tokenizer2() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
let mut tokens: Vec<Token> = vec![];
{
@@ -206,7 +206,7 @@ pub mod tests {
#[test]
fn test_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
@@ -228,7 +228,7 @@ pub mod tests {
#[test]
fn test_non_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::builder(SimpleTokenizer::default())
@@ -256,7 +256,7 @@ pub mod tests {
#[test]
fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
{
let mut tokens: Vec<Token> = vec![];
@@ -282,7 +282,7 @@ pub mod tests {
#[test]
fn test_whitespace_tokenizer() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
let mut tokens: Vec<Token> = vec![];
{

View File

@@ -27,7 +27,6 @@ pub struct TokenizerManager {
impl TokenizerManager {
/// Creates an empty tokenizer manager.
#[allow(clippy::new_without_default)]
pub fn new() -> Self {
Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
@@ -52,10 +51,12 @@ impl TokenizerManager {
.get(tokenizer_name)
.cloned()
}
}
impl Default for TokenizerManager {
/// Creates a `TokenizerManager` prepopulated with
/// the default pre-configured tokenizers of `tantivy`.
pub fn default_for_indexing() -> TokenizerManager {
fn default() -> TokenizerManager {
let manager = TokenizerManager::new();
manager.register("raw", RawTokenizer::default());
manager.register(
@@ -76,28 +77,4 @@ impl TokenizerManager {
manager.register("whitespace", WhitespaceTokenizer::default());
manager
}
/// Creates a `TokenizerManager` prepopulated with
/// the default pre-configured tokenizers of `tantivy`
/// for fast fields.
///
/// Fast fields usually do not really tokenize the text.
/// It is however very useful to filter / normalize the text.
pub fn default_for_fast_fields() -> TokenizerManager {
let manager = TokenizerManager::new();
let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.build();
let lower_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.filter(LowerCaser)
.build();
manager.register(
crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
lower_tokenizer.clone(),
);
manager.register("raw", raw_tokenizer);
manager.register("lower", lower_tokenizer);
manager
}
}
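With `default_for_fast_fields()` gone, any setup that leaned on its implicit lowercasing must now register an equivalent analyzer on the single default manager. A sketch reconstructing the removed `lower` tokenizer from the code above:

fn main() {
    use tantivy::tokenizer::{
        LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, TokenizerManager,
    };

    let manager = TokenizerManager::default();
    // Same analyzer the removed default_for_fast_fields() used to register.
    let lower = TextAnalyzer::builder(RawTokenizer::default())
        .filter(RemoveLongFilter::limit(255))
        .filter(LowerCaser)
        .build();
    manager.register("lower", lower);
    assert!(manager.get("lower").is_some());
}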