mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 08:12:54 +00:00

Compare commits: dynamic-to ... default_fa (19 commits)

Commits (SHA1):
61422d7cd5, 820f126075, 7e6c4a1856, 5fafe4b1ab, 1e7cd48cfa, 7f51d85bbd,
ad76e32398, 7575f9bf1c, 67bdf3f5f6, 3c300666ad, b91d3f6be4, a8e76513bb,
0a23201338, 81330aaf89, 98a3b01992, d341520938, 5c9af73e41, ad4c940fa3,
910b0b0c61
.github/workflows/test.yml (vendored, 2 changes)

@@ -53,7 +53,7 @@ jobs:
     strategy:
       matrix:
         features: [
-          { label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
+          { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
           { label: "quickwit", flags: "mmap,quickwit,failpoints" }
         ]
CHANGELOG.md (13 changes)

@@ -1,5 +1,14 @@
-Tantivy 0.20 [Unreleased]
+Tantivy 0.20.2
 ================================
+- Align numerical type priority order on the search side. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088) (@fmassot)
+- Fix is_child_of function not considering the root facet. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086) (@adamreichhold)
+
+Tantivy 0.20.1
+================================
+- Fix building on windows with mmap [#2070](https://github.com/quickwit-oss/tantivy/issues/2070) (@ChillFish8)
+
+Tantivy 0.20
+================================
 #### Bugfixes
 - Fix phrase queries with slop (slop now supports transpositions; the algorithm carries the slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031) [#2020](https://github.com/quickwit-oss/tantivy/issues/2020) (@PSeitz)

@@ -38,12 +47,14 @@ Tantivy 0.20 [Unreleased]
- Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
- Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
  - Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
  - Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058) [#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
  - tokenizer-api: reduce Tokenizer allocation overhead [#2062](https://github.com/quickwit-oss/tantivy/issues/2062) (@PSeitz)
  - Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
  - Work in batches of docs on the SegmentCollector (only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
@@ -25,9 +25,7 @@ aho-corasick = "1.0"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.7.1", optional = true }
 lz4_flex = { version = "0.11", default-features = false, optional = true }
-brotli = { version = "3.3.4", optional = true }
 zstd = { version = "0.12", optional = true, default-features = false }
-snap = { version = "1.0.5", optional = true }
 tempfile = { version = "3.3.0", optional = true }
 log = "0.4.16"
 serde = { version = "1.0.136", features = ["derive"] }

@@ -49,9 +47,9 @@ murmurhash32 = "0.3.0"
 time = { version = "0.3.10", features = ["serde-well-known"] }
 smallvec = "1.8.0"
 rayon = "1.5.2"
-lru = "0.10.0"
+lru = "0.11.0"
 fastdivide = "0.4.0"
-itertools = "0.10.3"
+itertools = "0.11.0"
 measure_time = "0.8.2"
 async-trait = "0.1.53"
 arc-swap = "1.5.0"

@@ -107,9 +105,7 @@ default = ["mmap", "stopwords", "lz4-compression"]
 mmap = ["fs4", "tempfile", "memmap2"]
 stopwords = []

-brotli-compression = ["brotli"]
 lz4-compression = ["lz4_flex"]
-snappy-compression = ["snap"]
 zstd-compression = ["zstd"]

 failpoints = ["fail", "fail/failpoints"]
@@ -44,7 +44,7 @@ Details about the benchmark can be found at this [repository](https://github.com
 - Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
 - `&[u8]` fast fields
 - Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
-- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
+- Compressed document store (LZ4, Zstd, None)
 - Range queries
 - Faceted search
 - Configurable indexing (optional term frequency and position indexing)
@@ -1,5 +1,7 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::TokenizerManager;
+use tantivy::tokenizer::{
+    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
+};

 const ALICE_TXT: &str = include_str!("alice.txt");

@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
             assert_eq!(word_count, 30_731);
         })
     });
+    let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+        .dynamic()
+        .filter_dynamic(RemoveLongFilter::limit(40))
+        .filter_dynamic(LowerCaser)
+        .build();
+    c.bench_function("dynamic-tokenize-alice", |b| {
+        b.iter(|| {
+            let mut word_count = 0;
+            let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
+            while token_stream.advance() {
+                word_count += 1;
+            }
+            assert_eq!(word_count, 30_731);
+        })
+    });
 }

-criterion_group!(benches, criterion_benchmark);
+criterion_group! {
+    name = benches;
+    config = Criterion::default().sample_size(200);
+    targets = criterion_benchmark
+}
 criterion_main!(benches);
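The added benchmark exercises the `dynamic` analyzer builder path next to the existing statically built one. A minimal sketch contrasting the two forms: the `.dynamic()` / `.filter_dynamic(...)` calls are taken verbatim from the benchmark above, while the plain `.filter(...)` chain is assumed from tantivy 0.20's tokenizer builder API.

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

fn build_analyzers() -> (TextAnalyzer, TextAnalyzer) {
    // Statically typed pipeline: each `.filter(...)` call nests the filter
    // type into the builder at compile time.
    let static_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .build();

    // Dynamic pipeline (as in the new benchmark): filters are added behind
    // dynamic dispatch, so the builder type stays the same however many
    // filters are chained.
    let dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .dynamic()
        .filter_dynamic(RemoveLongFilter::limit(40))
        .filter_dynamic(LowerCaser)
        .build();

    (static_analyzer, dynamic_analyzer)
}
```

Token iteration is unchanged in either case: call `token_stream(...)` and loop on `advance()`, exactly as the benchmark does.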
|
||||
|
||||
@@ -9,7 +9,7 @@ description = "column oriented storage for tantivy"
 categories = ["database-implementations", "data-structures", "compression"]

 [dependencies]
-itertools = "0.10.5"
+itertools = "0.11.0"
 fnv = "1.0.7"
 fastdivide = "0.4.0"
@@ -168,8 +168,9 @@ mod tests {
         )
         .into();
         let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
-        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
-        else { panic!("Excpected a multivalued index") };
+        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
+            panic!("Excpected a multivalued index")
+        };
         let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
         assert_eq!(&start_indexes, &[0, 3, 5]);
     }

@@ -200,8 +201,9 @@ mod tests {
         )
         .into();
         let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
-        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
-        else { panic!("Excpected a multivalued index") };
+        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
+            panic!("Excpected a multivalued index")
+        };
         let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
         assert_eq!(&start_indexes, &[0, 3, 5, 6]);
     }
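Much of the churn in this and the following hunks is purely mechanical: rustfmt now reflows `let ... else` bindings (stabilized in Rust 1.65) so that the `else` block gets its own lines. For readers unfamiliar with the construct, a small self-contained illustration, unrelated to tantivy's actual code:

```rust
fn first_even(values: &[i64]) -> i64 {
    // `let ... else` either binds the pattern or diverges in the `else`
    // block (return, panic!, continue, ...); there is no fall-through.
    let Some(first) = values.iter().copied().find(|v| v % 2 == 0) else {
        return -1;
    };
    first * 10
}

fn main() {
    assert_eq!(first_even(&[3, 4, 5]), 40);
    assert_eq!(first_even(&[1, 3]), -1);
}
```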
|
||||
|
||||
@@ -157,7 +157,13 @@ mod tests {
|
||||
Cardinality::Optional,
|
||||
&shuffle_merge_order,
|
||||
);
|
||||
let SerializableColumnIndex::Optional { non_null_row_ids, num_rows } = serializable_index else { panic!() };
|
||||
let SerializableColumnIndex::Optional {
|
||||
non_null_row_ids,
|
||||
num_rows,
|
||||
} = serializable_index
|
||||
else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(num_rows, 2);
|
||||
let non_null_rows: Vec<RowId> = non_null_row_ids.boxed_iter().collect();
|
||||
assert_eq!(&non_null_rows, &[1]);
|
||||
|
||||
@@ -83,7 +83,8 @@ impl ColumnValues for BitpackedReader {
|
||||
doc_id_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
let Some(transformed_range) = transform_range_before_linear_transformation(&self.stats, range)
|
||||
let Some(transformed_range) =
|
||||
transform_range_before_linear_transformation(&self.stats, range)
|
||||
else {
|
||||
positions.clear();
|
||||
return;
|
||||
|
||||
@@ -244,7 +244,9 @@ fn test_merge_columnar_numbers() {
|
||||
assert_eq!(columnar_reader.num_columns(), 1);
|
||||
let cols = columnar_reader.read_columns("numbers").unwrap();
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
let DynamicColumn::F64(vals) = dynamic_column else { panic!() };
|
||||
let DynamicColumn::F64(vals) = dynamic_column else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
|
||||
assert_eq!(vals.first(0u32), Some(-1f64));
|
||||
assert_eq!(vals.first(1u32), None);
|
||||
@@ -270,7 +272,9 @@ fn test_merge_columnar_texts() {
|
||||
assert_eq!(columnar_reader.num_columns(), 1);
|
||||
let cols = columnar_reader.read_columns("texts").unwrap();
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
|
||||
let DynamicColumn::Str(vals) = dynamic_column else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
|
||||
|
||||
let get_str_for_ord = |ord| {
|
||||
@@ -317,7 +321,9 @@ fn test_merge_columnar_byte() {
|
||||
assert_eq!(columnar_reader.num_columns(), 1);
|
||||
let cols = columnar_reader.read_columns("bytes").unwrap();
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
|
||||
let DynamicColumn::Bytes(vals) = dynamic_column else {
|
||||
panic!()
|
||||
};
|
||||
let get_bytes_for_ord = |ord| {
|
||||
let mut out = Vec::new();
|
||||
vals.ord_to_bytes(ord, &mut out).unwrap();
|
||||
@@ -371,7 +377,9 @@ fn test_merge_columnar_byte_with_missing() {
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let cols = columnar_reader.read_columns("col").unwrap();
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
|
||||
let DynamicColumn::Bytes(vals) = dynamic_column else {
|
||||
panic!()
|
||||
};
|
||||
let get_bytes_for_ord = |ord| {
|
||||
let mut out = Vec::new();
|
||||
vals.ord_to_bytes(ord, &mut out).unwrap();
|
||||
@@ -423,7 +431,9 @@ fn test_merge_columnar_different_types() {
|
||||
|
||||
// numeric column
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
let DynamicColumn::I64(vals) = dynamic_column else { panic!() };
|
||||
let DynamicColumn::I64(vals) = dynamic_column else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
|
||||
assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
|
||||
assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);
|
||||
@@ -433,7 +443,9 @@ fn test_merge_columnar_different_types() {
|
||||
|
||||
// text column
|
||||
let dynamic_column = cols[1].open().unwrap();
|
||||
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
|
||||
let DynamicColumn::Str(vals) = dynamic_column else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
|
||||
let get_str_for_ord = |ord| {
|
||||
let mut out = String::new();
|
||||
|
||||
@@ -98,9 +98,11 @@ impl ColumnarWriter {
|
||||
///
|
||||
/// The sort applied is stable.
|
||||
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
|
||||
let Some(numerical_col_writer) =
|
||||
self.numerical_field_hash_map.get::<NumericalColumnWriter>(sort_field.as_bytes()) else {
|
||||
return Vec::new();
|
||||
let Some(numerical_col_writer) = self
|
||||
.numerical_field_hash_map
|
||||
.get::<NumericalColumnWriter>(sort_field.as_bytes())
|
||||
else {
|
||||
return Vec::new();
|
||||
};
|
||||
let mut symbols_buffer = Vec::new();
|
||||
let mut values = Vec::new();
|
||||
|
||||
@@ -57,7 +57,9 @@ fn test_dataframe_writer_bool() {
|
||||
assert_eq!(cols[0].num_bytes(), 22);
|
||||
assert_eq!(cols[0].column_type(), ColumnType::Bool);
|
||||
let dyn_bool_col = cols[0].open().unwrap();
|
||||
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
|
||||
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
|
||||
panic!();
|
||||
};
|
||||
let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
|
||||
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
|
||||
}
|
||||
@@ -79,7 +81,9 @@ fn test_dataframe_writer_u64_multivalued() {
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].num_bytes(), 29);
|
||||
let dyn_i64_col = cols[0].open().unwrap();
|
||||
let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
|
||||
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
|
||||
panic!();
|
||||
};
|
||||
assert_eq!(
|
||||
divisor_col.get_cardinality(),
|
||||
crate::Cardinality::Multivalued
|
||||
@@ -101,7 +105,9 @@ fn test_dataframe_writer_ip_addr() {
|
||||
assert_eq!(cols[0].num_bytes(), 42);
|
||||
assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
|
||||
let dyn_bool_col = cols[0].open().unwrap();
|
||||
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
|
||||
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
|
||||
panic!();
|
||||
};
|
||||
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
|
||||
assert_eq!(
|
||||
&vals,
|
||||
@@ -134,7 +140,9 @@ fn test_dataframe_writer_numerical() {
|
||||
// - null footer 6 bytes
|
||||
assert_eq!(cols[0].num_bytes(), 33);
|
||||
let column = cols[0].open().unwrap();
|
||||
let DynamicColumn::I64(column_i64) = column else { panic!(); };
|
||||
let DynamicColumn::I64(column_i64) = column else {
|
||||
panic!();
|
||||
};
|
||||
assert_eq!(column_i64.index.get_cardinality(), Cardinality::Optional);
|
||||
assert_eq!(column_i64.first(0), None);
|
||||
assert_eq!(column_i64.first(1), Some(12i64));
|
||||
@@ -198,7 +206,9 @@ fn test_dictionary_encoded_str() {
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||
assert_eq!(col_handles.len(), 1);
|
||||
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
|
||||
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
|
||||
panic!();
|
||||
};
|
||||
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
|
||||
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
|
||||
assert_eq!(str_col.num_rows(), 5);
|
||||
@@ -230,7 +240,9 @@ fn test_dictionary_encoded_bytes() {
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||
assert_eq!(col_handles.len(), 1);
|
||||
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
|
||||
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else {
|
||||
panic!();
|
||||
};
|
||||
let index: Vec<Option<u64>> = (0..5)
|
||||
.map(|row_id| bytes_col.ords().first(row_id))
|
||||
.collect();
|
||||
@@ -533,28 +545,36 @@ trait AssertEqualToColumnValue {
|
||||
|
||||
impl AssertEqualToColumnValue for bool {
|
||||
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
|
||||
let ColumnValue::Bool(val) = column_value else { panic!() };
|
||||
let ColumnValue::Bool(val) = column_value else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(self, val);
|
||||
}
|
||||
}
|
||||
|
||||
impl AssertEqualToColumnValue for Ipv6Addr {
|
||||
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
|
||||
let ColumnValue::IpAddr(val) = column_value else { panic!() };
|
||||
let ColumnValue::IpAddr(val) = column_value else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(self, val);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Coerce + PartialEq + Debug + Into<NumericalValue>> AssertEqualToColumnValue for T {
|
||||
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
|
||||
let ColumnValue::Numerical(num) = column_value else { panic!() };
|
||||
let ColumnValue::Numerical(num) = column_value else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(self, &T::coerce(*num));
|
||||
}
|
||||
}
|
||||
|
||||
impl AssertEqualToColumnValue for DateTime {
|
||||
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
|
||||
let ColumnValue::DateTime(dt) = column_value else { panic!() };
|
||||
let ColumnValue::DateTime(dt) = column_value else {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(self, dt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@ fn main() -> tantivy::Result<()> {
                 .set_index_option(IndexRecordOption::WithFreqs)
                 .set_tokenizer("raw"),
         )
-        .set_fast(None)
+        .set_fast("default")
         .set_stored();
     schema_builder.add_text_field("category", text_fieldtype);
     schema_builder.add_f64_field("stock", FAST);
@@ -1293,13 +1293,13 @@ mod tests {
         // searching for terma, but min_doc_count will return all terms
         let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;

-        assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "a");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
         assert_eq!(
             res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
             json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
         );
-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "b");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
         assert_eq!(
             res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],

@@ -1421,10 +1421,10 @@ mod tests {
         let res = exec_request_with_query(agg_req, &index, None).unwrap();
         println!("{}", serde_json::to_string_pretty(&res).unwrap());

-        assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "hallo hallo");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);

-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "hello hello");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);

         Ok(())
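The expected bucket keys change from "A"/"Hallo Hallo" to "a"/"hallo hallo" because text fast fields are now run through a fast-field tokenizer, and the default one lowercases terms, so term-aggregation keys come back normalized. A hedged sketch of how a schema author could keep the original casing by choosing the "raw" fast-field tokenizer instead; field names are illustrative, and this assumes "raw" is registered on the fast-field tokenizer manager, as the tests in this changeset do:

```rust
use tantivy::schema::{Schema, TextOptions, TEXT};

fn build_schema() -> Schema {
    let mut schema_builder = Schema::builder();

    // Fast field values go through the "default" fast-field tokenizer, so
    // term-aggregation keys come back lowercased ("hallo hallo").
    let normalized = TextOptions::from(TEXT).set_fast("default");
    schema_builder.add_text_field("category", normalized);

    // The "raw" tokenizer keeps each value as a single unchanged token, so
    // aggregation keys preserve their casing.
    let verbatim = TextOptions::from(TEXT).set_fast("raw");
    schema_builder.add_text_field("label", verbatim);

    schema_builder.build()
}
```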
|
||||
|
||||
@@ -411,7 +411,7 @@ mod tests {
|
||||
.set_index_option(IndexRecordOption::Basic)
|
||||
.set_fieldnorms(false),
|
||||
)
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
|
||||
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
|
||||
@@ -466,7 +466,7 @@ mod tests {
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let date_field = schema_builder.add_date_field("date", FAST);
|
||||
|
||||
@@ -14,7 +14,7 @@ use crate::collector::{
|
||||
};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
||||
use crate::query::Weight;
|
||||
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
use crate::{DocAddress, DocId, Order, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
|
||||
struct FastFieldConvertCollector<
|
||||
TCollector: Collector<Fruit = Vec<(u64, DocAddress)>>,
|
||||
@@ -23,6 +23,7 @@ struct FastFieldConvertCollector<
|
||||
pub collector: TCollector,
|
||||
pub field: String,
|
||||
pub fast_value: std::marker::PhantomData<TFastValue>,
|
||||
order: Order,
|
||||
}
|
||||
|
||||
impl<TCollector, TFastValue> Collector for FastFieldConvertCollector<TCollector, TFastValue>
|
||||
@@ -70,7 +71,13 @@ where
|
||||
let raw_result = self.collector.merge_fruits(segment_fruits)?;
|
||||
let transformed_result = raw_result
|
||||
.into_iter()
|
||||
.map(|(score, doc_address)| (TFastValue::from_u64(score), doc_address))
|
||||
.map(|(score, doc_address)| {
|
||||
if self.order.is_desc() {
|
||||
(TFastValue::from_u64(score), doc_address)
|
||||
} else {
|
||||
(TFastValue::from_u64(u64::MAX - score), doc_address)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
Ok(transformed_result)
|
||||
}
|
||||
@@ -131,16 +138,23 @@ impl fmt::Debug for TopDocs {
|
||||
|
||||
struct ScorerByFastFieldReader {
|
||||
sort_column: Arc<dyn ColumnValues<u64>>,
|
||||
order: Order,
|
||||
}
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
fn score(&mut self, doc: DocId) -> u64 {
|
||||
self.sort_column.get_val(doc)
|
||||
let value = self.sort_column.get_val(doc);
|
||||
if self.order.is_desc() {
|
||||
value
|
||||
} else {
|
||||
u64::MAX - value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ScorerByField {
|
||||
field: String,
|
||||
order: Order,
|
||||
}
|
||||
|
||||
impl CustomScorer<u64> for ScorerByField {
|
||||
@@ -157,8 +171,13 @@ impl CustomScorer<u64> for ScorerByField {
|
||||
sort_column_opt.ok_or_else(|| FastFieldNotAvailableError {
|
||||
field_name: self.field.clone(),
|
||||
})?;
|
||||
let mut default_value = 0u64;
|
||||
if self.order.is_asc() {
|
||||
default_value = u64::MAX;
|
||||
}
|
||||
Ok(ScorerByFastFieldReader {
|
||||
sort_column: sort_column.first_or_default_col(0u64),
|
||||
sort_column: sort_column.first_or_default_col(default_value),
|
||||
order: self.order.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -230,7 +249,7 @@ impl TopDocs {
|
||||
///
|
||||
/// ```rust
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{doc, Index, DocAddress};
|
||||
/// # use tantivy::{doc, Index, DocAddress, Order};
|
||||
/// # use tantivy::query::{Query, QueryParser};
|
||||
/// use tantivy::Searcher;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
@@ -268,7 +287,7 @@ impl TopDocs {
|
||||
/// // Note the `rating_field` needs to be a FAST field here.
|
||||
/// let top_books_by_rating = TopDocs
|
||||
/// ::with_limit(10)
|
||||
/// .order_by_u64_field("rating");
|
||||
/// .order_by_fast_field("rating", Order::Desc);
|
||||
///
|
||||
/// // ... and here are our documents. Note this is a simple vec.
|
||||
/// // The `u64` in the pair is the value of our fast field for
|
||||
@@ -288,13 +307,15 @@ impl TopDocs {
|
||||
///
|
||||
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
|
||||
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
|
||||
pub fn order_by_u64_field(
|
||||
fn order_by_u64_field(
|
||||
self,
|
||||
field: impl ToString,
|
||||
order: Order,
|
||||
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
|
||||
CustomScoreTopCollector::new(
|
||||
ScorerByField {
|
||||
field: field.to_string(),
|
||||
order,
|
||||
},
|
||||
self.0.into_tscore(),
|
||||
)
|
||||
@@ -316,7 +337,7 @@ impl TopDocs {
|
||||
///
|
||||
/// ```rust
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{doc, Index, DocAddress};
|
||||
/// # use tantivy::{doc, Index, DocAddress,Order};
|
||||
/// # use tantivy::query::{Query, AllQuery};
|
||||
/// use tantivy::Searcher;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
@@ -354,7 +375,7 @@ impl TopDocs {
|
||||
/// // type `sort_by_field`. revenue_field here is a FAST i64 field.
|
||||
/// let top_company_by_revenue = TopDocs
|
||||
/// ::with_limit(2)
|
||||
/// .order_by_fast_field("revenue");
|
||||
/// .order_by_fast_field("revenue", Order::Desc);
|
||||
///
|
||||
/// // ... and here are our documents. Note this is a simple vec.
|
||||
/// // The `i64` in the pair is the value of our fast field for
|
||||
@@ -372,15 +393,17 @@ impl TopDocs {
|
||||
pub fn order_by_fast_field<TFastValue>(
|
||||
self,
|
||||
fast_field: impl ToString,
|
||||
order: Order,
|
||||
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>>
|
||||
where
|
||||
TFastValue: FastValue,
|
||||
{
|
||||
let u64_collector = self.order_by_u64_field(fast_field.to_string());
|
||||
let u64_collector = self.order_by_u64_field(fast_field.to_string(), order.clone());
|
||||
FastFieldConvertCollector {
|
||||
collector: u64_collector,
|
||||
field: fast_field.to_string(),
|
||||
fast_value: PhantomData,
|
||||
order,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -721,7 +744,7 @@ mod tests {
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
|
||||
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
|
||||
|
||||
fn make_index() -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -882,7 +905,7 @@ mod tests {
|
||||
});
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE, Order::Desc);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -921,7 +944,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday", Order::Desc);
|
||||
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -951,7 +974,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude", Order::Desc);
|
||||
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -981,7 +1004,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude", Order::Desc);
|
||||
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -1009,7 +1032,7 @@ mod tests {
|
||||
.unwrap();
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field");
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field", Order::Desc);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
top_collector
|
||||
.for_segment(0, segment_reader)
|
||||
@@ -1027,7 +1050,7 @@ mod tests {
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE, Order::Desc);
|
||||
let err = top_collector.for_segment(0, segment).err().unwrap();
|
||||
assert!(matches!(err, crate::TantivyError::InvalidArgument(_)));
|
||||
Ok(())
|
||||
@@ -1044,7 +1067,7 @@ mod tests {
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE, Order::Desc);
|
||||
let err = top_collector.for_segment(0, segment).err().unwrap();
|
||||
assert!(
|
||||
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.")
|
||||
@@ -1106,4 +1129,50 @@ mod tests {
|
||||
let query = query_parser.parse_query(query).unwrap();
|
||||
(index, query)
|
||||
}
|
||||
#[test]
|
||||
fn test_fast_field_ascending_order() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, query) = index("beer", title, schema, |index_writer| {
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "growler of beer",
|
||||
size => 64u64,
|
||||
))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "pint of beer",
|
||||
size => 16u64,
|
||||
))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "empty beer",
|
||||
))
|
||||
.unwrap();
|
||||
});
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_fast_field(SIZE, Order::Asc);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[
|
||||
(12, DocAddress::new(0, 0)),
|
||||
(16, DocAddress::new(0, 2)),
|
||||
(64, DocAddress::new(0, 1)),
|
||||
(18446744073709551615, DocAddress::new(0, 3)),
|
||||
]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
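Summing up the collector change above: `order_by_u64_field` loses its `pub` and becomes an internal helper, while the public `order_by_fast_field` now takes an explicit `Order`. Ascending order is implemented by scoring documents with `u64::MAX - value` and mapping the score back on merge, which is why the document with no value surfaces with key 18446744073709551615 in the new test. A usage sketch closely mirroring that test; schema, sizes, and the writer memory budget are illustrative:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, DocAddress, Index, Order};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let size = schema_builder.add_u64_field("size", FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "bottle of beer", size => 12u64))?;
    writer.add_document(doc!(title => "growler of beer", size => 64u64))?;
    writer.add_document(doc!(title => "pint of beer", size => 16u64))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // Smallest `size` first; Order::Desc would return the largest first.
    let collector = TopDocs::with_limit(3).order_by_fast_field::<u64>("size", Order::Asc);
    let top: Vec<(u64, DocAddress)> = searcher.search(&AllQuery, &collector)?;
    assert_eq!(top.first().map(|(size, _)| *size), Some(12));
    Ok(())
}
```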
|
||||
|
||||
@@ -120,8 +120,8 @@ impl IndexBuilder {
         Self {
             schema: None,
             index_settings: IndexSettings::default(),
-            tokenizer_manager: TokenizerManager::default(),
-            fast_field_tokenizer_manager: TokenizerManager::default(),
+            tokenizer_manager: TokenizerManager::default_for_indexing(),
+            fast_field_tokenizer_manager: TokenizerManager::default_for_fast_fields(),
         }
     }

@@ -400,8 +400,8 @@ impl Index {
             settings: metas.index_settings.clone(),
             directory,
             schema,
-            tokenizers: TokenizerManager::default(),
-            fast_field_tokenizers: TokenizerManager::default(),
+            tokenizers: TokenizerManager::default_for_indexing(),
+            fast_field_tokenizers: TokenizerManager::default_for_fast_fields(),
             executor: Arc::new(Executor::single_thread()),
             inventory,
         }
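The single `TokenizerManager::default()` is now split: `default_for_indexing()` carries the familiar indexing tokenizers, while `default_for_fast_fields()` holds the tokenizers applied to text fast fields. A short sketch of registering a custom fast-field tokenizer on such a manager, modeled on the `test_fast_field_tokenizer` change further down; how the manager is then attached to an `Index` is left out here:

```rust
use tantivy::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};

fn custom_fast_field_tokenizers() -> TokenizerManager {
    let manager = TokenizerManager::default_for_fast_fields();
    // Keep each value as one token, but lowercase it.
    manager.register(
        "custom_lowercase",
        TextAnalyzer::builder(RawTokenizer::default())
            .filter(LowerCaser)
            .build(),
    );
    manager
}
```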
|
||||
|
||||
@@ -410,7 +410,9 @@ mod tests {
|
||||
use super::IndexMeta;
|
||||
use crate::core::index_meta::UntrackedIndexMeta;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::store::{Compressor, ZstdCompressor};
|
||||
use crate::store::Compressor;
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
use crate::store::ZstdCompressor;
|
||||
use crate::{IndexSettings, IndexSortByField, Order};
|
||||
|
||||
#[test]
|
||||
@@ -446,6 +448,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
fn test_serialize_metas_zstd_compressor() {
|
||||
let schema = {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -482,13 +485,14 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
|
||||
fn test_serialize_metas_invalid_comp() {
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \
|
||||
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
|
||||
`zstd(compression_level=5)` at line 1 column 96"
|
||||
.to_string()
|
||||
);
|
||||
@@ -502,6 +506,20 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
fn test_serialize_metas_unsupported_comp() {
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
|
||||
line 1 column 95"
|
||||
.to_string()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
fn test_index_settings_default() {
|
||||
|
||||
@@ -446,7 +446,8 @@ mod tests {
|
||||
#[test]
|
||||
fn test_text_fastfield() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
|
||||
let text_options: TextOptions = TextOptions::from(TEXT).set_fast("raw");
|
||||
let text_field = schema_builder.add_text_field("text", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -1082,7 +1083,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_fast_field_in_json_field_expand_dots_disabled() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default().set_fast(None);
|
||||
let json_option = JsonObjectOptions::default().set_fast("default");
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -1108,7 +1109,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_fast_field_in_json_field_with_tokenizer() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
|
||||
let json_option = JsonObjectOptions::default().set_fast("default");
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -1134,7 +1135,7 @@ mod tests {
|
||||
fn test_fast_field_in_json_field_expand_dots_enabled() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default()
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_expand_dots_enabled();
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
@@ -1202,10 +1203,10 @@ mod tests {
|
||||
#[test]
|
||||
fn test_fast_field_tokenizer() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
|
||||
let opt = TextOptions::default().set_fast("custom_lowercase");
|
||||
let text_field = schema_builder.add_text_field("text", opt);
|
||||
let schema = schema_builder.build();
|
||||
let ff_tokenizer_manager = TokenizerManager::default();
|
||||
let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
|
||||
ff_tokenizer_manager.register(
|
||||
"custom_lowercase",
|
||||
TextAnalyzer::builder(RawTokenizer::default())
|
||||
@@ -1238,7 +1239,7 @@ mod tests {
|
||||
.set_index_option(crate::schema::IndexRecordOption::WithFreqs)
|
||||
.set_tokenizer("raw"),
|
||||
)
|
||||
.set_fast(Some("default"))
|
||||
.set_fast("default")
|
||||
.set_stored();
|
||||
|
||||
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
|
||||
@@ -1271,7 +1272,7 @@ mod tests {
|
||||
fn test_shadowing_fast_field_with_expand_dots() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default()
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_expand_dots_enabled();
|
||||
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
|
||||
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
|
||||
|
||||
@@ -88,7 +88,7 @@ impl FastFieldReaders {
|
||||
let Some((field, path)): Option<(Field, &str)> = self
|
||||
.schema
|
||||
.find_field_with_default(field_name, default_field_opt)
|
||||
else{
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
@@ -120,7 +120,8 @@ impl FastFieldReaders {
|
||||
T: HasAssociatedColumnType,
|
||||
DynamicColumn: Into<Option<Column<T>>>,
|
||||
{
|
||||
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, T::column_type())?
|
||||
let Some(dynamic_column_handle) =
|
||||
self.dynamic_column_handle(field_name, T::column_type())?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -196,7 +197,8 @@ impl FastFieldReaders {
|
||||
|
||||
/// Returns a `str` column.
|
||||
pub fn str(&self, field_name: &str) -> crate::Result<Option<StrColumn>> {
|
||||
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Str)?
|
||||
let Some(dynamic_column_handle) =
|
||||
self.dynamic_column_handle(field_name, ColumnType::Str)?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -206,7 +208,8 @@ impl FastFieldReaders {
|
||||
|
||||
/// Returns a `bytes` column.
|
||||
pub fn bytes(&self, field_name: &str) -> crate::Result<Option<BytesColumn>> {
|
||||
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Bytes)?
|
||||
let Some(dynamic_column_handle) =
|
||||
self.dynamic_column_handle(field_name, ColumnType::Bytes)?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -346,7 +349,7 @@ mod tests {
|
||||
schema_builder.add_json_field(
|
||||
"json_expand_dots_enabled",
|
||||
JsonObjectOptions::default()
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_expand_dots_enabled(),
|
||||
);
|
||||
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
|
||||
|
||||
@@ -18,6 +18,8 @@ const JSON_DEPTH_LIMIT: usize = 20;
|
||||
pub struct FastFieldsWriter {
|
||||
columnar_writer: ColumnarWriter,
|
||||
fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
|
||||
// Field -> Fast field tokenizer mapping.
|
||||
// All text fast fields should have a tokenizer.
|
||||
per_field_tokenizer: Vec<Option<TextAnalyzer>>,
|
||||
date_precisions: Vec<DateTimePrecision>,
|
||||
expand_dots: Vec<bool>,
|
||||
@@ -61,7 +63,7 @@ impl FastFieldsWriter {
|
||||
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
|
||||
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"Tokenizer {tokenizer_name:?} not found"
|
||||
"Tokenizer `{tokenizer_name}` not found"
|
||||
))
|
||||
})?;
|
||||
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
|
||||
@@ -157,9 +159,6 @@ impl FastFieldsWriter {
|
||||
&token.text,
|
||||
);
|
||||
})
|
||||
} else {
|
||||
self.columnar_writer
|
||||
.record_str(doc_id, field_name.as_str(), text_val);
|
||||
}
|
||||
}
|
||||
Value::Bytes(bytes_val) => {
|
||||
@@ -201,18 +200,20 @@ impl FastFieldsWriter {
|
||||
self.json_path_buffer.clear();
|
||||
self.json_path_buffer.push_str(field_name);
|
||||
|
||||
let text_analyzer =
|
||||
let text_analyzer_opt =
|
||||
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
|
||||
|
||||
record_json_obj_to_columnar_writer(
|
||||
doc_id,
|
||||
json_obj,
|
||||
expand_dots,
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut self.json_path_buffer,
|
||||
&mut self.columnar_writer,
|
||||
text_analyzer,
|
||||
);
|
||||
if let Some(text_analyzer) = text_analyzer_opt {
|
||||
record_json_obj_to_columnar_writer(
|
||||
doc_id,
|
||||
json_obj,
|
||||
expand_dots,
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut self.json_path_buffer,
|
||||
&mut self.columnar_writer,
|
||||
text_analyzer,
|
||||
);
|
||||
}
|
||||
}
|
||||
Value::IpAddr(ip_addr) => {
|
||||
self.columnar_writer
|
||||
@@ -263,7 +264,7 @@ fn record_json_obj_to_columnar_writer(
|
||||
remaining_depth_limit: usize,
|
||||
json_path_buffer: &mut String,
|
||||
columnar_writer: &mut columnar::ColumnarWriter,
|
||||
tokenizer: &mut Option<TextAnalyzer>,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
) {
|
||||
for (key, child) in json_obj {
|
||||
let len_path = json_path_buffer.len();
|
||||
@@ -288,7 +289,7 @@ fn record_json_obj_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_buffer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
text_analyzer,
|
||||
);
|
||||
// popping our sub path.
|
||||
json_path_buffer.truncate(len_path);
|
||||
@@ -302,7 +303,7 @@ fn record_json_value_to_columnar_writer(
|
||||
mut remaining_depth_limit: usize,
|
||||
json_path_writer: &mut String,
|
||||
columnar_writer: &mut columnar::ColumnarWriter,
|
||||
tokenizer: &mut Option<TextAnalyzer>,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
) {
|
||||
if remaining_depth_limit == 0 {
|
||||
return;
|
||||
@@ -321,14 +322,10 @@ fn record_json_value_to_columnar_writer(
|
||||
}
|
||||
}
|
||||
serde_json::Value::String(text) => {
|
||||
if let Some(text_analyzer) = tokenizer.as_mut() {
|
||||
let mut token_stream = text_analyzer.token_stream(text);
|
||||
token_stream.process(&mut |token| {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
|
||||
})
|
||||
} else {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
|
||||
}
|
||||
let mut token_stream = text_analyzer.token_stream(text);
|
||||
token_stream.process(&mut |token| {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
|
||||
});
|
||||
}
|
||||
serde_json::Value::Array(arr) => {
|
||||
for el in arr {
|
||||
@@ -339,7 +336,7 @@ fn record_json_value_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_writer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
text_analyzer,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -351,7 +348,7 @@ fn record_json_value_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_writer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
text_analyzer,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -371,6 +368,9 @@ mod tests {
|
||||
) -> ColumnarReader {
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
let mut json_path = String::new();
|
||||
let mut text_analyzer = crate::tokenizer::TokenizerManager::default_for_fast_fields()
|
||||
.get(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER)
|
||||
.unwrap();
|
||||
for (doc, json_doc) in json_docs.iter().enumerate() {
|
||||
record_json_value_to_columnar_writer(
|
||||
doc as u32,
|
||||
@@ -379,7 +379,7 @@ mod tests {
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut json_path,
|
||||
&mut columnar_writer,
|
||||
&mut None,
|
||||
&mut text_analyzer,
|
||||
);
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
@@ -399,6 +399,7 @@ mod tests {
|
||||
});
|
||||
let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
|
||||
let columns = columnar_reader.list_columns().unwrap();
|
||||
assert_eq!(columns.len(), 5);
|
||||
{
|
||||
assert_eq!(columns[0].0, "arr");
|
||||
let column_arr_opt: Option<StrColumn> = columns[0].1.open().unwrap().into();
|
||||
@@ -434,7 +435,9 @@ mod tests {
|
||||
{
|
||||
assert_eq!(columns[4].0, "text");
|
||||
let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
|
||||
assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
|
||||
let column_text = column_text_opt.unwrap();
|
||||
let term_ords: Vec<u64> = column_text.term_ords(0).collect();
|
||||
assert_eq!(&term_ords[..], &[0]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -191,7 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
 /// Index format version.
 const INDEX_FORMAT_VERSION: u32 = 5;

-#[cfg(unix)]
+#[cfg(all(feature = "mmap", unix))]
 pub use memmap2::Advice;

 /// Structure version for the index.
@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
|
||||
use crate::query::bm25::idf;
|
||||
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
|
||||
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
|
||||
use crate::tokenizer::{
|
||||
FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
|
||||
};
|
||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
|
||||
use crate::{DocAddress, Result, Searcher, TantivyError};
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
@@ -206,8 +204,7 @@ impl MoreLikeThis {
|
||||
for value in values {
|
||||
match value {
|
||||
Value::PreTokStr(tok_str) => {
|
||||
let mut token_stream =
|
||||
PreTokenizedStream::from(tok_str.clone());
|
||||
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
|
||||
token_stream.process(&mut |token| {
|
||||
if !self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
|
||||
@@ -956,7 +956,7 @@ mod test {
|
||||
.iter()
|
||||
.flat_map(|field_name| schema.get_field(field_name))
|
||||
.collect();
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
tokenizer_manager.register(
|
||||
"en_with_stop_words",
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
@@ -1447,7 +1447,7 @@ mod test {
|
||||
let title = schema_builder.add_text_field("title", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title];
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
|
||||
|
||||
assert_matches!(
|
||||
@@ -1622,7 +1622,8 @@ mod test {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field(r#"a\.b"#, STRING);
|
||||
let schema = schema_builder.build();
|
||||
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
|
||||
let query_parser =
|
||||
QueryParser::new(schema, Vec::new(), TokenizerManager::default_for_indexing());
|
||||
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
|
||||
assert_eq!(
|
||||
format!("{query:?}"),
|
||||
@@ -1639,8 +1640,11 @@ mod test {
|
||||
schema_builder.add_text_field("first.toto.titi", STRING);
|
||||
schema_builder.add_text_field("third.a.b.c", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let query_parser =
|
||||
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
|
||||
let query_parser = QueryParser::new(
|
||||
schema.clone(),
|
||||
Vec::new(),
|
||||
TokenizerManager::default_for_indexing(),
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.toto"),
|
||||
Some((schema.get_field("first.toto").unwrap(), ""))
|
||||
|
||||
@@ -31,9 +31,10 @@ impl IPFastFieldRangeWeight {
|
||||
|
||||
impl Weight for IPFastFieldRangeWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let Some(ip_addr_column): Option<Column<Ipv6Addr>> = reader.fast_fields()
|
||||
.column_opt(&self.field)? else {
|
||||
return Ok(Box::new(EmptyScorer))
|
||||
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
|
||||
reader.fast_fields().column_opt(&self.field)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
let value_range = bound_to_value_range(
|
||||
&self.lower_bound,
|
||||
|
||||
@@ -71,7 +71,9 @@ impl Weight for FastFieldRangeWeight {
|
||||
let column_type_opt_ref: Option<&[ColumnType]> = column_type_opt
|
||||
.as_ref()
|
||||
.map(|column_types| column_types.as_slice());
|
||||
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)? else {
|
||||
let Some((column, _)) =
|
||||
fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
let value_range = bound_to_value_range(
|
||||
|
||||
@@ -72,6 +72,14 @@ impl Query for TermSetQuery {
|
||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||
Ok(Box::new(self.specialized_weight(enable_scoring.schema())?))
|
||||
}
|
||||
|
||||
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
|
||||
for terms in self.terms_map.values() {
|
||||
for term in terms {
|
||||
visitor(term, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct SetDfaWrapper(Map<Vec<u8>>);
|
||||
|
||||
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::text_options::{FastFieldTextOptions, TokenizerName};
|
||||
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
|
||||
use crate::schema::{TextFieldIndexing, TextOptions};
|
||||
use crate::schema::{TextFieldIndexing, TextOptions, DEFAULT_FAST_FIELD_TOKENIZER};
|
||||
|
||||
/// The `JsonObjectOptions` make it possible to
|
||||
/// configure how a json object field should be indexed and stored.
|
||||
@@ -58,20 +58,19 @@ impl JsonObjectOptions {
|
||||
/// Returns true if and only if the json object fields are
|
||||
/// to be treated as fast fields.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|
||||
|| matches!(
|
||||
&self.fast,
|
||||
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
||||
)
|
||||
match self.fast {
|
||||
FastFieldTextOptions::Disabled => false,
|
||||
FastFieldTextOptions::Enabled { .. } => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if and only if the value is a fast field.
|
||||
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
|
||||
match &self.fast {
|
||||
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
} => Some(tokenizer.name()),
|
||||
FastFieldTextOptions::Disabled => None,
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: with_tokenizer,
|
||||
} => Some(with_tokenizer.name()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -130,15 +129,11 @@ impl JsonObjectOptions {
|
||||
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
|
||||
/// from the dictionary.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
|
||||
if let Some(tokenizer) = tokenizer_name {
|
||||
let tokenizer = TokenizerName::from_name(tokenizer);
|
||||
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
}
|
||||
} else {
|
||||
self.fast = FastFieldTextOptions::IsEnabled(true);
|
||||
}
|
||||
pub fn set_fast(mut self, tokenizer_name: &str) -> Self {
|
||||
let with_tokenizer = TokenizerName::from_name(tokenizer_name);
|
||||
self.fast = FastFieldTextOptions::Enabled {
|
||||
tokenizer: with_tokenizer,
|
||||
};
|
||||
self
|
||||
}
|
||||
|
||||
@@ -166,7 +161,9 @@ impl From<FastFlag> for JsonObjectOptions {
|
||||
JsonObjectOptions {
|
||||
stored: false,
|
||||
indexing: None,
|
||||
fast: FastFieldTextOptions::IsEnabled(true),
|
||||
fast: FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER),
|
||||
},
|
||||
expand_dots_enabled: false,
|
||||
}
|
||||
}
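With `FastFieldTextOptions` reworked, `set_fast` on `JsonObjectOptions` (and on `TextOptions`) now takes the fast-field tokenizer name directly instead of an `Option<&str>`. A small sketch of declaring a JSON field whose text values become tokenized fast fields, following the calls used throughout this changeset; the field name is illustrative:

```rust
use tantivy::schema::{JsonObjectOptions, Schema};

fn schema_with_fast_json() -> Schema {
    let mut schema_builder = Schema::builder();
    // Text values inside the JSON object are recorded as fast fields and run
    // through the "default" fast-field tokenizer; dotted keys ("a.b": 1) are
    // expanded into nested paths instead of being treated as literal names.
    let json_options = JsonObjectOptions::default()
        .set_fast("default")
        .set_expand_dots_enabled();
    schema_builder.add_json_field("attributes", json_options);
    schema_builder.build()
}
```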
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Schema definition for tantivy's indices.
|
||||
//!
|
||||
//! # Setting your schema in Tantivy
|
||||
//!
|
||||
//!
|
||||
//! Tantivy has a very strict schema.
|
||||
//! The schema defines information about the fields your index contains, that is, for each field:
|
||||
//!
|
||||
@@ -153,6 +153,8 @@ pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
|
||||
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
|
||||
pub use self::value::Value;
|
||||
|
||||
pub(crate) const DEFAULT_FAST_FIELD_TOKENIZER: &str = "default";
|
||||
|
||||
/// Validator for a potential `field_name`.
|
||||
/// Returns true if the name can be use for a field name.
|
||||
///
|
||||
|
||||
@@ -24,19 +24,68 @@ pub struct TextOptions {
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
#[serde(
|
||||
into = "FastFieldTextOptionsForSerialization",
|
||||
from = "FastFieldTextOptionsForSerialization"
|
||||
)]
|
||||
/// Enum controlling the fast field setting of a text field.
|
||||
#[derive(Default)]
|
||||
pub(crate) enum FastFieldTextOptions {
|
||||
/// Flag to enable/disable
|
||||
IsEnabled(bool),
|
||||
/// Fastfield disabled
|
||||
#[default]
|
||||
Disabled,
|
||||
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
|
||||
/// `Index::fast_field_tokenizer`.
|
||||
EnabledWithTokenizer { with_tokenizer: TokenizerName },
|
||||
Enabled { tokenizer: TokenizerName },
|
||||
}
|
||||
|
||||
impl Default for FastFieldTextOptions {
|
||||
fn default() -> Self {
|
||||
FastFieldTextOptions::IsEnabled(false)
|
||||
/// Enum used to control the way we serialize fast field text options.
|
||||
///
|
||||
/// For backward compatibility reasons, we follow the format introduced in tantivy 0.19.
|
||||
/// `false` -> Disabled
|
||||
/// `true` -> Enabled with default tokenizer
|
||||
/// `{ tokenizer: "something" }` -> Enabled with a specific tokenizer.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum FastFieldTextOptionsForSerialization {
|
||||
IsEnabled(bool),
|
||||
EnabledWithTokenizer {
|
||||
#[serde(alias = "with_tokenizer")]
|
||||
tokenizer: TokenizerName,
|
||||
},
|
||||
}
|
||||
|
||||
impl From<FastFieldTextOptionsForSerialization> for FastFieldTextOptions {
|
||||
fn from(value: FastFieldTextOptionsForSerialization) -> Self {
|
||||
match value {
|
||||
FastFieldTextOptionsForSerialization::IsEnabled(enabled) => {
|
||||
if enabled {
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(
|
||||
crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
|
||||
),
|
||||
}
|
||||
} else {
|
||||
FastFieldTextOptions::Disabled
|
||||
}
|
||||
}
|
||||
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer } => {
|
||||
FastFieldTextOptions::Enabled { tokenizer }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FastFieldTextOptions> for FastFieldTextOptionsForSerialization {
|
||||
fn from(value: FastFieldTextOptions) -> Self {
|
||||
match value {
|
||||
FastFieldTextOptions::Disabled => {
|
||||
FastFieldTextOptionsForSerialization::IsEnabled(false)
|
||||
}
|
||||
FastFieldTextOptions::Enabled { tokenizer } => {
|
||||
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
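The doc comment above spells out the wire format: `false` disables the fast field, `true` enables it with the default tokenizer, and `{ "tokenizer": "..." }` (with `with_tokenizer` accepted as a legacy alias) selects a specific one. As a standalone illustration of the untagged-enum technique used here (a self-contained mirror, not tantivy's actual type, which is `pub(crate)` and converts to and from the richer `FastFieldTextOptions`), the three forms can be parsed like this:

```rust
use serde::{Deserialize, Serialize};

// Self-contained mirror of the documented serialization format.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
enum FastTextOption {
    IsEnabled(bool),
    WithTokenizer {
        // "tokenizer" is the canonical key; "with_tokenizer" is kept as an alias.
        #[serde(alias = "with_tokenizer")]
        tokenizer: String,
    },
}

fn main() -> serde_json::Result<()> {
    let disabled: FastTextOption = serde_json::from_str("false")?;
    let default_tok: FastTextOption = serde_json::from_str("true")?;
    let named: FastTextOption = serde_json::from_str(r#"{ "with_tokenizer": "default" }"#)?;

    assert_eq!(disabled, FastTextOption::IsEnabled(false));
    assert_eq!(default_tok, FastTextOption::IsEnabled(true));
    assert_eq!(
        named,
        FastTextOption::WithTokenizer { tokenizer: "default".to_string() }
    );
    Ok(())
}
```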
|
||||
|
||||
@@ -45,23 +94,13 @@ impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
|
||||
|
||||
fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
|
||||
match (self, other) {
|
||||
(
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
_,
|
||||
)
|
||||
| (
|
||||
_,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
) => FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
(FastFieldTextOptions::IsEnabled(true), _)
|
||||
| (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
|
||||
(_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
|
||||
(FastFieldTextOptions::Enabled { tokenizer }, _)
|
||||
| (_, FastFieldTextOptions::Enabled { tokenizer }) => {
|
||||
FastFieldTextOptions::Enabled { tokenizer }
|
||||
}
|
||||
(FastFieldTextOptions::Disabled, FastFieldTextOptions::Disabled) => {
|
||||
FastFieldTextOptions::Disabled
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
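In short: a side carrying a tokenizer wins, then a bare enabled flag, and the result is only `Disabled` when both sides are. A hypothetical snippet of the observable effect when two `TextOptions` are combined:

    let indexed = TEXT;                                     // indexed, fast field disabled
    let fast = TextOptions::default().set_fast("default");
    let merged = indexed | fast;
    assert!(merged.is_fast());
    assert_eq!(merged.get_fast_field_tokenizer_name(), Some("default"));
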
@@ -83,20 +122,17 @@ impl TextOptions {

    /// Returns true if and only if the value is a fast field.
    pub fn is_fast(&self) -> bool {
        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
            || matches!(
                &self.fast,
                FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
            )
        match &self.fast {
            FastFieldTextOptions::Disabled => false,
            FastFieldTextOptions::Enabled { .. } => true,
        }
    }

    /// Returns the name of the tokenizer used on the fast field, if it is enabled.
    pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
        match &self.fast {
            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
            FastFieldTextOptions::EnabledWithTokenizer {
                with_tokenizer: tokenizer,
            } => Some(tokenizer.name()),
            FastFieldTextOptions::Disabled => None,
            FastFieldTextOptions::Enabled { tokenizer } => Some(tokenizer.name()),
        }
    }

@@ -121,15 +157,9 @@ impl TextOptions {
    /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
    /// from the dictionary.
    #[must_use]
    pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
        if let Some(tokenizer) = tokenizer_name {
            let tokenizer = TokenizerName::from_name(tokenizer);
            self.fast = FastFieldTextOptions::EnabledWithTokenizer {
                with_tokenizer: tokenizer,
            }
        } else {
            self.fast = FastFieldTextOptions::IsEnabled(true);
        }
    pub fn set_fast(mut self, tokenizer_name: &str) -> TextOptions {
        let tokenizer = TokenizerName::from_name(tokenizer_name);
        self.fast = FastFieldTextOptions::Enabled { tokenizer };
        self
    }
|
||||
|
||||
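With the new signature the fast-field tokenizer is always named explicitly instead of being an `Option`. A sketch of a typical call site (field configuration invented for illustration):

    let opts = TextOptions::default()
        .set_indexing_options(TextFieldIndexing::default().set_tokenizer("en_stem"))
        .set_fast("default") // normalize the fast field values with the "default" tokenizer
        .set_stored();
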
@@ -263,7 +293,7 @@ pub const STRING: TextOptions = TextOptions {
|
||||
record: IndexRecordOption::Basic,
|
||||
}),
|
||||
stored: false,
|
||||
fast: FastFieldTextOptions::IsEnabled(false),
|
||||
fast: FastFieldTextOptions::Disabled,
|
||||
coerce: false,
|
||||
};
|
||||
|
||||
@@ -276,7 +306,7 @@ pub const TEXT: TextOptions = TextOptions {
|
||||
}),
|
||||
stored: false,
|
||||
coerce: false,
|
||||
fast: FastFieldTextOptions::IsEnabled(false),
|
||||
fast: FastFieldTextOptions::Disabled,
|
||||
};
|
||||
|
||||
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
|
||||
@@ -326,7 +356,9 @@ impl From<FastFlag> for TextOptions {
|
||||
TextOptions {
|
||||
indexing: None,
|
||||
stored: false,
|
||||
fast: FastFieldTextOptions::IsEnabled(true),
|
||||
fast: FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER),
|
||||
},
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
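Since the `FAST` flag now expands to `Enabled` with the default fast-field tokenizer, flag-based schema declarations keep working unchanged; a standard example:

    use tantivy::schema::{Schema, FAST, STORED, TEXT};

    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | FAST | STORED);
    let schema = schema_builder.build();
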
@@ -392,21 +424,21 @@ mod tests {
|
||||
#[test]
|
||||
fn serde_fast_field_tokenizer() {
|
||||
let json = r#" {
|
||||
"fast": { "with_tokenizer": "default" }
|
||||
"fast": { "tokenizer": "default" }
|
||||
} "#;
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: TokenizerName::from_static("default")
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static("default")
|
||||
}
|
||||
);
|
||||
let options: TextOptions =
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: TokenizerName::from_static("default")
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static("default")
|
||||
}
|
||||
);
|
||||
|
||||
@@ -414,18 +446,28 @@ mod tests {
|
||||
"fast": true
|
||||
} "#;
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
|
||||
}
|
||||
);
|
||||
let options: TextOptions =
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
|
||||
}
|
||||
);
|
||||
|
||||
let json = r#" {
|
||||
"fast": false
|
||||
} "#;
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
|
||||
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
|
||||
let options: TextOptions =
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
|
||||
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
use std::io;
|
||||
|
||||
#[inline]
|
||||
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
let params = brotli::enc::BrotliEncoderParams {
|
||||
quality: 5,
|
||||
..Default::default()
|
||||
};
|
||||
compressed.clear();
|
||||
brotli::BrotliCompress(&mut uncompressed, compressed, &params)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
decompressed.clear();
|
||||
brotli::BrotliDecompress(&mut compressed, decompressed)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
#[inline]
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
compressed.clear();
|
||||
let mut encoder = snap::write::FrameEncoder::new(compressed);
|
||||
encoder.write_all(uncompressed)?;
|
||||
encoder.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
decompressed.clear();
|
||||
snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -17,12 +17,10 @@ pub enum Compressor {
|
||||
/// No compression
|
||||
None,
|
||||
/// Use the lz4 compressor (block format)
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Lz4,
|
||||
/// Use the brotli compressor
|
||||
Brotli,
|
||||
/// Use the snap compressor
|
||||
Snappy,
|
||||
/// Use the zstd compressor
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Zstd(ZstdCompressor),
|
||||
}
|
||||
|
||||
@@ -31,9 +29,9 @@ impl Serialize for Compressor {
|
||||
where S: serde::Serializer {
|
||||
match *self {
|
||||
Compressor::None => serializer.serialize_str("none"),
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Compressor::Lz4 => serializer.serialize_str("lz4"),
|
||||
Compressor::Brotli => serializer.serialize_str("brotli"),
|
||||
Compressor::Snappy => serializer.serialize_str("snappy"),
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Compressor::Zstd(zstd) => serializer.serialize_str(&zstd.ser_to_string()),
|
||||
}
|
||||
}
|
||||
@@ -45,27 +43,38 @@ impl<'de> Deserialize<'de> for Compressor {
|
||||
let buf = String::deserialize(deserializer)?;
|
||||
let compressor = match buf.as_str() {
|
||||
"none" => Compressor::None,
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
"lz4" => Compressor::Lz4,
|
||||
"brotli" => Compressor::Brotli,
|
||||
"snappy" => Compressor::Snappy,
|
||||
#[cfg(not(feature = "lz4-compression"))]
|
||||
"lz4" => {
|
||||
return Err(serde::de::Error::custom(
|
||||
"unsupported variant `lz4`, please enable Tantivy's `lz4-compression` feature",
|
||||
))
|
||||
}
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
_ if buf.starts_with("zstd") => Compressor::Zstd(
|
||||
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
|
||||
),
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
_ if buf.starts_with("zstd") => {
|
||||
return Err(serde::de::Error::custom(
|
||||
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` \
|
||||
feature",
|
||||
))
|
||||
}
|
||||
_ => {
|
||||
if buf.starts_with("zstd") {
|
||||
Compressor::Zstd(
|
||||
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
|
||||
)
|
||||
} else {
|
||||
return Err(serde::de::Error::unknown_variant(
|
||||
&buf,
|
||||
&[
|
||||
"none",
|
||||
"lz4",
|
||||
"brotli",
|
||||
"snappy",
|
||||
"zstd",
|
||||
"zstd(compression_level=5)",
|
||||
],
|
||||
));
|
||||
}
|
||||
return Err(serde::de::Error::unknown_variant(
|
||||
&buf,
|
||||
&[
|
||||
"none",
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
"lz4",
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
"zstd",
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
"zstd(compression_level=5)",
|
||||
],
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
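The compressor is configured by a plain string name. With the variants now gated behind cargo features, deserializing a name whose feature was not compiled in fails with a targeted error rather than a generic unknown-variant one. Sketch:

    // With the `lz4-compression` feature enabled:
    let compressor: Compressor = serde_json::from_str("\"lz4\"").unwrap();
    assert!(matches!(compressor, Compressor::Lz4));

    // Without it, the same input yields:
    // "unsupported variant `lz4`, please enable Tantivy's `lz4-compression` feature"
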
@@ -127,18 +136,15 @@ impl ZstdCompressor {
|
||||
}
|
||||
|
||||
impl Default for Compressor {
|
||||
#[allow(unreachable_code)]
|
||||
fn default() -> Self {
|
||||
if cfg!(feature = "lz4-compression") {
|
||||
Compressor::Lz4
|
||||
} else if cfg!(feature = "brotli-compression") {
|
||||
Compressor::Brotli
|
||||
} else if cfg!(feature = "snappy-compression") {
|
||||
Compressor::Snappy
|
||||
} else if cfg!(feature = "zstd-compression") {
|
||||
Compressor::Zstd(ZstdCompressor::default())
|
||||
} else {
|
||||
Compressor::None
|
||||
}
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
return Compressor::Lz4;
|
||||
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
return Compressor::Zstd(ZstdCompressor::default());
|
||||
|
||||
Compressor::None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,50 +161,14 @@ impl Compressor {
|
||||
compressed.extend_from_slice(uncompressed);
|
||||
Ok(())
|
||||
}
|
||||
Self::Lz4 => {
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
{
|
||||
super::compression_lz4_block::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "lz4-compression"))]
|
||||
{
|
||||
panic!("lz4-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Brotli => {
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
{
|
||||
super::compression_brotli::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "brotli-compression"))]
|
||||
{
|
||||
panic!("brotli-compression-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Snappy => {
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
{
|
||||
super::compression_snap::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "snappy-compression"))]
|
||||
{
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Zstd(_zstd_compressor) => {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
super::compression_zstd_block::compress(
|
||||
uncompressed,
|
||||
compressed,
|
||||
_zstd_compressor.compression_level,
|
||||
)
|
||||
}
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
{
|
||||
panic!("zstd-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Self::Lz4 => super::compression_lz4_block::compress(uncompressed, compressed),
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Self::Zstd(_zstd_compressor) => super::compression_zstd_block::compress(
|
||||
uncompressed,
|
||||
compressed,
|
||||
_zstd_compressor.compression_level,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,12 +16,10 @@ pub enum Decompressor {
|
||||
/// No compression
|
||||
None,
|
||||
/// Use the lz4 decompressor (block format)
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Lz4,
|
||||
/// Use the brotli decompressor
|
||||
Brotli,
|
||||
/// Use the snap decompressor
|
||||
Snappy,
|
||||
/// Use the zstd decompressor
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Zstd,
|
||||
}
|
||||
|
||||
@@ -29,9 +27,9 @@ impl From<Compressor> for Decompressor {
|
||||
fn from(compressor: Compressor) -> Self {
|
||||
match compressor {
|
||||
Compressor::None => Decompressor::None,
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Compressor::Lz4 => Decompressor::Lz4,
|
||||
Compressor::Brotli => Decompressor::Brotli,
|
||||
Compressor::Snappy => Decompressor::Snappy,
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Compressor::Zstd(_) => Decompressor::Zstd,
|
||||
}
|
||||
}
|
||||
@@ -41,9 +39,9 @@ impl Decompressor {
|
||||
pub(crate) fn from_id(id: u8) -> Decompressor {
|
||||
match id {
|
||||
0 => Decompressor::None,
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
1 => Decompressor::Lz4,
|
||||
2 => Decompressor::Brotli,
|
||||
3 => Decompressor::Snappy,
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
4 => Decompressor::Zstd,
|
||||
_ => panic!("unknown compressor id {id:?}"),
|
||||
}
|
||||
@@ -52,9 +50,9 @@ impl Decompressor {
|
||||
pub(crate) fn get_id(&self) -> u8 {
|
||||
match self {
|
||||
Self::None => 0,
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Self::Lz4 => 1,
|
||||
Self::Brotli => 2,
|
||||
Self::Snappy => 3,
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Self::Zstd => 4,
|
||||
}
|
||||
}
|
||||
@@ -77,46 +75,10 @@ impl Decompressor {
|
||||
decompressed.extend_from_slice(compressed);
|
||||
Ok(())
|
||||
}
|
||||
Self::Lz4 => {
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
{
|
||||
super::compression_lz4_block::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "lz4-compression"))]
|
||||
{
|
||||
panic!("lz4-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Brotli => {
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
{
|
||||
super::compression_brotli::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "brotli-compression"))]
|
||||
{
|
||||
panic!("brotli-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Snappy => {
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
{
|
||||
super::compression_snap::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "snappy-compression"))]
|
||||
{
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Zstd => {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
super::compression_zstd_block::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
{
|
||||
panic!("zstd-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Self::Lz4 => super::compression_lz4_block::decompress(compressed, decompressed),
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Self::Zstd => super::compression_zstd_block::decompress(compressed, decompressed),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -129,9 +91,9 @@ mod tests {
|
||||
#[test]
|
||||
fn compressor_decompressor_id_test() {
|
||||
assert_eq!(Decompressor::from(Compressor::None), Decompressor::None);
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
assert_eq!(Decompressor::from(Compressor::Lz4), Decompressor::Lz4);
|
||||
assert_eq!(Decompressor::from(Compressor::Brotli), Decompressor::Brotli);
|
||||
assert_eq!(Decompressor::from(Compressor::Snappy), Decompressor::Snappy);
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
assert_eq!(
|
||||
Decompressor::from(Compressor::Zstd(Default::default())),
|
||||
Decompressor::Zstd
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
//! order to be handled in the `Store`.
|
||||
//!
|
||||
//! Internally, documents (or rather their stored fields) are serialized to a buffer.
|
||||
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
|
||||
//! `LZ4` or `snappy` and the resulting block is written to disk.
|
||||
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed
|
||||
//! using LZ4 or Zstd and the resulting block is written to disk.
|
||||
//!
|
||||
//! One can then request for a specific `DocId`.
|
||||
//! A skip list helps navigating to the right block,
|
||||
@@ -48,12 +48,6 @@ pub(crate) const DOC_STORE_VERSION: u32 = 1;
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
mod compression_lz4_block;
|
||||
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
mod compression_brotli;
|
||||
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
mod compression_snap;
|
||||
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
mod compression_zstd_block;
|
||||
|
||||
@@ -200,16 +194,6 @@ pub mod tests {
|
||||
fn test_store_lz4_block() -> crate::Result<()> {
|
||||
test_store(Compressor::Lz4, BLOCK_SIZE, true)
|
||||
}
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
#[test]
|
||||
fn test_store_snap() -> crate::Result<()> {
|
||||
test_store(Compressor::Snappy, BLOCK_SIZE, true)
|
||||
}
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
#[test]
|
||||
fn test_store_brotli() -> crate::Result<()> {
|
||||
test_store(Compressor::Brotli, BLOCK_SIZE, true)
|
||||
}
|
||||
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
#[test]
|
||||
@@ -261,8 +245,8 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
#[test]
|
||||
fn test_merge_with_changed_compressor() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
@@ -294,7 +278,7 @@ pub mod tests {
|
||||
);
|
||||
// Change compressor, this disables stacking on merging
|
||||
let index_settings = index.settings_mut();
|
||||
index_settings.docstore_compression = Compressor::Snappy;
|
||||
index_settings.docstore_compression = Compressor::Zstd(Default::default());
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
@@ -316,7 +300,7 @@ pub mod tests {
|
||||
LOREM.to_string()
|
||||
);
|
||||
}
|
||||
assert_eq!(store.decompressor(), Decompressor::Snappy);
|
||||
assert_eq!(store.decompressor(), Decompressor::Zstd);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -189,7 +189,7 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_raw_tokenizer2() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
@@ -206,7 +206,7 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_en_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
|
||||
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
@@ -228,7 +228,7 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_non_en_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
tokenizer_manager.register(
|
||||
"el_stem",
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
@@ -256,7 +256,7 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
{
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
@@ -282,7 +282,7 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_whitespace_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
|
||||
@@ -86,6 +86,8 @@ impl TokenFilter for SplitCompoundWords {
|
||||
SplitCompoundWordsFilter {
|
||||
dict: self.dict,
|
||||
inner: tokenizer,
|
||||
cuts: Vec::new(),
|
||||
parts: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -94,29 +96,33 @@ impl TokenFilter for SplitCompoundWords {
|
||||
pub struct SplitCompoundWordsFilter<T> {
|
||||
dict: AhoCorasick,
|
||||
inner: T,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
|
||||
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
SplitCompoundWordsTokenStream {
|
||||
dict: self.dict.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
cuts: Vec::new(),
|
||||
parts: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SplitCompoundWordsTokenStream<T> {
|
||||
dict: AhoCorasick,
|
||||
tail: T,
|
||||
cuts: Vec<usize>,
|
||||
parts: Vec<Token>,
|
||||
}
|
||||
|
||||
impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
|
||||
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
|
||||
type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.cuts.clear();
|
||||
self.parts.clear();
|
||||
SplitCompoundWordsTokenStream {
|
||||
dict: self.dict.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
cuts: &mut self.cuts,
|
||||
parts: &mut self.parts,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SplitCompoundWordsTokenStream<'a, T> {
|
||||
dict: AhoCorasick,
|
||||
tail: T,
|
||||
cuts: &'a mut Vec<usize>,
|
||||
parts: &'a mut Vec<Token>,
|
||||
}
|
||||
|
||||
impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
|
||||
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
|
||||
// can fully be split into consecutive matches against `self.dict`.
|
||||
fn split(&mut self) {
|
||||
@@ -152,7 +158,7 @@ impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
|
||||
impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.parts.pop();
|
||||
|
||||
|
||||
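The change above moves the `cuts`/`parts` buffers from the token stream into the filter and lends them to each stream, so repeated `token_stream` calls reuse the same allocations. Reduced to a minimal sketch (names invented):

    struct ReusableFilter {
        buf: Vec<usize>,
    }

    struct BorrowingStream<'a> {
        buf: &'a mut Vec<usize>,
    }

    impl ReusableFilter {
        fn stream(&mut self) -> BorrowingStream<'_> {
            // Clearing keeps the capacity, so no reallocation on the next call.
            self.buf.clear();
            BorrowingStream { buf: &mut self.buf }
        }
    }
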
@@ -5,6 +5,7 @@ use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
|
||||
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
||||
|
||||
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
|
||||
#[derive(Clone)]
|
||||
pub struct TextAnalyzer {
|
||||
tokenizer: Box<dyn BoxableTokenizer>,
|
||||
}
|
||||
@@ -12,25 +13,24 @@ pub struct TextAnalyzer {
|
||||
impl Tokenizer for Box<dyn BoxableTokenizer> {
|
||||
type TokenStream<'a> = BoxTokenStream<'a>;
|
||||
|
||||
// Note: we want to call `box_token_stream` on the concrete `Tokenizer`
|
||||
// implementation, not the `BoxableTokenizer` one as it will cause
|
||||
// a recursive call (and a stack overflow).
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.box_token_stream(text)
|
||||
(**self).box_token_stream(text)
|
||||
}
|
||||
}
|
||||
|
||||
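The double dereference is what keeps the blanket impl from calling itself. The same trap, isolated in a minimal example outside of tantivy:

    trait Speak {
        fn speak(&self) -> String;
    }

    impl Speak for Box<dyn Speak> {
        fn speak(&self) -> String {
            // `self.speak()` would resolve to this very impl and recurse until the
            // stack overflows; `(**self)` dispatches to the boxed concrete type.
            (**self).speak()
        }
    }
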
impl Clone for Box<dyn BoxableTokenizer> {
|
||||
// Note: we want to call `box_clone` on the concrete `Tokenizer`
|
||||
// implementation in order to clone the concrete `Tokenizer`.
|
||||
fn clone(&self) -> Self {
|
||||
self.box_clone()
|
||||
(**self).box_clone()
|
||||
}
|
||||
}
|
||||
|
||||
fn add_filter<F: TokenFilter>(tokenizer: Box<dyn BoxableTokenizer>, filter: F) -> Box<dyn BoxableTokenizer> {
|
||||
let filtered_tokenizer = filter.transform(tokenizer);
|
||||
Box::new(filtered_tokenizer)
|
||||
}
|
||||
|
||||
|
||||
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
||||
trait BoxableTokenizer: 'static + Send + Sync {
|
||||
pub trait BoxableTokenizer: 'static + Send + Sync {
|
||||
/// Creates a boxed token stream for a given `str`.
|
||||
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
|
||||
/// Clone this tokenizer.
|
||||
@@ -46,14 +46,6 @@ impl<T: Tokenizer> BoxableTokenizer for T {
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TextAnalyzer {
|
||||
fn clone(&self) -> Self {
|
||||
TextAnalyzer {
|
||||
tokenizer: self.tokenizer.box_clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TextAnalyzer {
|
||||
fn default() -> TextAnalyzer {
|
||||
TextAnalyzer::from(EmptyTokenizer)
|
||||
@@ -74,12 +66,12 @@ impl TextAnalyzer {
|
||||
|
||||
/// Creates a token stream for a given `str`.
|
||||
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
self.tokenizer.box_token_stream(text)
|
||||
self.tokenizer.token_stream(text)
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder helper for [`TextAnalyzer`]
|
||||
pub struct TextAnalyzerBuilder<T> {
|
||||
pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
|
||||
tokenizer: T,
|
||||
}
|
||||
|
||||
@@ -103,6 +95,23 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Boxes the internal tokenizer. This is useful for adding dynamic filters.
|
||||
/// Note: this will be less performant than the non boxed version.
|
||||
pub fn dynamic(self) -> TextAnalyzerBuilder {
|
||||
let boxed_tokenizer = Box::new(self.tokenizer);
|
||||
TextAnalyzerBuilder {
|
||||
tokenizer: boxed_tokenizer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends a token filter to the current builder and returns a boxed version of the
|
||||
/// tokenizer. This is useful when you want to build a `TextAnalyzer` dynamically.
|
||||
/// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
|
||||
/// possible as it will be more performant and create fewer boxes.
|
||||
pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
|
||||
self.filter(token_filter).dynamic()
|
||||
}
|
||||
|
||||
/// Finalize building the TextAnalyzer
|
||||
pub fn build(self) -> TextAnalyzer {
|
||||
TextAnalyzer {
|
||||
@@ -110,3 +119,57 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};
|
||||
|
||||
#[test]
|
||||
fn test_text_analyzer_builder() {
|
||||
let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.build();
|
||||
let mut stream = analyzer.token_stream("- first bullet point");
|
||||
assert_eq!(stream.next().unwrap().text, "first");
|
||||
assert_eq!(stream.next().unwrap().text, "bullet");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_text_analyzer_with_filters_boxed() {
|
||||
// This test shows how one can build a TextAnalyzer dynamically, by stacking a list
|
||||
// of parametrizable token filters.
|
||||
//
|
||||
// The following enum is the thing that would be serializable.
|
||||
// Note that token filters can have their own parameters, too, like the RemoveLongFilter
|
||||
enum SerializableTokenFilterEnum {
|
||||
LowerCaser(LowerCaser),
|
||||
RemoveLongFilter(RemoveLongFilter),
|
||||
}
|
||||
// Note that everything below is dynamic.
|
||||
let filters: Vec<SerializableTokenFilterEnum> = vec![
|
||||
SerializableTokenFilterEnum::LowerCaser(LowerCaser),
|
||||
SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
|
||||
];
|
||||
let mut analyzer_builder: TextAnalyzerBuilder =
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter_dynamic(RemoveLongFilter::limit(40))
|
||||
.filter_dynamic(LowerCaser);
|
||||
for filter in filters {
|
||||
analyzer_builder = match filter {
|
||||
SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
|
||||
analyzer_builder.filter_dynamic(lower_caser)
|
||||
}
|
||||
SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
|
||||
analyzer_builder.filter_dynamic(remove_long_filter)
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut analyzer = analyzer_builder.build();
|
||||
let mut stream = analyzer.token_stream("first bullet point");
|
||||
assert_eq!(stream.next().unwrap().text, "first");
|
||||
assert_eq!(stream.next().unwrap().text, "bullet");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ pub struct TokenizerManager {
|
||||
|
||||
impl TokenizerManager {
|
||||
/// Creates an empty tokenizer manager.
|
||||
#[allow(clippy::new_without_default)]
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
tokenizers: Arc::new(RwLock::new(HashMap::new())),
|
||||
@@ -51,12 +52,10 @@ impl TokenizerManager {
|
||||
.get(tokenizer_name)
|
||||
.cloned()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TokenizerManager {
|
||||
/// Creates a `TokenizerManager` prepopulated with
|
||||
/// the default pre-configured tokenizers of `tantivy`.
|
||||
fn default() -> TokenizerManager {
|
||||
pub fn default_for_indexing() -> TokenizerManager {
|
||||
let manager = TokenizerManager::new();
|
||||
manager.register("raw", RawTokenizer::default());
|
||||
manager.register(
|
||||
@@ -77,4 +76,28 @@ impl Default for TokenizerManager {
|
||||
manager.register("whitespace", WhitespaceTokenizer::default());
|
||||
manager
|
||||
}
|
||||
|
||||
/// Creates a `TokenizerManager` prepopulated with
|
||||
/// the default pre-configured tokenizers of `tantivy`
|
||||
/// for fast fields.
|
||||
///
|
||||
/// Fast fields usually do not really tokenize the text.
|
||||
/// It is, however, very useful to filter / normalize the text.
|
||||
pub fn default_for_fast_fields() -> TokenizerManager {
|
||||
let manager = TokenizerManager::new();
|
||||
let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(255))
|
||||
.build();
|
||||
let lower_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(255))
|
||||
.filter(LowerCaser)
|
||||
.build();
|
||||
manager.register(
|
||||
crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
|
||||
lower_tokenizer.clone(),
|
||||
);
|
||||
manager.register("raw", raw_tokenizer);
|
||||
manager.register("lower", lower_tokenizer);
|
||||
manager
|
||||
}
|
||||
}
|
||||
|
||||
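Using the fast-field manager then amounts to picking one of the pre-registered tokenizers ("default", "raw", "lower") or registering an extra normalizer, for example (tokenizer name invented):

    let manager = TokenizerManager::default_for_fast_fields();
    assert!(manager.get("default").is_some());

    // Register an additional normalizer-style tokenizer for fast fields.
    manager.register(
        "ascii_lower",
        TextAnalyzer::builder(RawTokenizer::default())
            .filter(RemoveLongFilter::limit(255))
            .filter(LowerCaser)
            .filter(AsciiFoldingFilter)
            .build(),
    );
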
@@ -157,7 +157,6 @@ pub trait TokenFilter: 'static + Send + Sync {
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;