Compare commits

..

1 Commits

Author SHA1 Message Date
Pascal Seitz
4f2e810b83 add From impl to BoxTokenStream, Bump tokenizer-api version 2023-06-23 18:54:03 +08:00
63 changed files with 563 additions and 1073 deletions

View File

@@ -53,7 +53,7 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]

View File

@@ -1,14 +1,5 @@
Tantivy 0.20.2
================================
- Align numerical type priority order on the search side. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088) (@fmassot)
- Fix is_child_of function not considering the root facet. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086) (@adamreichhold)
Tantivy 0.20.1
================================
- Fix building on windows with mmap [#2070](https://github.com/quickwit-oss/tantivy/issues/2070) (@ChillFish8)
Tantivy 0.20
Tantivy 0.20 [Unreleased]
================================
#### Bugfixes
- Fix phrase queries with slop (slop now supports transpositions, algorithm that carries slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031)[#2020](https://github.com/quickwit-oss/tantivy/issues/2020) (@PSeitz)
@@ -47,14 +38,12 @@ Tantivy 0.20
- Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
- Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
- Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
- Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- tokenizer-api: reduce Tokenizer allocation overhead [#2062](https://github.com/quickwit-oss/tantivy/issues/2062) (@PSeitz)
- Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
- Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)

View File

@@ -23,9 +23,11 @@ once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
memmap2 = { version = "0.6.0", optional = true }
lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
@@ -42,14 +44,14 @@ census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
fail = "0.5.0"
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.11.0"
lru = "0.10.0"
fastdivide = "0.4.0"
itertools = "0.11.0"
itertools = "0.10.3"
measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"
@@ -105,10 +107,12 @@ default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"]

View File

@@ -44,7 +44,7 @@ Details about the benchmark can be found at this [repository](https://github.com
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None)
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)

View File

@@ -1,7 +1,5 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::{
LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};
use tantivy::tokenizer::TokenizerManager;
const ALICE_TXT: &str = include_str!("alice.txt");
@@ -18,26 +16,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
assert_eq!(word_count, 30_731);
})
});
let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.dynamic()
.filter_dynamic(RemoveLongFilter::limit(40))
.filter_dynamic(LowerCaser)
.build();
c.bench_function("dynamic-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
while token_stream.advance() {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}
criterion_group! {
name = benches;
config = Criterion::default().sample_size(200);
targets = criterion_benchmark
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@@ -1,6 +1,6 @@
use std::ops::RangeInclusive;
#[cfg(target_arch = "x86_64")]
#[cfg(any(target_arch = "x86_64"))]
mod avx2;
mod scalar;

View File

@@ -5,11 +5,11 @@ edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
description = "column oriented storage for tantivy"
desciption = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.11.0"
itertools = "0.10.5"
fnv = "1.0.7"
fastdivide = "0.4.0"

View File

@@ -168,9 +168,8 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5]);
}
@@ -201,9 +200,8 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}

View File

@@ -157,13 +157,7 @@ mod tests {
Cardinality::Optional,
&shuffle_merge_order,
);
let SerializableColumnIndex::Optional {
non_null_row_ids,
num_rows,
} = serializable_index
else {
panic!()
};
let SerializableColumnIndex::Optional { non_null_row_ids, num_rows } = serializable_index else { panic!() };
assert_eq!(num_rows, 2);
let non_null_rows: Vec<RowId> = non_null_row_ids.boxed_iter().collect();
assert_eq!(&non_null_rows, &[1]);

View File

@@ -2,7 +2,7 @@
//! # `fastfield_codecs`
//!
//! - Columnar storage of data for tantivy [`crate::Column`].
//! - Columnar storage of data for tantivy [`Column`].
//! - Encode data in different codecs.
//! - Monotonically map values to u64/u128

View File

@@ -83,8 +83,7 @@ impl ColumnValues for BitpackedReader {
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
let Some(transformed_range) = transform_range_before_linear_transformation(&self.stats, range)
else {
positions.clear();
return;

View File

@@ -52,8 +52,8 @@ pub enum MergeRowOrder {
/// Columnar tables are simply stacked one above the other.
/// If the i-th columnar_readers has n_rows_i rows, then
/// in the resulting columnar,
/// rows [r0..n_row_0) contains the row of `columnar_readers[0]`, in order
/// rows [n_row_0..n_row_0 + n_row_1) contains the row of `columnar_readers[1]`, in order.
/// rows [r0..n_row_0) contains the row of columnar_readers[0], in order
/// rows [n_row_0..n_row_0 + n_row_1) contains the row of columnar_readers[1], in order.
/// ..
/// No document is deleted.
Stack(StackMergeOrder),

View File

@@ -244,9 +244,7 @@ fn test_merge_columnar_numbers() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("numbers").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::F64(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::F64(vals) = dynamic_column else { panic!() };
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.first(0u32), Some(-1f64));
assert_eq!(vals.first(1u32), None);
@@ -272,9 +270,7 @@ fn test_merge_columnar_texts() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("texts").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
@@ -321,9 +317,7 @@ fn test_merge_columnar_byte() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("bytes").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -377,9 +371,7 @@ fn test_merge_columnar_byte_with_missing() {
assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("col").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -431,9 +423,7 @@ fn test_merge_columnar_different_types() {
// numeric column
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::I64(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::I64(vals) = dynamic_column else { panic!() };
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);
@@ -443,9 +433,7 @@ fn test_merge_columnar_different_types() {
// text column
let dynamic_column = cols[1].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
let mut out = String::new();

View File

@@ -98,11 +98,9 @@ impl ColumnarWriter {
///
/// The sort applied is stable.
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
let Some(numerical_col_writer) = self
.numerical_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
else {
return Vec::new();
let Some(numerical_col_writer) =
self.numerical_field_hash_map.get::<NumericalColumnWriter>(sort_field.as_bytes()) else {
return Vec::new();
};
let mut symbols_buffer = Vec::new();
let mut values = Vec::new();

View File

@@ -57,9 +57,7 @@ fn test_dataframe_writer_bool() {
assert_eq!(cols[0].num_bytes(), 22);
assert_eq!(cols[0].column_type(), ColumnType::Bool);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
panic!();
};
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
}
@@ -81,9 +79,7 @@ fn test_dataframe_writer_u64_multivalued() {
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 29);
let dyn_i64_col = cols[0].open().unwrap();
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
panic!();
};
let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
assert_eq!(
divisor_col.get_cardinality(),
crate::Cardinality::Multivalued
@@ -105,9 +101,7 @@ fn test_dataframe_writer_ip_addr() {
assert_eq!(cols[0].num_bytes(), 42);
assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
panic!();
};
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
assert_eq!(
&vals,
@@ -140,9 +134,7 @@ fn test_dataframe_writer_numerical() {
// - null footer 6 bytes
assert_eq!(cols[0].num_bytes(), 33);
let column = cols[0].open().unwrap();
let DynamicColumn::I64(column_i64) = column else {
panic!();
};
let DynamicColumn::I64(column_i64) = column else { panic!(); };
assert_eq!(column_i64.index.get_cardinality(), Cardinality::Optional);
assert_eq!(column_i64.first(0), None);
assert_eq!(column_i64.first(1), Some(12i64));
@@ -206,9 +198,7 @@ fn test_dictionary_encoded_str() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
panic!();
};
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5);
@@ -240,9 +230,7 @@ fn test_dictionary_encoded_bytes() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else {
panic!();
};
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5)
.map(|row_id| bytes_col.ords().first(row_id))
.collect();
@@ -545,36 +533,28 @@ trait AssertEqualToColumnValue {
impl AssertEqualToColumnValue for bool {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Bool(val) = column_value else {
panic!()
};
let ColumnValue::Bool(val) = column_value else { panic!() };
assert_eq!(self, val);
}
}
impl AssertEqualToColumnValue for Ipv6Addr {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::IpAddr(val) = column_value else {
panic!()
};
let ColumnValue::IpAddr(val) = column_value else { panic!() };
assert_eq!(self, val);
}
}
impl<T: Coerce + PartialEq + Debug + Into<NumericalValue>> AssertEqualToColumnValue for T {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Numerical(num) = column_value else {
panic!()
};
let ColumnValue::Numerical(num) = column_value else { panic!() };
assert_eq!(self, &T::coerce(*num));
}
}
impl AssertEqualToColumnValue for DateTime {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::DateTime(dt) = column_value else {
panic!()
};
let ColumnValue::DateTime(dt) = column_value else { panic!() };
assert_eq!(self, dt);
}
}

View File

@@ -37,7 +37,7 @@ fn main() -> tantivy::Result<()> {
.set_index_option(IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
)
.set_fast("default")
.set_fast(None)
.set_stored();
schema_builder.add_text_field("category", text_fieldtype);
schema_builder.add_f64_field("stock", FAST);

View File

@@ -53,7 +53,7 @@ fn main() -> tantivy::Result<()> {
// this will store tokens of 3 characters each
index
.tokenizers()
.register("ngram3", NgramTokenizer::new(3, 3, false).unwrap());
.register("ngram3", NgramTokenizer::new(3, 3, false));
// To insert document we need an index writer.
// There must be only one writer at a time.

View File

@@ -6,14 +6,12 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
Warmer,
doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
SegmentReader, Warmer,
};
// This example shows how warmers can be used to
// load values from external sources and
// tie their lifecycle to that of the index segments
// using the Warmer API.
// load values from external sources using the Warmer API.
//
// In this example, we assume an e-commerce search engine.
@@ -25,11 +23,9 @@ pub trait PriceFetcher: Send + Sync + 'static {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}
type SegmentKey = (SegmentId, Option<Opstamp>);
struct DynamicPriceColumn {
field: String,
price_cache: RwLock<HashMap<SegmentKey, Arc<Vec<Price>>>>,
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
price_fetcher: Box<dyn PriceFetcher>,
}
@@ -50,6 +46,7 @@ impl DynamicPriceColumn {
impl Warmer for DynamicPriceColumn {
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
for segment in searcher.segment_readers() {
let key = (segment.segment_id(), segment.delete_opstamp());
let product_id_reader = segment
.fast_fields()
.u64(&self.field)?
@@ -58,40 +55,37 @@ impl Warmer for DynamicPriceColumn {
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc))
.collect();
let mut prices = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let prices: Vec<Price> = (0..segment.max_doc())
.map(|doc| {
if !segment.is_deleted(doc) {
prices.next().unwrap()
} else {
0
}
})
.collect();
let key = (segment.segment_id(), segment.delete_opstamp());
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();
for doc in 0..segment.max_doc() {
if segment.is_deleted(doc) {
price_vals.push(0);
} else {
price_vals.push(prices_it.next().unwrap())
}
}
self.price_cache
.write()
.unwrap()
.insert(key, Arc::new(prices));
.insert(key, Arc::new(price_vals));
}
Ok(())
}
fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
let live_keys: HashSet<SegmentKey> = live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
let mut price_cache_wrt = self.price_cache.write().unwrap();
// let price_cache = std::mem::take(&mut *price_cache_wrt);
// Drain would be nicer here.
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
.into_iter()
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
.collect();
self.price_cache
.write()
.unwrap()
.retain(|key, _| live_keys.contains(key));
}
}
@@ -106,17 +100,17 @@ pub struct ExternalPriceTable {
impl ExternalPriceTable {
pub fn update_price(&self, product_id: ProductId, price: Price) {
self.prices.write().unwrap().insert(product_id, price);
let mut prices_wrt = self.prices.write().unwrap();
prices_wrt.insert(product_id, price);
}
}
impl PriceFetcher for ExternalPriceTable {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
let prices = self.prices.read().unwrap();
let prices_read = self.prices.read().unwrap();
product_ids
.iter()
.map(|product_id| prices.get(product_id).cloned().unwrap_or(0))
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
.collect()
}
}
@@ -149,8 +143,11 @@ fn main() -> tantivy::Result<()> {
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
writer.commit()?;
let warmers = vec![Arc::downgrade(&price_dynamic_column) as Weak<dyn Warmer>];
let reader = index.reader_builder().warmers(warmers).try_into()?;
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
)];
let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
reader.reload()?;
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser.parse_query("cooking")?;

View File

@@ -15,12 +15,6 @@
//! Results of final buckets are [`BucketResult`](super::agg_result::BucketResult).
//! Results of intermediate buckets are
//! [`IntermediateBucketResult`](super::intermediate_agg_result::IntermediateBucketResult)
//!
//! ## Supported Bucket Aggregations
//! - [Histogram](HistogramAggregation)
//! - [DateHistogram](DateHistogramAggregationReq)
//! - [Range](RangeAggregation)
//! - [Terms](TermsAggregation)
mod histogram;
mod range;

View File

@@ -1293,13 +1293,13 @@ mod tests {
// searching for terma, but min_doc_count will return all terms
let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;
assert_eq!(res["my_texts"]["buckets"][0]["key"], "a");
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
assert_eq!(
res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "b");
assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
assert_eq!(
res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],
@@ -1421,10 +1421,10 @@ mod tests {
let res = exec_request_with_query(agg_req, &index, None).unwrap();
println!("{}", serde_json::to_string_pretty(&res).unwrap());
assert_eq!(res["my_texts"]["buckets"][0]["key"], "hallo hallo");
assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "hello hello");
assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
Ok(())

View File

@@ -6,15 +6,6 @@
//! Some aggregations output a single numeric metric (e.g. Average) and are called
//! single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
//! called multi-value numeric metrics aggregation.
//!
//! ## Supported Metric Aggregations
//! - [Average](AverageAggregation)
//! - [Stats](StatsAggregation)
//! - [Min](MinAggregation)
//! - [Max](MaxAggregation)
//! - [Sum](SumAggregation)
//! - [Count](CountAggregation)
//! - [Percentiles](PercentilesAggregationReq)
mod average;
mod count;

View File

@@ -411,7 +411,7 @@ mod tests {
.set_index_option(IndexRecordOption::Basic)
.set_fieldnorms(false),
)
.set_fast("default")
.set_fast(None)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
@@ -466,7 +466,7 @@ mod tests {
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast("default")
.set_fast(None)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", FAST);

View File

@@ -6,35 +6,32 @@
//
// Of course, you can have a look at the tantivy's built-in collectors
// such as the `CountCollector` for more examples.
use std::fmt::Debug;
use std::marker::PhantomData;
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
// ---
// Importing tantivy...
use std::marker::PhantomData;
use std::sync::Arc;
use columnar::{ColumnValues, DynamicColumn, HasAssociatedColumnType};
use crate::collector::{Collector, SegmentCollector};
use crate::schema::Field;
use crate::{DocId, Score, SegmentReader, TantivyError};
use crate::{Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate.
///
/// Only the documents containing at least one value for which the predicate returns `true`
/// will be passed on to the next collector.
///
/// In other words,
/// - documents with no values are filtered out.
/// - documents with several values are accepted if at least one value matches the predicate.
///
/// Only the documents for which the predicate returned "true" will be passed on to the next
/// collector.
///
/// ```rust
/// use tantivy::collector::{TopDocs, FilterCollector};
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT, FAST};
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", FAST);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
@@ -50,24 +47,20 @@ use crate::{DocId, Score, SegmentReader, TantivyError};
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
/// # Ok(())
/// # }
/// ```
///
/// Note that this is limited to fast fields which implement the
/// [`FastValue`][crate::fastfield::FastValue] trait, e.g. `u64` but not `&[u8]`.
/// To filter based on a bytes fast field, use a [`BytesFilterCollector`] instead.
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
where TPredicate: 'static + Clone
{
field: Field,
@@ -76,15 +69,19 @@ where TPredicate: 'static + Clone
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate, TPredicateValue>
impl<TCollector, TPredicate, TPredicateValue: Default>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
{
/// Create a new `FilterCollector`.
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
/// Create a new FilterCollector.
pub fn new(
field: Field,
predicate: TPredicate,
collector: TCollector,
) -> FilterCollector<TCollector, TPredicate, TPredicateValue> {
FilterCollector {
field,
predicate,
collector,
@@ -93,7 +90,7 @@ where
}
}
impl<TCollector, TPredicate, TPredicateValue> Collector
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
@@ -101,6 +98,8 @@ where
TPredicateValue: HasAssociatedColumnType,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
// The fruit is whatever the wrapped collector produces;
// filtering does not change its type.
type Fruit = TCollector::Fruit;
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
@@ -109,7 +108,7 @@ where
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>> {
let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
@@ -119,16 +118,16 @@ where
)));
}
let column_opt = segment_reader
let fast_field_reader = segment_reader
.fast_fields()
.column_opt(field_entry.name())?;
.column_first_or_default(schema.get_field_name(self.field))?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(FilterSegmentCollector {
column_opt,
fast_field_reader,
segment_collector,
predicate: self.predicate.clone(),
t_predicate_value: PhantomData,
@@ -147,208 +146,35 @@ where
}
}
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue> {
column_opt: Option<Column<TPredicateValue>>,
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
fast_field_reader: Arc<dyn ColumnValues<TPredicateValue>>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TSegmentCollector, TPredicate, TPredicateValue>
FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicateValue: PartialOrd + Copy + Debug + Send + Sync + 'static,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
{
#[inline]
fn accept_document(&self, doc_id: DocId) -> bool {
if let Some(column) = &self.column_opt {
for val in column.values_for_doc(doc_id) {
if (self.predicate)(val) {
return true;
}
}
}
false
}
}
impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicateValue: HasAssociatedColumnType,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync, /* DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>> */
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
if self.accept_document(doc) {
self.segment_collector.collect(doc, score);
let value = self.fast_field_reader.get_val(doc);
if (self.predicate)(value) {
self.segment_collector.collect(doc, score)
}
}
fn harvest(self) -> TSegmentCollector::Fruit {
self.segment_collector.harvest()
}
}
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
/// it transparently wraps an inner [`Collector`] but filters documents
/// based on the result of applying the predicate to the bytes fast field.
///
/// A document is accepted if and only if the predicate returns `true` for at least one value.
///
/// In other words,
/// - documents with no values are filtered out.
/// - documents with several values are accepted if at least one value matches the predicate.
///
/// ```rust
/// use tantivy::collector::{TopDocs, BytesFilterCollector};
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let barcode = schema_builder.add_bytes_field("barcode", FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind", barcode => &b"010101"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib", barcode => &b"110011"[..]))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow", barcode => &b"110111"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", barcode => &b"011101"[..]))?;
/// index_writer.add_document(doc!(title => "Bridget Jones's Diary"))?;
/// index_writer.commit()?;
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 3));
/// # Ok(())
/// # }
/// ```
pub struct BytesFilterCollector<TCollector, TPredicate>
where TPredicate: 'static + Clone
{
// Field whose bytes fast field column is looked up (by name) for every segment.
field: Field,
// Wrapped collector; it only receives the documents accepted by the predicate.
collector: TCollector,
// Filter applied to the bytes values; cloned into each per-segment collector.
predicate: TPredicate,
}
impl<TCollector, TPredicate> BytesFilterCollector<TCollector, TPredicate>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
{
/// Create a new `BytesFilterCollector`.
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
collector,
}
}
}
impl<TCollector, TPredicate> Collector for BytesFilterCollector<TCollector, TPredicate>
where
    TCollector: Collector + Send + Sync,
    TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync + Clone,
{
    type Fruit = TCollector::Fruit;

    type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;

    /// Creates the per-segment filtering collector.
    ///
    /// The bytes column is resolved from the field name first, then the inner
    /// collector's segment collector is created; an error from either step is
    /// returned to the caller.
    fn for_segment(
        &self,
        segment_local_id: u32,
        segment_reader: &SegmentReader,
    ) -> crate::Result<Self::Child> {
        let schema = segment_reader.schema();
        let column_name = schema.get_field_name(self.field);
        let bytes_column_opt = segment_reader.fast_fields().bytes(column_name)?;
        let inner_segment_collector = self
            .collector
            .for_segment(segment_local_id, segment_reader)?;
        let child = BytesFilterSegmentCollector {
            column_opt: bytes_column_opt,
            segment_collector: inner_segment_collector,
            // Each segment collector owns its own copy of the predicate.
            predicate: self.predicate.clone(),
            // Starts empty; filled while resolving term ordinals.
            buffer: Vec::new(),
        };
        Ok(child)
    }

    /// Scoring is needed exactly when the wrapped collector needs it.
    fn requires_scoring(&self) -> bool {
        self.collector.requires_scoring()
    }

    /// Delegates the merge of the per-segment fruits to the wrapped collector.
    fn merge_fruits(
        &self,
        segment_fruits: Vec<<TCollector::Child as SegmentCollector>::Fruit>,
    ) -> crate::Result<TCollector::Fruit> {
        self.collector.merge_fruits(segment_fruits)
    }
}
/// Per-segment collector created by [`BytesFilterCollector`].
///
/// It forwards a document to the wrapped segment collector only when the predicate
/// returns `true` for at least one of the document's bytes values.
pub struct BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where TPredicate: 'static
{
// Bytes column of the segment; when `None`, every document is rejected.
column_opt: Option<BytesColumn>,
// Wrapped segment collector receiving the accepted documents.
segment_collector: TSegmentCollector,
// Filter evaluated against each bytes value.
predicate: TPredicate,
// Scratch buffer, cleared and refilled for each term ordinal that is resolved.
buffer: Vec<u8>,
}
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where
    TSegmentCollector: SegmentCollector,
    TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
{
    /// Returns `true` if the predicate accepts at least one bytes value of `doc_id`.
    ///
    /// Takes `&mut self` because the internal scratch buffer is overwritten for
    /// every term ordinal that gets resolved.
    #[inline]
    fn accept_document(&mut self, doc_id: DocId) -> bool {
        // Without a bytes column in this segment nothing can match.
        let Some(bytes_column) = &self.column_opt else {
            return false;
        };
        for term_ord in bytes_column.term_ords(doc_id) {
            self.buffer.clear();
            let resolved = bytes_column
                .ord_to_bytes(term_ord, &mut self.buffer)
                .unwrap_or(false);
            // A lookup that fails or reports `false` is skipped rather than propagated.
            if !resolved {
                continue;
            }
            if (self.predicate)(&self.buffer) {
                return true;
            }
        }
        false
    }
}
impl<TSegmentCollector, TPredicate> SegmentCollector
for BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
{
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
if self.accept_document(doc) {
self.segment_collector.collect(doc, score);
}
}
fn harvest(self) -> TSegmentCollector::Fruit {
fn harvest(self) -> <TSegmentCollector as SegmentCollector>::Fruit {
self.segment_collector.harvest()
}
}

View File

@@ -112,7 +112,7 @@ mod docset_collector;
pub use self::docset_collector::DocSetCollector;
mod filter_collector_wrapper;
pub use self::filter_collector_wrapper::{BytesFilterCollector, FilterCollector};
pub use self::filter_collector_wrapper::FilterCollector;
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.

View File

@@ -14,7 +14,7 @@ use crate::collector::{
};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::query::Weight;
use crate::{DocAddress, DocId, Order, Score, SegmentOrdinal, SegmentReader, TantivyError};
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
struct FastFieldConvertCollector<
TCollector: Collector<Fruit = Vec<(u64, DocAddress)>>,
@@ -23,7 +23,6 @@ struct FastFieldConvertCollector<
pub collector: TCollector,
pub field: String,
pub fast_value: std::marker::PhantomData<TFastValue>,
order: Order,
}
impl<TCollector, TFastValue> Collector for FastFieldConvertCollector<TCollector, TFastValue>
@@ -71,13 +70,7 @@ where
let raw_result = self.collector.merge_fruits(segment_fruits)?;
let transformed_result = raw_result
.into_iter()
.map(|(score, doc_address)| {
if self.order.is_desc() {
(TFastValue::from_u64(score), doc_address)
} else {
(TFastValue::from_u64(u64::MAX - score), doc_address)
}
})
.map(|(score, doc_address)| (TFastValue::from_u64(score), doc_address))
.collect::<Vec<_>>();
Ok(transformed_result)
}
@@ -138,23 +131,16 @@ impl fmt::Debug for TopDocs {
struct ScorerByFastFieldReader {
sort_column: Arc<dyn ColumnValues<u64>>,
order: Order,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 {
let value = self.sort_column.get_val(doc);
if self.order.is_desc() {
value
} else {
u64::MAX - value
}
self.sort_column.get_val(doc)
}
}
struct ScorerByField {
field: String,
order: Order,
}
impl CustomScorer<u64> for ScorerByField {
@@ -171,13 +157,8 @@ impl CustomScorer<u64> for ScorerByField {
sort_column_opt.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?;
let mut default_value = 0u64;
if self.order.is_asc() {
default_value = u64::MAX;
}
Ok(ScorerByFastFieldReader {
sort_column: sort_column.first_or_default_col(default_value),
order: self.order.clone(),
sort_column: sort_column.first_or_default_col(0u64),
})
}
}
@@ -249,7 +230,7 @@ impl TopDocs {
///
/// ```rust
/// # use tantivy::schema::{Schema, FAST, TEXT};
/// # use tantivy::{doc, Index, DocAddress, Order};
/// # use tantivy::{doc, Index, DocAddress};
/// # use tantivy::query::{Query, QueryParser};
/// use tantivy::Searcher;
/// use tantivy::collector::TopDocs;
@@ -287,7 +268,7 @@ impl TopDocs {
/// // Note the `rating_field` needs to be a FAST field here.
/// let top_books_by_rating = TopDocs
/// ::with_limit(10)
/// .order_by_fast_field("rating", Order::Desc);
/// .order_by_u64_field("rating");
///
/// // ... and here are our documents. Note this is a simple vec.
/// // The `u64` in the pair is the value of our fast field for
@@ -307,15 +288,13 @@ impl TopDocs {
///
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
fn order_by_u64_field(
pub fn order_by_u64_field(
self,
field: impl ToString,
order: Order,
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
CustomScoreTopCollector::new(
ScorerByField {
field: field.to_string(),
order,
},
self.0.into_tscore(),
)
@@ -337,7 +316,7 @@ impl TopDocs {
///
/// ```rust
/// # use tantivy::schema::{Schema, FAST, TEXT};
/// # use tantivy::{doc, Index, DocAddress,Order};
/// # use tantivy::{doc, Index, DocAddress};
/// # use tantivy::query::{Query, AllQuery};
/// use tantivy::Searcher;
/// use tantivy::collector::TopDocs;
@@ -375,7 +354,7 @@ impl TopDocs {
/// // type `sort_by_field`. revenue_field here is a FAST i64 field.
/// let top_company_by_revenue = TopDocs
/// ::with_limit(2)
/// .order_by_fast_field("revenue", Order::Desc);
/// .order_by_fast_field("revenue");
///
/// // ... and here are our documents. Note this is a simple vec.
/// // The `i64` in the pair is the value of our fast field for
@@ -393,17 +372,15 @@ impl TopDocs {
pub fn order_by_fast_field<TFastValue>(
self,
fast_field: impl ToString,
order: Order,
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>>
where
TFastValue: FastValue,
{
let u64_collector = self.order_by_u64_field(fast_field.to_string(), order.clone());
let u64_collector = self.order_by_u64_field(fast_field.to_string());
FastFieldConvertCollector {
collector: u64_collector,
field: fast_field.to_string(),
fast_value: PhantomData,
order,
}
}
@@ -744,7 +721,7 @@ mod tests {
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -905,7 +882,7 @@ mod tests {
});
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE, Order::Desc);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -944,7 +921,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday", Order::Desc);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday");
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -974,7 +951,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude", Order::Desc);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -1004,7 +981,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude", Order::Desc);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -1032,7 +1009,7 @@ mod tests {
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field", Order::Desc);
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field");
let segment_reader = searcher.segment_reader(0u32);
top_collector
.for_segment(0, segment_reader)
@@ -1050,7 +1027,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE, Order::Desc);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(matches!(err, crate::TantivyError::InvalidArgument(_)));
Ok(())
@@ -1067,7 +1044,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE, Order::Desc);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.")
@@ -1129,50 +1106,4 @@ mod tests {
let query = query_parser.parse_query(query).unwrap();
(index, query)
}
/// Checks that `order_by_fast_field(.., Order::Asc)` returns the smallest values first
/// and that the document without a `size` value comes last.
#[test]
fn test_fast_field_ascending_order() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field(TITLE, TEXT);
    let size = schema_builder.add_u64_field(SIZE, FAST);
    let schema = schema_builder.build();

    let (index, query) = index("beer", title, schema, |index_writer| {
        let documents = vec![
            doc!(title => "bottle of beer", size => 12u64),
            doc!(title => "growler of beer", size => 64u64),
            doc!(title => "pint of beer", size => 16u64),
            // Deliberately has no `size` value.
            doc!(title => "empty beer"),
        ];
        for document in documents {
            index_writer.add_document(document).unwrap();
        }
    });

    let searcher = index.reader()?.searcher();
    let ascending_by_size = TopDocs::with_limit(4).order_by_fast_field(SIZE, Order::Asc);
    let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &ascending_by_size)?;

    let expected = [
        (12, DocAddress::new(0, 0)),
        (16, DocAddress::new(0, 2)),
        (64, DocAddress::new(0, 1)),
        // The document lacking a value is reported with the largest u64.
        (u64::MAX, DocAddress::new(0, 3)),
    ];
    assert_eq!(&top_docs[..], &expected);
    Ok(())
}
}

View File

@@ -120,8 +120,8 @@ impl IndexBuilder {
Self {
schema: None,
index_settings: IndexSettings::default(),
tokenizer_manager: TokenizerManager::default_for_indexing(),
fast_field_tokenizer_manager: TokenizerManager::default_for_fast_fields(),
tokenizer_manager: TokenizerManager::default(),
fast_field_tokenizer_manager: TokenizerManager::default(),
}
}
@@ -400,8 +400,8 @@ impl Index {
settings: metas.index_settings.clone(),
directory,
schema,
tokenizers: TokenizerManager::default_for_indexing(),
fast_field_tokenizers: TokenizerManager::default_for_fast_fields(),
tokenizers: TokenizerManager::default(),
fast_field_tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
inventory,
}

View File

@@ -410,9 +410,7 @@ mod tests {
use super::IndexMeta;
use crate::core::index_meta::UntrackedIndexMeta;
use crate::schema::{Schema, TEXT};
use crate::store::Compressor;
#[cfg(feature = "zstd-compression")]
use crate::store::ZstdCompressor;
use crate::store::{Compressor, ZstdCompressor};
use crate::{IndexSettings, IndexSortByField, Order};
#[test]
@@ -448,7 +446,6 @@ mod tests {
}
#[test]
#[cfg(feature = "zstd-compression")]
fn test_serialize_metas_zstd_compressor() {
let schema = {
let mut schema_builder = Schema::builder();
@@ -485,14 +482,13 @@ mod tests {
}
#[test]
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
fn test_serialize_metas_invalid_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
"unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \
`zstd(compression_level=5)` at line 1 column 96"
.to_string()
);
@@ -506,20 +502,6 @@ mod tests {
);
}
#[test]
#[cfg(not(feature = "zstd-compression"))]
fn test_serialize_metas_unsupported_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
line 1 column 95"
.to_string()
);
}
#[test]
#[cfg(feature = "lz4-compression")]
fn test_index_settings_default() {

View File

@@ -259,7 +259,7 @@ pub(crate) fn set_string_and_get_terms(
/// Writes a value of a JSON field to a `Term`.
/// The Term format is as follows:
/// `[JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES]`
/// [JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES]
pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,

View File

@@ -2,6 +2,8 @@ use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::{fmt, io};
use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
@@ -149,7 +151,7 @@ impl SegmentReader {
let store_file = segment.open_read(SegmentComponent::Store)?;
crate::fail_point!("SegmentReader::open#middle");
fail_point!("SegmentReader::open#middle");
let postings_file = segment.open_read(SegmentComponent::Postings)?;
let postings_composite = CompositeFile::open(&postings_file)?;

View File

@@ -5,6 +5,7 @@ use std::sync::{Arc, RwLock};
use std::{fmt, result};
use common::HasLen;
use fail::fail_point;
use super::FileHandle;
use crate::core::META_FILEPATH;
@@ -183,7 +184,7 @@ impl Directory for RamDirectory {
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
crate::fail_point!("RamDirectory::delete", |_| {
fail_point!("RamDirectory::delete", |_| {
Err(DeleteError::IoError {
io_error: Arc::new(io::Error::from(io::ErrorKind::Other)),
filepath: path.to_path_buf(),

View File

@@ -446,8 +446,7 @@ mod tests {
#[test]
fn test_text_fastfield() {
let mut schema_builder = Schema::builder();
let text_options: TextOptions = TextOptions::from(TEXT).set_fast("raw");
let text_field = schema_builder.add_text_field("text", text_options);
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1083,7 +1082,7 @@ mod tests {
#[test]
fn test_fast_field_in_json_field_expand_dots_disabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast("default");
let json_option = JsonObjectOptions::default().set_fast(None);
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1109,7 +1108,7 @@ mod tests {
#[test]
fn test_fast_field_in_json_field_with_tokenizer() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast("default");
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1135,7 +1134,7 @@ mod tests {
fn test_fast_field_in_json_field_expand_dots_enabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast("default")
.set_fast(None)
.set_expand_dots_enabled();
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
@@ -1203,10 +1202,10 @@ mod tests {
#[test]
fn test_fast_field_tokenizer() {
let mut schema_builder = Schema::builder();
let opt = TextOptions::default().set_fast("custom_lowercase");
let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
let text_field = schema_builder.add_text_field("text", opt);
let schema = schema_builder.build();
let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
let ff_tokenizer_manager = TokenizerManager::default();
ff_tokenizer_manager.register(
"custom_lowercase",
TextAnalyzer::builder(RawTokenizer::default())
@@ -1239,7 +1238,7 @@ mod tests {
.set_index_option(crate::schema::IndexRecordOption::WithFreqs)
.set_tokenizer("raw"),
)
.set_fast("default")
.set_fast(Some("default"))
.set_stored();
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
@@ -1272,7 +1271,7 @@ mod tests {
fn test_shadowing_fast_field_with_expand_dots() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast("default")
.set_fast(None)
.set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);

View File

@@ -88,7 +88,7 @@ impl FastFieldReaders {
let Some((field, path)): Option<(Field, &str)> = self
.schema
.find_field_with_default(field_name, default_field_opt)
else {
else{
return Ok(None);
};
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
@@ -120,8 +120,7 @@ impl FastFieldReaders {
T: HasAssociatedColumnType,
DynamicColumn: Into<Option<Column<T>>>,
{
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, T::column_type())?
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, T::column_type())?
else {
return Ok(None);
};
@@ -197,8 +196,7 @@ impl FastFieldReaders {
/// Returns a `str` column.
pub fn str(&self, field_name: &str) -> crate::Result<Option<StrColumn>> {
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, ColumnType::Str)?
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Str)?
else {
return Ok(None);
};
@@ -208,8 +206,7 @@ impl FastFieldReaders {
/// Returns a `bytes` column.
pub fn bytes(&self, field_name: &str) -> crate::Result<Option<BytesColumn>> {
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, ColumnType::Bytes)?
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Bytes)?
else {
return Ok(None);
};
@@ -349,7 +346,7 @@ mod tests {
schema_builder.add_json_field(
"json_expand_dots_enabled",
JsonObjectOptions::default()
.set_fast("default")
.set_fast(None)
.set_expand_dots_enabled(),
);
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);

View File

@@ -18,8 +18,6 @@ const JSON_DEPTH_LIMIT: usize = 20;
pub struct FastFieldsWriter {
columnar_writer: ColumnarWriter,
fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
// Field -> Fast field tokenizer mapping.
// All text fast fields should have a tokenizer.
per_field_tokenizer: Vec<Option<TextAnalyzer>>,
date_precisions: Vec<DateTimePrecision>,
expand_dots: Vec<bool>,
@@ -63,7 +61,7 @@ impl FastFieldsWriter {
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"Tokenizer `{tokenizer_name}` not found"
"Tokenizer {tokenizer_name:?} not found"
))
})?;
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
@@ -159,6 +157,9 @@ impl FastFieldsWriter {
&token.text,
);
})
} else {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
}
Value::Bytes(bytes_val) => {
@@ -200,20 +201,18 @@ impl FastFieldsWriter {
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
let text_analyzer_opt =
let text_analyzer =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
if let Some(text_analyzer) = text_analyzer_opt {
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
Value::IpAddr(ip_addr) => {
self.columnar_writer
@@ -264,7 +263,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
text_analyzer: &mut TextAnalyzer,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
@@ -289,7 +288,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit,
json_path_buffer,
columnar_writer,
text_analyzer,
tokenizer,
);
// popping our sub path.
json_path_buffer.truncate(len_path);
@@ -303,7 +302,7 @@ fn record_json_value_to_columnar_writer(
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
text_analyzer: &mut TextAnalyzer,
tokenizer: &mut Option<TextAnalyzer>,
) {
if remaining_depth_limit == 0 {
return;
@@ -322,10 +321,14 @@ fn record_json_value_to_columnar_writer(
}
}
serde_json::Value::String(text) => {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
});
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
}
}
serde_json::Value::Array(arr) => {
for el in arr {
@@ -336,7 +339,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
text_analyzer,
tokenizer,
);
}
}
@@ -348,7 +351,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
text_analyzer,
tokenizer,
);
}
}
@@ -368,9 +371,6 @@ mod tests {
) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default();
let mut json_path = String::new();
let mut text_analyzer = crate::tokenizer::TokenizerManager::default_for_fast_fields()
.get(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER)
.unwrap();
for (doc, json_doc) in json_docs.iter().enumerate() {
record_json_value_to_columnar_writer(
doc as u32,
@@ -379,7 +379,7 @@ mod tests {
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,
&mut text_analyzer,
&mut None,
);
}
let mut buffer = Vec::new();
@@ -399,7 +399,6 @@ mod tests {
});
let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
let columns = columnar_reader.list_columns().unwrap();
assert_eq!(columns.len(), 5);
{
assert_eq!(columns[0].0, "arr");
let column_arr_opt: Option<StrColumn> = columns[0].1.open().unwrap().into();
@@ -435,9 +434,7 @@ mod tests {
{
assert_eq!(columns[4].0, "text");
let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
let column_text = column_text_opt.unwrap();
let term_ords: Vec<u64> = column_text.term_ords(0).collect();
assert_eq!(&term_ords[..], &[0]);
assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
}
}

View File

@@ -6,6 +6,7 @@ use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use fail::fail_point;
use rayon::{ThreadPool, ThreadPoolBuilder};
use super::segment_manager::SegmentManager;
@@ -42,7 +43,7 @@ pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate:
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
crate::fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
std::io::Error::new(
std::io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())

View File

@@ -1,6 +1,5 @@
use columnar::MonotonicallyMappableToU64;
use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
@@ -16,7 +15,7 @@ use crate::postings::{
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
use crate::{DocId, Document, Opstamp, SegmentComponent};
/// Computes the initial size of the hash table.
///
@@ -99,18 +98,14 @@ impl SegmentWriter {
}
_ => None,
};
let tokenizer_name = text_options
.map(|text_index_option| text_index_option.tokenizer())
.unwrap_or("default");
tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::SchemaError(format!(
"Error getting tokenizer for field: {}",
field_entry.name()
))
})
text_options
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
tokenizer_manager.get(tokenizer_name)
})
.unwrap_or_default()
})
.collect::<Result<Vec<_>, _>>()?;
.collect();
Ok(SegmentWriter {
max_doc: 0,
ctx: IndexingContext::new(table_size),
@@ -210,7 +205,7 @@ impl SegmentWriter {
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
PreTokenizedStream::from(tok_str.clone()).into()
}
Value::Str(ref text) => {
let text_analyzer =
@@ -443,9 +438,7 @@ fn remap_and_write(
#[cfg(test)]
mod tests {
use std::path::{Path, PathBuf};
use tempfile::TempDir;
use std::path::Path;
use super::compute_initial_table_size;
use crate::collector::Count;
@@ -453,9 +446,7 @@ mod tests {
use crate::directory::RamDirectory;
use crate::postings::TermInfo;
use crate::query::PhraseQuery;
use crate::schema::{
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
};
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
@@ -909,32 +900,4 @@ mod tests {
postings.positions(&mut positions);
assert_eq!(positions, &[4]); //< as opposed to 3 if we had a position length of 1.
}
/// Committing a document for a field whose tokenizer was never registered must
/// fail with a schema error naming the field.
#[test]
fn test_show_error_when_tokenizer_not_registered() {
    // "custom_en" is referenced by the schema but never registered on the index.
    let title_options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("custom_en")
                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
        )
        .set_stored();
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", title_options);
    let schema = schema_builder.build();

    // Create the index on disk, then reopen it from the same directory.
    let tempdir = TempDir::new().unwrap();
    let index_path = PathBuf::from(tempdir.path());
    Index::create_in_dir(&index_path, schema).unwrap();
    let index = Index::open_in_dir(index_path).unwrap();

    let schema = index.schema();
    let mut index_writer = index.writer(50_000_000).unwrap();
    let title = schema.get_field("title").unwrap();

    let mut document = Document::default();
    document.add_text(title, "The Old Man and the Sea");
    index_writer.add_document(document).unwrap();

    // Adding the document succeeds; the failure only shows up on commit.
    let commit_error = index_writer.commit().unwrap_err();
    assert_eq!(
        commit_error.to_string(),
        "Schema error: 'Error getting tokenizer for field: title'"
    );
}
}

View File

@@ -101,7 +101,6 @@ mod test {
use super::Stamper;
#[allow(clippy::redundant_clone)]
#[test]
fn test_stamper() {
let stamper = Stamper::new(7u64);
@@ -117,7 +116,6 @@ mod test {
assert_eq!(stamper.stamp(), 15u64);
}
#[allow(clippy::redundant_clone)]
#[test]
fn test_stamper_revert() {
let stamper = Stamper::new(7u64);

View File

@@ -191,7 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
#[cfg(all(feature = "mmap", unix))]
#[cfg(unix)]
pub use memmap2::Advice;
/// Structure version for the index.
@@ -299,35 +299,6 @@ pub struct DocAddress {
pub doc_id: DocId,
}
#[macro_export]
/// Evaluates a fail point, but only when the `failpoints` feature is enabled.
///
/// Every arm wraps its body in `#[cfg(feature = "failpoints")]`, so without that
/// feature an invocation expands to an empty block.
///
/// Supported forms:
/// - `fail_point!(name)`: evaluates the fail point through `fail::eval`; the closure
///   passed to it panics, since (per its message) returning a value is not supported
///   in this form.
/// - `fail_point!(name, closure)`: evaluates the fail point and, if `fail::eval`
///   yields `Some(res)`, returns `res` from the enclosing function.
/// - `fail_point!(name, cond, closure)`: forwards to `fail::fail_point!(name, closure)`
///   only when `cond` is true.
macro_rules! fail_point {
// Form 1: name only; no early return is possible.
($name:expr) => {{
#[cfg(feature = "failpoints")]
{
fail::eval($name, |_| {
panic!("Return is not supported for the fail point \"{}\"", $name);
});
}
}};
// Form 2: name + closure; a `Some` result makes the enclosing function return early.
($name:expr, $e:expr) => {{
#[cfg(feature = "failpoints")]
{
if let Some(res) = fail::eval($name, $e) {
return res;
}
}
}};
// Form 3: name + condition + closure; the fail point is only evaluated when `$cond` holds.
($name:expr, $cond:expr, $e:expr) => {{
#[cfg(feature = "failpoints")]
{
if $cond {
fail::fail_point!($name, $e);
}
}
}};
}
#[cfg(test)]
pub mod tests {
use common::{BinarySerializable, FixedSize};
@@ -905,8 +876,8 @@ pub mod tests {
}"#,
)
.unwrap();
let doc = doc!(json_field=>json_val);
let index = Index::create_in_ram(schema);
let doc = doc!(json_field=>json_val.clone());
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
writer.commit().unwrap();

View File

@@ -2,6 +2,7 @@ use std::cmp::Ordering;
use std::io::{self, Write};
use common::{BinarySerializable, CountingWriter, VInt};
use fail::fail_point;
use super::TermInfo;
use crate::core::Segment;
@@ -204,7 +205,7 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
crate::fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
Err(io::Error::new(io::ErrorKind::Other, format!("{msg:?}")))
});
if self.term_open {

View File

@@ -4,7 +4,9 @@ use std::collections::{BinaryHeap, HashMap};
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::tokenizer::{
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
};
use crate::{DocAddress, Result, Searcher, TantivyError};
#[derive(Debug, PartialEq)]
@@ -204,7 +206,8 @@ impl MoreLikeThis {
for value in values {
match value {
Value::PreTokStr(tok_str) => {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
let mut token_stream: BoxTokenStream =
PreTokenizedStream::from(tok_str.clone()).into();
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);

View File

@@ -956,7 +956,7 @@ mod test {
.iter()
.flat_map(|field_name| schema.get_field(field_name))
.collect();
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::builder(SimpleTokenizer::default())
@@ -1447,7 +1447,7 @@ mod test {
let title = schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let default_fields = vec![title];
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
assert_matches!(
@@ -1622,8 +1622,7 @@ mod test {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field(r#"a\.b"#, STRING);
let schema = schema_builder.build();
let query_parser =
QueryParser::new(schema, Vec::new(), TokenizerManager::default_for_indexing());
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
assert_eq!(
format!("{query:?}"),
@@ -1640,11 +1639,8 @@ mod test {
schema_builder.add_text_field("first.toto.titi", STRING);
schema_builder.add_text_field("third.a.b.c", STRING);
let schema = schema_builder.build();
let query_parser = QueryParser::new(
schema.clone(),
Vec::new(),
TokenizerManager::default_for_indexing(),
);
let query_parser =
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
assert_eq!(
query_parser.split_full_path("first.toto"),
Some((schema.get_field("first.toto").unwrap(), ""))

View File

@@ -472,7 +472,6 @@ mod tests {
use super::RangeQuery;
use crate::collector::{Count, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT};
use crate::{doc, Index};
@@ -548,8 +547,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 60_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let mut index_writer = index.writer_with_num_threads(2, 60_000_000)?;
for i in 1..100 {
let mut doc = Document::new();
@@ -559,9 +557,6 @@ mod tests {
}
}
index_writer.add_document(doc)?;
if i == 10 {
index_writer.commit()?;
}
}
index_writer.commit()?;

View File

@@ -31,10 +31,9 @@ impl IPFastFieldRangeWeight {
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
reader.fast_fields().column_opt(&self.field)?
else {
return Ok(Box::new(EmptyScorer));
let Some(ip_addr_column): Option<Column<Ipv6Addr>> = reader.fast_fields()
.column_opt(&self.field)? else {
return Ok(Box::new(EmptyScorer))
};
let value_range = bound_to_value_range(
&self.lower_bound,

View File

@@ -71,9 +71,7 @@ impl Weight for FastFieldRangeWeight {
let column_type_opt_ref: Option<&[ColumnType]> = column_type_opt
.as_ref()
.map(|column_types| column_types.as_slice());
let Some((column, _)) =
fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)?
else {
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)? else {
return Ok(Box::new(EmptyScorer));
};
let value_range = bound_to_value_range(

View File

@@ -72,14 +72,6 @@ impl Query for TermSetQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(self.specialized_weight(enable_scoring.schema())?))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
for terms in self.terms_map.values() {
for term in terms {
visitor(term, false);
}
}
}
}
struct SetDfaWrapper(Map<Vec<u8>>);

View File

@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
use super::text_options::{FastFieldTextOptions, TokenizerName};
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
use crate::schema::{TextFieldIndexing, TextOptions, DEFAULT_FAST_FIELD_TOKENIZER};
use crate::schema::{TextFieldIndexing, TextOptions};
/// The `JsonObjectOptions` make it possible to
/// configure how a json object field should be indexed and stored.
@@ -58,19 +58,20 @@ impl JsonObjectOptions {
/// Returns true if and only if the json object fields are
/// to be treated as fast fields.
pub fn is_fast(&self) -> bool {
match self.fast {
FastFieldTextOptions::Disabled => false,
FastFieldTextOptions::Enabled { .. } => true,
}
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
}
/// Returns the name of the fast field tokenizer, if one is configured.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
FastFieldTextOptions::Disabled => None,
FastFieldTextOptions::Enabled {
tokenizer: with_tokenizer,
} => Some(with_tokenizer.name()),
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
}
}
@@ -129,11 +130,15 @@ impl JsonObjectOptions {
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use]
pub fn set_fast(mut self, tokenizer_name: &str) -> Self {
let with_tokenizer = TokenizerName::from_name(tokenizer_name);
self.fast = FastFieldTextOptions::Enabled {
tokenizer: with_tokenizer,
};
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldTextOptions::IsEnabled(true);
}
self
}
@@ -161,9 +166,7 @@ impl From<FastFlag> for JsonObjectOptions {
JsonObjectOptions {
stored: false,
indexing: None,
fast: FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER),
},
fast: FastFieldTextOptions::IsEnabled(true),
expand_dots_enabled: false,
}
}

View File

@@ -1,6 +1,6 @@
//! Schema definition for tantivy's indices.
//! # Setting your schema in Tantivy
//!
//! # Setting your schema in Tantivy
//!
//! Tantivy has a very strict schema.
//! The schema defines information about the fields your index contains, that is, for each field:
@@ -153,8 +153,6 @@ pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
pub use self::value::Value;
pub(crate) const DEFAULT_FAST_FIELD_TOKENIZER: &str = "default";
/// Validator for a potential `field_name`.
/// Returns true if the name can be use for a field name.
///

View File

@@ -24,68 +24,19 @@ pub struct TextOptions {
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(
into = "FastFieldTextOptionsForSerialization",
from = "FastFieldTextOptionsForSerialization"
)]
#[serde(untagged)]
/// Enum to control the fast field setting of a text field.
#[derive(Default)]
pub(crate) enum FastFieldTextOptions {
/// Fastfield disabled
#[default]
Disabled,
/// Flag to enable/disable
IsEnabled(bool),
/// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
/// `Index::fast_field_tokenizer`.
Enabled { tokenizer: TokenizerName },
EnabledWithTokenizer { with_tokenizer: TokenizerName },
}
/// Enum used to control the way we serialize fast field text options.
///
/// For backward compatibility reasons, we follow the format introduced in tantivy 0.19.
/// `false` -> Disabled
/// `true` -> Enabled with default tokenizer
/// `{ tokenizer: "something" }` -> Enabled with a specific tokenizer.
#[derive(Serialize, Deserialize)]
#[serde(untagged)]
enum FastFieldTextOptionsForSerialization {
/// Plain boolean form (`true` / `false`).
IsEnabled(bool),
/// Object form carrying the name of the tokenizer to use.
EnabledWithTokenizer {
// The `with_tokenizer` key is accepted as an alternative name for this
// field when deserializing.
#[serde(alias = "with_tokenizer")]
tokenizer: TokenizerName,
},
}
impl From<FastFieldTextOptionsForSerialization> for FastFieldTextOptions {
    /// Converts the serialized representation into the in-memory options.
    ///
    /// `IsEnabled(false)` becomes `Disabled`, `IsEnabled(true)` becomes `Enabled` with
    /// `DEFAULT_FAST_FIELD_TOKENIZER`, and an explicit tokenizer is passed through as is.
    fn from(value: FastFieldTextOptionsForSerialization) -> Self {
        match value {
            FastFieldTextOptionsForSerialization::IsEnabled(false) => {
                FastFieldTextOptions::Disabled
            }
            FastFieldTextOptionsForSerialization::IsEnabled(true) => {
                let tokenizer =
                    TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER);
                FastFieldTextOptions::Enabled { tokenizer }
            }
            FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer } => {
                FastFieldTextOptions::Enabled { tokenizer }
            }
        }
    }
}
impl From<FastFieldTextOptions> for FastFieldTextOptionsForSerialization {
fn from(value: FastFieldTextOptions) -> Self {
match value {
FastFieldTextOptions::Disabled => {
FastFieldTextOptionsForSerialization::IsEnabled(false)
}
FastFieldTextOptions::Enabled { tokenizer } => {
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer }
}
}
impl Default for FastFieldTextOptions {
fn default() -> Self {
FastFieldTextOptions::IsEnabled(false)
}
}
@@ -94,13 +45,23 @@ impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
match (self, other) {
(FastFieldTextOptions::Enabled { tokenizer }, _)
| (_, FastFieldTextOptions::Enabled { tokenizer }) => {
FastFieldTextOptions::Enabled { tokenizer }
}
(FastFieldTextOptions::Disabled, FastFieldTextOptions::Disabled) => {
FastFieldTextOptions::Disabled
}
(
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
_,
)
| (
_,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
) => FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
},
(FastFieldTextOptions::IsEnabled(true), _)
| (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
(_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
}
}
}
@@ -122,17 +83,20 @@ impl TextOptions {
/// Returns true if and only if the value is a fast field.
pub fn is_fast(&self) -> bool {
match &self.fast {
FastFieldTextOptions::Disabled => false,
FastFieldTextOptions::Enabled { .. } => true,
}
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|| matches!(
&self.fast,
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
)
}
/// Returns the name of the fast field tokenizer, if one is configured.
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
match &self.fast {
FastFieldTextOptions::Disabled => None,
FastFieldTextOptions::Enabled { tokenizer } => Some(tokenizer.name()),
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
} => Some(tokenizer.name()),
}
}
@@ -157,9 +121,15 @@ impl TextOptions {
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
/// from the dictionary.
#[must_use]
pub fn set_fast(mut self, tokenizer_name: &str) -> TextOptions {
let tokenizer = TokenizerName::from_name(tokenizer_name);
self.fast = FastFieldTextOptions::Enabled { tokenizer };
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
if let Some(tokenizer) = tokenizer_name {
let tokenizer = TokenizerName::from_name(tokenizer);
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: tokenizer,
}
} else {
self.fast = FastFieldTextOptions::IsEnabled(true);
}
self
}
@@ -293,7 +263,7 @@ pub const STRING: TextOptions = TextOptions {
record: IndexRecordOption::Basic,
}),
stored: false,
fast: FastFieldTextOptions::Disabled,
fast: FastFieldTextOptions::IsEnabled(false),
coerce: false,
};
@@ -306,7 +276,7 @@ pub const TEXT: TextOptions = TextOptions {
}),
stored: false,
coerce: false,
fast: FastFieldTextOptions::Disabled,
fast: FastFieldTextOptions::IsEnabled(false),
};
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -356,9 +326,7 @@ impl From<FastFlag> for TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER),
},
fast: FastFieldTextOptions::IsEnabled(true),
coerce: false,
}
}
@@ -424,21 +392,21 @@ mod tests {
#[test]
fn serde_fast_field_tokenizer() {
let json = r#" {
"fast": { "tokenizer": "default" }
"fast": { "with_tokenizer": "default" }
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static("default")
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default")
}
);
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static("default")
FastFieldTextOptions::EnabledWithTokenizer {
with_tokenizer: TokenizerName::from_static("default")
}
);
@@ -446,28 +414,18 @@ mod tests {
"fast": true
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
}
);
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(
options.fast,
FastFieldTextOptions::Enabled {
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
}
);
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
let json = r#" {
"fast": false
} "#;
let options: TextOptions = serde_json::from_str(json).unwrap();
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
let options: TextOptions =
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
}
}

View File

@@ -693,7 +693,7 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("bc"), 1.0);
let fragments = search_fragments(
&mut From::from(NgramTokenizer::all_ngrams(2, 2).unwrap()),
&mut From::from(NgramTokenizer::all_ngrams(2, 2)),
text,
&terms,
3,

View File

@@ -0,0 +1,19 @@
use std::io;
/// Compresses `uncompressed` with brotli at quality level 5.
///
/// `compressed` is cleared first and then receives the encoded bytes. Any error
/// reported by the encoder is returned to the caller.
#[inline]
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
    compressed.clear();
    let encoder_params = brotli::enc::BrotliEncoderParams {
        quality: 5,
        ..Default::default()
    };
    // The value returned on success is not needed by callers.
    brotli::BrotliCompress(&mut uncompressed, compressed, &encoder_params).map(|_written| ())
}
/// Decompresses a brotli-encoded buffer.
///
/// `decompressed` is cleared first and then receives the decoded bytes. Any error
/// reported by the decoder is returned to the caller.
#[inline]
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
    decompressed.clear();
    brotli::BrotliDecompress(&mut compressed, decompressed).map(|_| ())
}

View File

@@ -0,0 +1,17 @@
use std::io::{self, Read, Write};
/// Compresses `uncompressed` using the snappy frame format.
///
/// `compressed` is cleared first; the frame encoder then writes into it and is
/// flushed before returning so that all buffered data reaches the output.
#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
    compressed.clear();
    let mut frame_writer = snap::write::FrameEncoder::new(compressed);
    frame_writer
        .write_all(uncompressed)
        .and_then(|()| frame_writer.flush())
}
/// Decompresses a snappy frame-encoded buffer.
///
/// `decompressed` is cleared first and then filled by reading the frame decoder
/// to its end. Any I/O error from the decoder is returned to the caller.
#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
    decompressed.clear();
    let mut frame_reader = snap::read::FrameDecoder::new(compressed);
    // The byte count returned by `read_to_end` is not needed by callers.
    frame_reader.read_to_end(decompressed).map(|_bytes_read| ())
}

View File

@@ -17,10 +17,12 @@ pub enum Compressor {
/// No compression
None,
/// Use the lz4 compressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli compressor
Brotli,
/// Use the snap compressor
Snappy,
/// Use the zstd compressor
#[cfg(feature = "zstd-compression")]
Zstd(ZstdCompressor),
}
@@ -29,9 +31,9 @@ impl Serialize for Compressor {
where S: serde::Serializer {
match *self {
Compressor::None => serializer.serialize_str("none"),
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => serializer.serialize_str("lz4"),
#[cfg(feature = "zstd-compression")]
Compressor::Brotli => serializer.serialize_str("brotli"),
Compressor::Snappy => serializer.serialize_str("snappy"),
Compressor::Zstd(zstd) => serializer.serialize_str(&zstd.ser_to_string()),
}
}
@@ -43,38 +45,27 @@ impl<'de> Deserialize<'de> for Compressor {
let buf = String::deserialize(deserializer)?;
let compressor = match buf.as_str() {
"none" => Compressor::None,
#[cfg(feature = "lz4-compression")]
"lz4" => Compressor::Lz4,
#[cfg(not(feature = "lz4-compression"))]
"lz4" => {
return Err(serde::de::Error::custom(
"unsupported variant `lz4`, please enable Tantivy's `lz4-compression` feature",
))
}
#[cfg(feature = "zstd-compression")]
_ if buf.starts_with("zstd") => Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
),
#[cfg(not(feature = "zstd-compression"))]
_ if buf.starts_with("zstd") => {
return Err(serde::de::Error::custom(
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` \
feature",
))
}
"brotli" => Compressor::Brotli,
"snappy" => Compressor::Snappy,
_ => {
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
#[cfg(feature = "lz4-compression")]
"lz4",
#[cfg(feature = "zstd-compression")]
"zstd",
#[cfg(feature = "zstd-compression")]
"zstd(compression_level=5)",
],
));
if buf.starts_with("zstd") {
Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
)
} else {
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
"lz4",
"brotli",
"snappy",
"zstd",
"zstd(compression_level=5)",
],
));
}
}
};
@@ -136,15 +127,18 @@ impl ZstdCompressor {
}
impl Default for Compressor {
#[allow(unreachable_code)]
fn default() -> Self {
#[cfg(feature = "lz4-compression")]
return Compressor::Lz4;
#[cfg(feature = "zstd-compression")]
return Compressor::Zstd(ZstdCompressor::default());
Compressor::None
if cfg!(feature = "lz4-compression") {
Compressor::Lz4
} else if cfg!(feature = "brotli-compression") {
Compressor::Brotli
} else if cfg!(feature = "snappy-compression") {
Compressor::Snappy
} else if cfg!(feature = "zstd-compression") {
Compressor::Zstd(ZstdCompressor::default())
} else {
Compressor::None
}
}
}
@@ -161,14 +155,50 @@ impl Compressor {
compressed.extend_from_slice(uncompressed);
Ok(())
}
#[cfg(feature = "lz4-compression")]
Self::Lz4 => super::compression_lz4_block::compress(uncompressed, compressed),
#[cfg(feature = "zstd-compression")]
Self::Zstd(_zstd_compressor) => super::compression_zstd_block::compress(
uncompressed,
compressed,
_zstd_compressor.compression_level,
),
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::compress(uncompressed, compressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
    #[cfg(feature = "brotli-compression")]
    {
        super::compression_brotli::compress(uncompressed, compressed)
    }
    // The variant exists even when the feature is off, so reaching this arm is a
    // configuration error. The message must name the exact Cargo feature to enable.
    #[cfg(not(feature = "brotli-compression"))]
    {
        panic!("brotli-compression feature flag not activated");
    }
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::compress(uncompressed, compressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd(_zstd_compressor) => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::compress(
uncompressed,
compressed,
_zstd_compressor.compression_level,
)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
}
}
}

View File

@@ -16,10 +16,12 @@ pub enum Decompressor {
/// No compression
None,
/// Use the lz4 decompressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli decompressor
Brotli,
/// Use the snap decompressor
Snappy,
/// Use the zstd decompressor
#[cfg(feature = "zstd-compression")]
Zstd,
}
@@ -27,9 +29,9 @@ impl From<Compressor> for Decompressor {
fn from(compressor: Compressor) -> Self {
match compressor {
Compressor::None => Decompressor::None,
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => Decompressor::Lz4,
#[cfg(feature = "zstd-compression")]
Compressor::Brotli => Decompressor::Brotli,
Compressor::Snappy => Decompressor::Snappy,
Compressor::Zstd(_) => Decompressor::Zstd,
}
}
@@ -39,9 +41,9 @@ impl Decompressor {
pub(crate) fn from_id(id: u8) -> Decompressor {
match id {
0 => Decompressor::None,
#[cfg(feature = "lz4-compression")]
1 => Decompressor::Lz4,
#[cfg(feature = "zstd-compression")]
2 => Decompressor::Brotli,
3 => Decompressor::Snappy,
4 => Decompressor::Zstd,
_ => panic!("unknown compressor id {id:?}"),
}
@@ -50,9 +52,9 @@ impl Decompressor {
pub(crate) fn get_id(&self) -> u8 {
match self {
Self::None => 0,
#[cfg(feature = "lz4-compression")]
Self::Lz4 => 1,
#[cfg(feature = "zstd-compression")]
Self::Brotli => 2,
Self::Snappy => 3,
Self::Zstd => 4,
}
}
@@ -75,10 +77,46 @@ impl Decompressor {
decompressed.extend_from_slice(compressed);
Ok(())
}
#[cfg(feature = "lz4-compression")]
Self::Lz4 => super::compression_lz4_block::decompress(compressed, decompressed),
#[cfg(feature = "zstd-compression")]
Self::Zstd => super::compression_zstd_block::decompress(compressed, decompressed),
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::decompress(compressed, decompressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::decompress(compressed, decompressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
}
}
}
@@ -91,9 +129,9 @@ mod tests {
#[test]
fn compressor_decompressor_id_test() {
assert_eq!(Decompressor::from(Compressor::None), Decompressor::None);
#[cfg(feature = "lz4-compression")]
assert_eq!(Decompressor::from(Compressor::Lz4), Decompressor::Lz4);
#[cfg(feature = "zstd-compression")]
assert_eq!(Decompressor::from(Compressor::Brotli), Decompressor::Brotli);
assert_eq!(Decompressor::from(Compressor::Snappy), Decompressor::Snappy);
assert_eq!(
Decompressor::from(Compressor::Zstd(Default::default())),
Decompressor::Zstd

View File

@@ -4,8 +4,8 @@
//! order to be handled in the `Store`.
//!
//! Internally, documents (or rather their stored fields) are serialized to a buffer.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed
//! using LZ4 or Zstd and the resulting block is written to disk.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
//! `LZ4` or `snappy` and the resulting block is written to disk.
//!
//! One can then request for a specific `DocId`.
//! A skip list helps navigating to the right block,
@@ -48,6 +48,12 @@ pub(crate) const DOC_STORE_VERSION: u32 = 1;
#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;
#[cfg(feature = "brotli-compression")]
mod compression_brotli;
#[cfg(feature = "snappy-compression")]
mod compression_snap;
#[cfg(feature = "zstd-compression")]
mod compression_zstd_block;
@@ -194,6 +200,16 @@ pub mod tests {
fn test_store_lz4_block() -> crate::Result<()> {
test_store(Compressor::Lz4, BLOCK_SIZE, true)
}
#[cfg(feature = "snappy-compression")]
#[test]
fn test_store_snap() -> crate::Result<()> {
test_store(Compressor::Snappy, BLOCK_SIZE, true)
}
#[cfg(feature = "brotli-compression")]
#[test]
fn test_store_brotli() -> crate::Result<()> {
test_store(Compressor::Brotli, BLOCK_SIZE, true)
}
#[cfg(feature = "zstd-compression")]
#[test]
@@ -245,8 +261,8 @@ pub mod tests {
Ok(())
}
#[cfg(feature = "snappy-compression")]
#[cfg(feature = "lz4-compression")]
#[cfg(feature = "zstd-compression")]
#[test]
fn test_merge_with_changed_compressor() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
@@ -278,7 +294,7 @@ pub mod tests {
);
// Change compressor, this disables stacking on merging
let index_settings = index.settings_mut();
index_settings.docstore_compression = Compressor::Zstd(Default::default());
index_settings.docstore_compression = Compressor::Snappy;
// Merging the segments
{
let segment_ids = index
@@ -300,7 +316,7 @@ pub mod tests {
LOREM.to_string()
);
}
assert_eq!(store.decompressor(), Decompressor::Zstd);
assert_eq!(store.decompressor(), Decompressor::Snappy);
Ok(())
}

View File

@@ -426,7 +426,7 @@ mod tests {
assert_eq!(store.cache_stats().cache_hits, 1);
assert_eq!(store.cache_stats().cache_misses, 2);
assert_eq!(store.cache.peek_lru(), Some(11207));
assert_eq!(store.cache.peek_lru(), Some(11163));
Ok(())
}

View File

@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer::TextAnalyzer;
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;
@@ -189,7 +189,7 @@ pub mod tests {
#[test]
fn test_raw_tokenizer2() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
let mut tokens: Vec<Token> = vec![];
{
@@ -206,7 +206,7 @@ pub mod tests {
#[test]
fn test_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
@@ -228,7 +228,7 @@ pub mod tests {
#[test]
fn test_non_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::builder(SimpleTokenizer::default())
@@ -256,7 +256,7 @@ pub mod tests {
#[test]
fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
{
let mut tokens: Vec<Token> = vec![];
@@ -282,7 +282,7 @@ pub mod tests {
#[test]
fn test_whitespace_tokenizer() {
let tokenizer_manager = TokenizerManager::default_for_indexing();
let tokenizer_manager = TokenizerManager::default();
let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
let mut tokens: Vec<Token> = vec![];
{

View File

@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use crate::TantivyError;
/// Tokenize the text by splitting words into n-grams of the given size(s)
///
@@ -34,7 +33,7 @@ use crate::TantivyError;
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let mut tokenizer = NgramTokenizer::new(2, 3, false).unwrap();
/// let mut tokenizer = NgramTokenizer::new(2, 3, false);
/// let mut stream = tokenizer.token_stream("hello");
/// {
/// let token = stream.next().unwrap();
@@ -80,7 +79,7 @@ use crate::TantivyError;
/// }
/// assert!(stream.next().is_none());
/// ```
#[derive(Clone, Debug)]
#[derive(Clone)]
pub struct NgramTokenizer {
/// min size of the n-gram
min_gram: usize,
@@ -93,39 +92,30 @@ pub struct NgramTokenizer {
impl NgramTokenizer {
/// Configures a new Ngram tokenizer
pub fn new(
min_gram: usize,
max_gram: usize,
prefix_only: bool,
) -> crate::Result<NgramTokenizer> {
if min_gram == 0 {
return Err(TantivyError::InvalidArgument(
"min_gram must be greater than 0".to_string(),
));
}
if min_gram > max_gram {
return Err(TantivyError::InvalidArgument(
"min_gram must not be greater than max_gram".to_string(),
));
}
Ok(NgramTokenizer {
pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
assert!(min_gram > 0, "min_gram must be greater than 0");
assert!(
min_gram <= max_gram,
"min_gram must not be greater than max_gram"
);
NgramTokenizer {
min_gram,
max_gram,
prefix_only,
token: Token::default(),
})
}
}
/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
///
/// This is as opposed to only prefix ngrams.
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> crate::Result<NgramTokenizer> {
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, false)
}
/// Create a `NGramTokenizer` which only generates tokens for the
/// prefix ngrams.
pub fn prefix_only(min_gram: usize, max_gram: usize) -> crate::Result<NgramTokenizer> {
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, true)
}
}
@@ -359,11 +349,7 @@ mod tests {
#[test]
fn test_ngram_tokenizer_1_2_false() {
let tokens = test_helper(
NgramTokenizer::all_ngrams(1, 2)
.unwrap()
.token_stream("hello"),
);
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2);
@@ -378,11 +364,7 @@ mod tests {
#[test]
fn test_ngram_tokenizer_min_max_equal() {
let tokens = test_helper(
NgramTokenizer::all_ngrams(3, 3)
.unwrap()
.token_stream("hello"),
);
let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 0, "ell", 1, 4);
@@ -391,11 +373,7 @@ mod tests {
#[test]
fn test_ngram_tokenizer_2_5_prefix() {
let tokens = test_helper(
NgramTokenizer::prefix_only(2, 5)
.unwrap()
.token_stream("frankenstein"),
);
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3);
@@ -405,11 +383,7 @@ mod tests {
#[test]
fn test_ngram_non_ascii_1_2() {
let tokens = test_helper(
NgramTokenizer::all_ngrams(1, 2)
.unwrap()
.token_stream("hεllo"),
);
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "", 0, 3);
@@ -424,11 +398,7 @@ mod tests {
#[test]
fn test_ngram_non_ascii_2_5_prefix() {
let tokens = test_helper(
NgramTokenizer::prefix_only(2, 5)
.unwrap()
.token_stream("hεllo"),
);
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "", 0, 3);
assert_token(&tokens[1], 0, "hεl", 0, 4);
@@ -438,26 +408,22 @@ mod tests {
#[test]
fn test_ngram_empty() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).unwrap().token_stream(""));
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
assert!(tokens.is_empty());
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).unwrap().token_stream(""));
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
assert!(tokens.is_empty());
}
#[test]
#[should_panic(expected = "min_gram must be greater than 0")]
fn test_ngram_min_max_interval_empty() {
test_helper(
NgramTokenizer::all_ngrams(0, 2)
.unwrap()
.token_stream("hellossss"),
);
test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
}
#[test]
#[should_panic(expected = "min_gram must not be greater than max_gram")]
fn test_invalid_interval_should_panic_if_smaller() {
NgramTokenizer::all_ngrams(2, 1).unwrap();
NgramTokenizer::all_ngrams(2, 1);
}
#[test]

View File

@@ -86,8 +86,6 @@ impl TokenFilter for SplitCompoundWords {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
@@ -96,33 +94,29 @@ impl TokenFilter for SplitCompoundWords {
pub struct SplitCompoundWordsFilter<T> {
dict: AhoCorasick,
inner: T,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.cuts.clear();
self.parts.clear();
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: &mut self.cuts,
parts: &mut self.parts,
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
pub struct SplitCompoundWordsTokenStream<'a, T> {
pub struct SplitCompoundWordsTokenStream<T> {
dict: AhoCorasick,
tail: T,
cuts: &'a mut Vec<usize>,
parts: &'a mut Vec<Token>,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
@@ -158,7 +152,7 @@ impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
}
}
impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
fn advance(&mut self) -> bool {
self.parts.pop();

View File

@@ -5,32 +5,12 @@ use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
#[derive(Clone)]
pub struct TextAnalyzer {
tokenizer: Box<dyn BoxableTokenizer>,
}
impl Tokenizer for Box<dyn BoxableTokenizer> {
type TokenStream<'a> = BoxTokenStream<'a>;
// Note: we want to call `box_token_stream` on the concrete `Tokenizer`
// implementation, not the `BoxableTokenizer` one as it will cause
// a recursive call (and a stack overflow).
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
(**self).box_token_stream(text)
}
}
impl Clone for Box<dyn BoxableTokenizer> {
// Note: we want to call `box_clone` on the concrete `Tokenizer`
// implementation in order to clone the concrete `Tokenizer`.
fn clone(&self) -> Self {
(**self).box_clone()
}
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
pub trait BoxableTokenizer: 'static + Send + Sync {
trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
@@ -39,13 +19,21 @@ pub trait BoxableTokenizer: 'static + Send + Sync {
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::new(self.token_stream(text))
self.token_stream(text).into()
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())
}
}
// Manual `Clone` implementation: the `tokenizer` field is a
// `Box<dyn BoxableTokenizer>`, so the copy is produced by calling
// `BoxableTokenizer::box_clone` on it.
impl Clone for TextAnalyzer {
// Returns a new `TextAnalyzer` wrapping the box returned by `box_clone`.
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
}
}
}
impl Default for TextAnalyzer {
fn default() -> TextAnalyzer {
TextAnalyzer::from(EmptyTokenizer)
@@ -66,12 +54,12 @@ impl TextAnalyzer {
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
self.tokenizer.token_stream(text)
self.tokenizer.box_token_stream(text)
}
}
/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
pub struct TextAnalyzerBuilder<T> {
tokenizer: T,
}
@@ -95,23 +83,6 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
}
}
/// Boxes the internal tokenizer. This is useful for adding dynamic filters.
/// Note: this will be less performant than the non boxed version.
pub fn dynamic(self) -> TextAnalyzerBuilder {
let boxed_tokenizer = Box::new(self.tokenizer);
TextAnalyzerBuilder {
tokenizer: boxed_tokenizer,
}
}
/// Appends a token filter to the current builder and returns a boxed version of the
/// tokenizer. This is useful when you want to build a `TextAnalyzer` dynamically.
/// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
/// possible as it will be more performant and create fewer boxes.
pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
self.filter(token_filter).dynamic()
}
/// Finalize building the TextAnalyzer
pub fn build(self) -> TextAnalyzer {
TextAnalyzer {
@@ -119,57 +90,3 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};
#[test]
fn test_text_analyzer_builder() {
let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.build();
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "bullet");
}
#[test]
fn test_text_analyzer_with_filters_boxed() {
// This test shows how one can build a TextAnalyzer dynamically, by stacking a list
// of parametrizable token filters.
//
// The following enum is the thing that would be serializable.
// Note that token filters can have their own parameters, too, like the RemoveLongFilter
enum SerializableTokenFilterEnum {
LowerCaser(LowerCaser),
RemoveLongFilter(RemoveLongFilter),
}
// Note that everything below is dynamic.
let filters: Vec<SerializableTokenFilterEnum> = vec![
SerializableTokenFilterEnum::LowerCaser(LowerCaser),
SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
];
let mut analyzer_builder: TextAnalyzerBuilder =
TextAnalyzer::builder(SimpleTokenizer::default())
.filter_dynamic(RemoveLongFilter::limit(40))
.filter_dynamic(LowerCaser);
for filter in filters {
analyzer_builder = match filter {
SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
analyzer_builder.filter_dynamic(lower_caser)
}
SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
analyzer_builder.filter_dynamic(remove_long_filter)
}
}
}
let mut analyzer = analyzer_builder.build();
let mut stream = analyzer.token_stream("first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "bullet");
}
}

View File

@@ -27,7 +27,6 @@ pub struct TokenizerManager {
impl TokenizerManager {
/// Creates an empty tokenizer manager.
#[allow(clippy::new_without_default)]
pub fn new() -> Self {
Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
@@ -52,10 +51,12 @@ impl TokenizerManager {
.get(tokenizer_name)
.cloned()
}
}
impl Default for TokenizerManager {
/// Creates a `TokenizerManager` prepopulated with
/// the default pre-configured tokenizers of `tantivy`.
pub fn default_for_indexing() -> TokenizerManager {
fn default() -> TokenizerManager {
let manager = TokenizerManager::new();
manager.register("raw", RawTokenizer::default());
manager.register(
@@ -76,28 +77,4 @@ impl TokenizerManager {
manager.register("whitespace", WhitespaceTokenizer::default());
manager
}
/// Creates a `TokenizerManager` prepopulated with
/// the default pre-configured tokenizers of `tantivy`
/// for fast fields.
///
/// Fast fields usually do not really tokenize the text.
/// It is however very useful to filter / normalize the text.
pub fn default_for_fast_fields() -> TokenizerManager {
let manager = TokenizerManager::new();
let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.build();
let lower_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.filter(LowerCaser)
.build();
manager.register(
crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
lower_tokenizer.clone(),
);
manager.register("raw", raw_tokenizer);
manager.register("lower", lower_tokenizer);
manager
}
}

View File

@@ -7,7 +7,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
keywords = ["search", "information", "retrieval", "sstable"]
categories = ["database-implementations", "data-structures", "compression"]
description = "sstables for tantivy"
desciption = "sstables for tantivy"
[dependencies]
common = {version= "0.5", path="../common", package="tantivy-common"}

View File

@@ -44,7 +44,7 @@ pub fn fast_short_slice_copy(src: &[u8], dst: &mut [u8]) {
return;
}
// The code will use the vmovdqu instruction to copy 32 bytes at a time.
/// The code will use the vmovdqu instruction to copy 32 bytes at a time.
#[cfg(target_feature = "avx")]
{
if len <= 64 {

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-tokenizer-api"
version = "0.1.0"
version = "0.1.1"
license = "MIT"
edition = "2021"
description = "Tokenizer API of tantivy"

View File

@@ -63,22 +63,16 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a> TokenStream for BoxTokenStream<'a> {
fn advance(&mut self) -> bool {
self.0.advance()
}
fn token(&self) -> &Token {
self.0.token()
}
fn token_mut(&mut self) -> &mut Token {
self.0.token_mut()
// Conversion from the `BoxTokenStream` wrapper to the boxed
// `dyn TokenStream` it contains.
impl<'a> From<BoxTokenStream<'a>> for Box<dyn TokenStream + 'a> {
// Consumes the wrapper and returns its only field.
fn from(token_stream: BoxTokenStream<'a>) -> Self {
token_stream.0
}
}
impl<'a> BoxTokenStream<'a> {
pub fn new<T: TokenStream + 'a>(token_stream: T) -> BoxTokenStream<'a> {
// Conversion from any `T: TokenStream + 'a` into a `BoxTokenStream<'a>`.
// NOTE(review): this blanket impl presumably requires that `BoxTokenStream`
// itself does not implement `TokenStream`, otherwise it would overlap with
// the standard library's reflexive `impl<T> From<T> for T` — confirm.
impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
// Moves `token_stream` into a new `Box` and wraps it in `BoxTokenStream`.
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}