Compare commits

..

5 Commits

Author SHA1 Message Date
François Massot
dc783f8328 Remove BoxTokenStream. 2023-06-23 13:33:40 +02:00
François Massot
b82cd08f5d Fix comment. 2023-06-22 09:13:21 +02:00
François Massot
54f43135f2 Use dyn_clone. 2023-06-22 09:13:21 +02:00
François Massot
6c6b97d4ef Clean code and improve docs. 2023-06-22 09:13:20 +02:00
François Massot
ad9b825067 Add boxed token filter to ease the building of TextAnalyzer with a vec of filters. 2023-06-22 09:12:23 +02:00
30 changed files with 202 additions and 465 deletions

View File

@@ -1,14 +1,5 @@
Tantivy 0.20.2
================================
- Align numerical type priority order on the search side. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088) (@fmassot)
- Fix is_child_of function not considering the root facet. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086) (@adamreichhold)
Tantivy 0.20.1
================================
- Fix building on windows with mmap [#2070](https://github.com/quickwit-oss/tantivy/issues/2070) (@ChillFish8)
Tantivy 0.20
Tantivy 0.20 [Unreleased]
================================
#### Bugfixes
- Fix phrase queries with slop (slop supports now transpositions, algorithm that carries slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031)[#2020](https://github.com/quickwit-oss/tantivy/issues/2020)(@PSeitz)
@@ -47,14 +38,12 @@ Tantivy 0.20
- Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
- Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
- Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
- Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- tokenizer-api: reduce Tokenizer allocation overhead [#2062](https://github.com/quickwit-oss/tantivy/issues/2062) (@PSeitz)
- Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
- Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)

View File

@@ -19,12 +19,13 @@ oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
dyn-clone = "1.0.11"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
memmap2 = { version = "0.6.0", optional = true }
lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
@@ -49,9 +50,9 @@ murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.11.0"
lru = "0.10.0"
fastdivide = "0.4.0"
itertools = "0.11.0"
itertools = "0.10.3"
measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"

View File

@@ -1,7 +1,5 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::{
LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};
use tantivy::tokenizer::TokenizerManager;
const ALICE_TXT: &str = include_str!("alice.txt");
@@ -18,26 +16,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
assert_eq!(word_count, 30_731);
})
});
let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.dynamic()
.filter_dynamic(RemoveLongFilter::limit(40))
.filter_dynamic(LowerCaser)
.build();
c.bench_function("dynamic-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
while token_stream.advance() {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}
criterion_group! {
name = benches;
config = Criterion::default().sample_size(200);
targets = criterion_benchmark
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@@ -9,7 +9,7 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.11.0"
itertools = "0.10.5"
fnv = "1.0.7"
fastdivide = "0.4.0"

View File

@@ -168,9 +168,8 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5]);
}
@@ -201,9 +200,8 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}

View File

@@ -157,13 +157,7 @@ mod tests {
Cardinality::Optional,
&shuffle_merge_order,
);
let SerializableColumnIndex::Optional {
non_null_row_ids,
num_rows,
} = serializable_index
else {
panic!()
};
let SerializableColumnIndex::Optional { non_null_row_ids, num_rows } = serializable_index else { panic!() };
assert_eq!(num_rows, 2);
let non_null_rows: Vec<RowId> = non_null_row_ids.boxed_iter().collect();
assert_eq!(&non_null_rows, &[1]);

View File

@@ -2,7 +2,7 @@
//! # `fastfield_codecs`
//!
//! - Columnar storage of data for tantivy [`crate::Column`].
//! - Columnar storage of data for tantivy [`Column`].
//! - Encode data in different codecs.
//! - Monotonically map values to u64/u128

View File

@@ -83,8 +83,7 @@ impl ColumnValues for BitpackedReader {
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
let Some(transformed_range) = transform_range_before_linear_transformation(&self.stats, range)
else {
positions.clear();
return;

View File

@@ -52,8 +52,8 @@ pub enum MergeRowOrder {
/// Columnar tables are simply stacked one above the other.
/// If the i-th columnar_readers has n_rows_i rows, then
/// in the resulting columnar,
/// rows [r0..n_row_0) contains the row of `columnar_readers[0]`, in ordder
/// rows [n_row_0..n_row_0 + n_row_1 contains the row of `columnar_readers[1]`, in order.
/// rows [r0..n_row_0) contains the row of columnar_readers[0], in ordder
/// rows [n_row_0..n_row_0 + n_row_1 contains the row of columnar_readers[1], in order.
/// ..
/// No documents is deleted.
Stack(StackMergeOrder),

View File

@@ -244,9 +244,7 @@ fn test_merge_columnar_numbers() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("numbers").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::F64(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::F64(vals) = dynamic_column else { panic!() };
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.first(0u32), Some(-1f64));
assert_eq!(vals.first(1u32), None);
@@ -272,9 +270,7 @@ fn test_merge_columnar_texts() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("texts").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
@@ -321,9 +317,7 @@ fn test_merge_columnar_byte() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("bytes").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -377,9 +371,7 @@ fn test_merge_columnar_byte_with_missing() {
assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("col").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -431,9 +423,7 @@ fn test_merge_columnar_different_types() {
// numeric column
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::I64(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::I64(vals) = dynamic_column else { panic!() };
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);
@@ -443,9 +433,7 @@ fn test_merge_columnar_different_types() {
// text column
let dynamic_column = cols[1].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
let mut out = String::new();

View File

@@ -98,11 +98,9 @@ impl ColumnarWriter {
///
/// The sort applied is stable.
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
let Some(numerical_col_writer) = self
.numerical_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
else {
return Vec::new();
let Some(numerical_col_writer) =
self.numerical_field_hash_map.get::<NumericalColumnWriter>(sort_field.as_bytes()) else {
return Vec::new();
};
let mut symbols_buffer = Vec::new();
let mut values = Vec::new();

View File

@@ -57,9 +57,7 @@ fn test_dataframe_writer_bool() {
assert_eq!(cols[0].num_bytes(), 22);
assert_eq!(cols[0].column_type(), ColumnType::Bool);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
panic!();
};
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
}
@@ -81,9 +79,7 @@ fn test_dataframe_writer_u64_multivalued() {
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 29);
let dyn_i64_col = cols[0].open().unwrap();
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
panic!();
};
let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
assert_eq!(
divisor_col.get_cardinality(),
crate::Cardinality::Multivalued
@@ -105,9 +101,7 @@ fn test_dataframe_writer_ip_addr() {
assert_eq!(cols[0].num_bytes(), 42);
assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
panic!();
};
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
assert_eq!(
&vals,
@@ -140,9 +134,7 @@ fn test_dataframe_writer_numerical() {
// - null footer 6 bytes
assert_eq!(cols[0].num_bytes(), 33);
let column = cols[0].open().unwrap();
let DynamicColumn::I64(column_i64) = column else {
panic!();
};
let DynamicColumn::I64(column_i64) = column else { panic!(); };
assert_eq!(column_i64.index.get_cardinality(), Cardinality::Optional);
assert_eq!(column_i64.first(0), None);
assert_eq!(column_i64.first(1), Some(12i64));
@@ -206,9 +198,7 @@ fn test_dictionary_encoded_str() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
panic!();
};
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5);
@@ -240,9 +230,7 @@ fn test_dictionary_encoded_bytes() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else {
panic!();
};
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5)
.map(|row_id| bytes_col.ords().first(row_id))
.collect();
@@ -545,36 +533,28 @@ trait AssertEqualToColumnValue {
impl AssertEqualToColumnValue for bool {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Bool(val) = column_value else {
panic!()
};
let ColumnValue::Bool(val) = column_value else { panic!() };
assert_eq!(self, val);
}
}
impl AssertEqualToColumnValue for Ipv6Addr {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::IpAddr(val) = column_value else {
panic!()
};
let ColumnValue::IpAddr(val) = column_value else { panic!() };
assert_eq!(self, val);
}
}
impl<T: Coerce + PartialEq + Debug + Into<NumericalValue>> AssertEqualToColumnValue for T {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Numerical(num) = column_value else {
panic!()
};
let ColumnValue::Numerical(num) = column_value else { panic!() };
assert_eq!(self, &T::coerce(*num));
}
}
impl AssertEqualToColumnValue for DateTime {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::DateTime(dt) = column_value else {
panic!()
};
let ColumnValue::DateTime(dt) = column_value else { panic!() };
assert_eq!(self, dt);
}
}

View File

@@ -53,7 +53,7 @@ fn main() -> tantivy::Result<()> {
// this will store tokens of 3 characters each
index
.tokenizers()
.register("ngram3", NgramTokenizer::new(3, 3, false).unwrap());
.register("ngram3", NgramTokenizer::new(3, 3, false));
// To insert document we need an index writer.
// There must be only one writer at a time.

View File

@@ -15,12 +15,6 @@
//! Results of final buckets are [`BucketResult`](super::agg_result::BucketResult).
//! Results of intermediate buckets are
//! [`IntermediateBucketResult`](super::intermediate_agg_result::IntermediateBucketResult)
//!
//! ## Supported Bucket Aggregations
//! - [Histogram](HistogramAggregation)
//! - [DateHistogram](DateHistogramAggregationReq)
//! - [Range](RangeAggregation)
//! - [Terms](TermsAggregation)
mod histogram;
mod range;

View File

@@ -6,15 +6,6 @@
//! Some aggregations output a single numeric metric (e.g. Average) and are called
//! single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
//! called multi-value numeric metrics aggregation.
//!
//! ## Supported Metric Aggregations
//! - [Average](AverageAggregation)
//! - [Stats](StatsAggregation)
//! - [Min](MinAggregation)
//! - [Max](MaxAggregation)
//! - [Sum](SumAggregation)
//! - [Count](CountAggregation)
//! - [Percentiles](PercentilesAggregationReq)
mod average;
mod count;

View File

@@ -14,7 +14,7 @@ use crate::collector::{
};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::query::Weight;
use crate::{DocAddress, DocId, Order, Score, SegmentOrdinal, SegmentReader, TantivyError};
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
struct FastFieldConvertCollector<
TCollector: Collector<Fruit = Vec<(u64, DocAddress)>>,
@@ -23,7 +23,6 @@ struct FastFieldConvertCollector<
pub collector: TCollector,
pub field: String,
pub fast_value: std::marker::PhantomData<TFastValue>,
order: Order,
}
impl<TCollector, TFastValue> Collector for FastFieldConvertCollector<TCollector, TFastValue>
@@ -71,13 +70,7 @@ where
let raw_result = self.collector.merge_fruits(segment_fruits)?;
let transformed_result = raw_result
.into_iter()
.map(|(score, doc_address)| {
if self.order.is_desc() {
(TFastValue::from_u64(score), doc_address)
} else {
(TFastValue::from_u64(u64::MAX - score), doc_address)
}
})
.map(|(score, doc_address)| (TFastValue::from_u64(score), doc_address))
.collect::<Vec<_>>();
Ok(transformed_result)
}
@@ -138,23 +131,16 @@ impl fmt::Debug for TopDocs {
struct ScorerByFastFieldReader {
sort_column: Arc<dyn ColumnValues<u64>>,
order: Order,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 {
let value = self.sort_column.get_val(doc);
if self.order.is_desc() {
value
} else {
u64::MAX - value
}
self.sort_column.get_val(doc)
}
}
struct ScorerByField {
field: String,
order: Order,
}
impl CustomScorer<u64> for ScorerByField {
@@ -171,13 +157,8 @@ impl CustomScorer<u64> for ScorerByField {
sort_column_opt.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?;
let mut default_value = 0u64;
if self.order.is_asc() {
default_value = u64::MAX;
}
Ok(ScorerByFastFieldReader {
sort_column: sort_column.first_or_default_col(default_value),
order: self.order.clone(),
sort_column: sort_column.first_or_default_col(0u64),
})
}
}
@@ -249,7 +230,7 @@ impl TopDocs {
///
/// ```rust
/// # use tantivy::schema::{Schema, FAST, TEXT};
/// # use tantivy::{doc, Index, DocAddress, Order};
/// # use tantivy::{doc, Index, DocAddress};
/// # use tantivy::query::{Query, QueryParser};
/// use tantivy::Searcher;
/// use tantivy::collector::TopDocs;
@@ -287,7 +268,7 @@ impl TopDocs {
/// // Note the `rating_field` needs to be a FAST field here.
/// let top_books_by_rating = TopDocs
/// ::with_limit(10)
/// .order_by_fast_field("rating", Order::Desc);
/// .order_by_u64_field("rating");
///
/// // ... and here are our documents. Note this is a simple vec.
/// // The `u64` in the pair is the value of our fast field for
@@ -307,15 +288,13 @@ impl TopDocs {
///
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
fn order_by_u64_field(
pub fn order_by_u64_field(
self,
field: impl ToString,
order: Order,
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
CustomScoreTopCollector::new(
ScorerByField {
field: field.to_string(),
order,
},
self.0.into_tscore(),
)
@@ -337,7 +316,7 @@ impl TopDocs {
///
/// ```rust
/// # use tantivy::schema::{Schema, FAST, TEXT};
/// # use tantivy::{doc, Index, DocAddress,Order};
/// # use tantivy::{doc, Index, DocAddress};
/// # use tantivy::query::{Query, AllQuery};
/// use tantivy::Searcher;
/// use tantivy::collector::TopDocs;
@@ -375,7 +354,7 @@ impl TopDocs {
/// // type `sort_by_field`. revenue_field here is a FAST i64 field.
/// let top_company_by_revenue = TopDocs
/// ::with_limit(2)
/// .order_by_fast_field("revenue", Order::Desc);
/// .order_by_fast_field("revenue");
///
/// // ... and here are our documents. Note this is a simple vec.
/// // The `i64` in the pair is the value of our fast field for
@@ -393,17 +372,15 @@ impl TopDocs {
pub fn order_by_fast_field<TFastValue>(
self,
fast_field: impl ToString,
order: Order,
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>>
where
TFastValue: FastValue,
{
let u64_collector = self.order_by_u64_field(fast_field.to_string(), order.clone());
let u64_collector = self.order_by_u64_field(fast_field.to_string());
FastFieldConvertCollector {
collector: u64_collector,
field: fast_field.to_string(),
fast_value: PhantomData,
order,
}
}
@@ -744,7 +721,7 @@ mod tests {
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -905,7 +882,7 @@ mod tests {
});
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE, Order::Desc);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -944,7 +921,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday", Order::Desc);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday");
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -974,7 +951,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude", Order::Desc);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -1004,7 +981,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude", Order::Desc);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -1032,7 +1009,7 @@ mod tests {
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field", Order::Desc);
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field");
let segment_reader = searcher.segment_reader(0u32);
top_collector
.for_segment(0, segment_reader)
@@ -1050,7 +1027,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE, Order::Desc);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(matches!(err, crate::TantivyError::InvalidArgument(_)));
Ok(())
@@ -1067,7 +1044,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE, Order::Desc);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.")
@@ -1129,50 +1106,4 @@ mod tests {
let query = query_parser.parse_query(query).unwrap();
(index, query)
}
#[test]
fn test_fast_field_ascending_order() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build();
let (index, query) = index("beer", title, schema, |index_writer| {
index_writer
.add_document(doc!(
title => "bottle of beer",
size => 12u64,
))
.unwrap();
index_writer
.add_document(doc!(
title => "growler of beer",
size => 64u64,
))
.unwrap();
index_writer
.add_document(doc!(
title => "pint of beer",
size => 16u64,
))
.unwrap();
index_writer
.add_document(doc!(
title => "empty beer",
))
.unwrap();
});
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(4).order_by_fast_field(SIZE, Order::Asc);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
&[
(12, DocAddress::new(0, 0)),
(16, DocAddress::new(0, 2)),
(64, DocAddress::new(0, 1)),
(18446744073709551615, DocAddress::new(0, 3)),
]
);
Ok(())
}
}

View File

@@ -259,7 +259,7 @@ pub(crate) fn set_string_and_get_terms(
/// Writes a value of a JSON field to a `Term`.
/// The Term format is as follows:
/// `[JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES]`
/// [JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES]
pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,

View File

@@ -1291,28 +1291,4 @@ mod tests {
let vals: Vec<i64> = column.values_for_doc(0u32).collect();
assert_eq!(&vals, &[33]);
}
#[test]
fn check_num_columnar_fields() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_text_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
id_field => 1u64,
))?;
index_writer.commit()?;
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let ff_reader = searcher.segment_reader(0).fast_fields();
let fields = ff_reader.u64_lenient_for_type_all(None, "id").unwrap();
assert_eq!(fields.len(), 1);
Ok(())
}
}

View File

@@ -88,7 +88,7 @@ impl FastFieldReaders {
let Some((field, path)): Option<(Field, &str)> = self
.schema
.find_field_with_default(field_name, default_field_opt)
else {
else{
return Ok(None);
};
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
@@ -120,8 +120,7 @@ impl FastFieldReaders {
T: HasAssociatedColumnType,
DynamicColumn: Into<Option<Column<T>>>,
{
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, T::column_type())?
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, T::column_type())?
else {
return Ok(None);
};
@@ -197,8 +196,7 @@ impl FastFieldReaders {
/// Returns a `str` column.
pub fn str(&self, field_name: &str) -> crate::Result<Option<StrColumn>> {
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, ColumnType::Str)?
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Str)?
else {
return Ok(None);
};
@@ -208,8 +206,7 @@ impl FastFieldReaders {
/// Returns a `bytes` column.
pub fn bytes(&self, field_name: &str) -> crate::Result<Option<BytesColumn>> {
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, ColumnType::Bytes)?
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Bytes)?
else {
return Ok(None);
};

View File

@@ -1,6 +1,5 @@
use columnar::MonotonicallyMappableToU64;
use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
@@ -210,7 +209,7 @@ impl SegmentWriter {
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
Box::new(PreTokenizedStream::from(tok_str.clone()))
}
Value::Str(ref text) => {
let text_analyzer =

View File

@@ -472,7 +472,6 @@ mod tests {
use super::RangeQuery;
use crate::collector::{Count, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT};
use crate::{doc, Index};
@@ -548,8 +547,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 60_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let mut index_writer = index.writer_with_num_threads(2, 60_000_000)?;
for i in 1..100 {
let mut doc = Document::new();
@@ -559,9 +557,6 @@ mod tests {
}
}
index_writer.add_document(doc)?;
if i == 10 {
index_writer.commit()?;
}
}
index_writer.commit()?;

View File

@@ -31,10 +31,9 @@ impl IPFastFieldRangeWeight {
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
reader.fast_fields().column_opt(&self.field)?
else {
return Ok(Box::new(EmptyScorer));
let Some(ip_addr_column): Option<Column<Ipv6Addr>> = reader.fast_fields()
.column_opt(&self.field)? else {
return Ok(Box::new(EmptyScorer))
};
let value_range = bound_to_value_range(
&self.lower_bound,

View File

@@ -71,9 +71,7 @@ impl Weight for FastFieldRangeWeight {
let column_type_opt_ref: Option<&[ColumnType]> = column_type_opt
.as_ref()
.map(|column_types| column_types.as_slice());
let Some((column, _)) =
fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)?
else {
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)? else {
return Ok(Box::new(EmptyScorer));
};
let value_range = bound_to_value_range(

View File

@@ -693,7 +693,7 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("bc"), 1.0);
let fragments = search_fragments(
&mut From::from(NgramTokenizer::all_ngrams(2, 2).unwrap()),
&mut From::from(NgramTokenizer::all_ngrams(2, 2)),
text,
&terms,
3,

View File

@@ -426,7 +426,7 @@ mod tests {
assert_eq!(store.cache_stats().cache_hits, 1);
assert_eq!(store.cache_stats().cache_misses, 2);
assert_eq!(store.cache.peek_lru(), Some(11207));
assert_eq!(store.cache.peek_lru(), Some(11163));
Ok(())
}

View File

@@ -139,7 +139,7 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
pub use tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;

View File

@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use crate::TantivyError;
/// Tokenize the text by splitting words into n-grams of the given size(s)
///
@@ -34,7 +33,7 @@ use crate::TantivyError;
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let mut tokenizer = NgramTokenizer::new(2, 3, false).unwrap();
/// let mut tokenizer = NgramTokenizer::new(2, 3, false);
/// let mut stream = tokenizer.token_stream("hello");
/// {
/// let token = stream.next().unwrap();
@@ -80,7 +79,7 @@ use crate::TantivyError;
/// }
/// assert!(stream.next().is_none());
/// ```
#[derive(Clone, Debug)]
#[derive(Clone)]
pub struct NgramTokenizer {
/// min size of the n-gram
min_gram: usize,
@@ -93,39 +92,30 @@ pub struct NgramTokenizer {
impl NgramTokenizer {
/// Configures a new Ngram tokenizer
pub fn new(
min_gram: usize,
max_gram: usize,
prefix_only: bool,
) -> crate::Result<NgramTokenizer> {
if min_gram == 0 {
return Err(TantivyError::InvalidArgument(
"min_gram must be greater than 0".to_string(),
));
}
if min_gram > max_gram {
return Err(TantivyError::InvalidArgument(
"min_gram must not be greater than max_gram".to_string(),
));
}
Ok(NgramTokenizer {
pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
assert!(min_gram > 0, "min_gram must be greater than 0");
assert!(
min_gram <= max_gram,
"min_gram must not be greater than max_gram"
);
NgramTokenizer {
min_gram,
max_gram,
prefix_only,
token: Token::default(),
})
}
}
/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
///
/// This is as opposed to only prefix ngrams .
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> crate::Result<NgramTokenizer> {
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, false)
}
/// Create a `NGramTokenizer` which only generates tokens for the
/// prefix ngrams.
pub fn prefix_only(min_gram: usize, max_gram: usize) -> crate::Result<NgramTokenizer> {
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, true)
}
}
@@ -359,11 +349,7 @@ mod tests {
#[test]
fn test_ngram_tokenizer_1_2_false() {
let tokens = test_helper(
NgramTokenizer::all_ngrams(1, 2)
.unwrap()
.token_stream("hello"),
);
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2);
@@ -378,11 +364,7 @@ mod tests {
#[test]
fn test_ngram_tokenizer_min_max_equal() {
let tokens = test_helper(
NgramTokenizer::all_ngrams(3, 3)
.unwrap()
.token_stream("hello"),
);
let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 0, "ell", 1, 4);
@@ -391,11 +373,7 @@ mod tests {
#[test]
fn test_ngram_tokenizer_2_5_prefix() {
let tokens = test_helper(
NgramTokenizer::prefix_only(2, 5)
.unwrap()
.token_stream("frankenstein"),
);
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3);
@@ -405,11 +383,7 @@ mod tests {
#[test]
fn test_ngram_non_ascii_1_2() {
let tokens = test_helper(
NgramTokenizer::all_ngrams(1, 2)
.unwrap()
.token_stream("hεllo"),
);
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "", 0, 3);
@@ -424,11 +398,7 @@ mod tests {
#[test]
fn test_ngram_non_ascii_2_5_prefix() {
let tokens = test_helper(
NgramTokenizer::prefix_only(2, 5)
.unwrap()
.token_stream("hεllo"),
);
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "", 0, 3);
assert_token(&tokens[1], 0, "hεl", 0, 4);
@@ -438,26 +408,22 @@ mod tests {
#[test]
fn test_ngram_empty() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).unwrap().token_stream(""));
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
assert!(tokens.is_empty());
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).unwrap().token_stream(""));
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
assert!(tokens.is_empty());
}
#[test]
#[should_panic(expected = "min_gram must be greater than 0")]
fn test_ngram_min_max_interval_empty() {
test_helper(
NgramTokenizer::all_ngrams(0, 2)
.unwrap()
.token_stream("hellossss"),
);
test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
}
#[test]
#[should_panic(expected = "min_gram must not be greater than max_gram")]
fn test_invalid_interval_should_panic_if_smaller() {
NgramTokenizer::all_ngrams(2, 1).unwrap();
NgramTokenizer::all_ngrams(2, 1);
}
#[test]

View File

@@ -86,8 +86,6 @@ impl TokenFilter for SplitCompoundWords {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
@@ -96,33 +94,29 @@ impl TokenFilter for SplitCompoundWords {
pub struct SplitCompoundWordsFilter<T> {
dict: AhoCorasick,
inner: T,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.cuts.clear();
self.parts.clear();
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: &mut self.cuts,
parts: &mut self.parts,
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
pub struct SplitCompoundWordsTokenStream<'a, T> {
pub struct SplitCompoundWordsTokenStream<T> {
dict: AhoCorasick,
tail: T,
cuts: &'a mut Vec<usize>,
parts: &'a mut Vec<Token>,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
@@ -158,7 +152,7 @@ impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
}
}
impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
fn advance(&mut self) -> bool {
self.parts.pop();

View File

@@ -1,6 +1,7 @@
use dyn_clone::DynClone;
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
use tokenizer_api::{TokenFilter, TokenStream, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
@@ -10,39 +11,95 @@ pub struct TextAnalyzer {
tokenizer: Box<dyn BoxableTokenizer>,
}
impl Tokenizer for Box<dyn BoxableTokenizer> {
type TokenStream<'a> = BoxTokenStream<'a>;
// Note: we want to call `box_token_stream` on the concrete `Tokenizer`
// implementation, not the `BoxableTokenizer` one as it will cause
// a recursive call (and a stack overflow).
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
(**self).box_token_stream(text)
}
}
impl Clone for Box<dyn BoxableTokenizer> {
// Note: we want to call `box_clone` on the concrete `Tokenizer`
// implementation in order to clone the concrete `Tokenizer`.
fn clone(&self) -> Self {
(**self).box_clone()
}
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
pub trait BoxableTokenizer: 'static + Send + Sync {
trait BoxableTokenizer: 'static + Send + Sync + DynClone {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a>;
}
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::new(self.token_stream(text))
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
Box::new(self.token_stream(text))
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())
}
dyn_clone::clone_trait_object!(BoxableTokenizer);
/// A boxed `BoxableTokenizer` which is a `Tokenizer` with its `TokenStream` type erased.
#[derive(Clone)]
struct BoxTokenizer(Box<dyn BoxableTokenizer>);
impl Tokenizer for BoxTokenizer {
type TokenStream<'a> = Box<dyn TokenStream + 'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.0.box_token_stream(text).into()
}
}
/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
trait BoxableTokenFilter: 'static + Send + Sync {
/// Wraps a `BoxedTokenizer` and returns a new one.
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer;
}
impl<T: TokenFilter> BoxableTokenFilter for T {
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer {
let tokenizer = self.clone().transform(tokenizer);
BoxTokenizer(Box::new(tokenizer))
}
}
/// A boxed `BoxableTokenFilter` which is a `TokenFilter` with its `Tokenizer` type erased.
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
impl TextAnalyzer {
/// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
/// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
/// will be more performant and create less boxes.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::build(
/// SimpleTokenizer::default(),
/// vec![
/// BoxTokenFilter::from(RemoveLongFilter::limit(40)),
/// BoxTokenFilter::from(LowerCaser),
/// BoxTokenFilter::from(Stemmer::default()),
/// ]);
/// ```
pub fn build<T: Tokenizer>(
tokenizer: T,
boxed_token_filters: Vec<BoxTokenFilter>,
) -> TextAnalyzer {
let mut boxed_tokenizer = BoxTokenizer(Box::new(tokenizer));
for filter in boxed_token_filters.into_iter() {
boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
}
TextAnalyzer {
tokenizer: boxed_tokenizer.0,
}
}
/// Create a new TextAnalyzerBuilder
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
self.tokenizer.box_token_stream(text)
}
}
@@ -58,20 +115,8 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
}
}
impl TextAnalyzer {
/// Create a new TextAnalyzerBuilder
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
self.tokenizer.token_stream(text)
}
}
/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
pub struct TextAnalyzerBuilder<T: Tokenizer> {
tokenizer: T,
}
@@ -95,23 +140,6 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
}
}
/// Boxes the internal tokenizer. This is useful for adding dynamic filters.
/// Note: this will be less performant than the non boxed version.
pub fn dynamic(self) -> TextAnalyzerBuilder {
let boxed_tokenizer = Box::new(self.tokenizer);
TextAnalyzerBuilder {
tokenizer: boxed_tokenizer,
}
}
/// Appends a token filter to the current builder and returns a boxed version of the
/// tokenizer. This is useful when you want to build a `TextAnalyzer` dynamically.
/// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
/// possible as it will be more performant and create less boxes.
pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
self.filter(token_filter).dynamic()
}
/// Finalize building the TextAnalyzer
pub fn build(self) -> TextAnalyzer {
TextAnalyzer {
@@ -124,52 +152,32 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
mod tests {
use super::*;
use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};
use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
#[test]
fn test_text_analyzer_builder() {
let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
.filter(AlphaNumOnlyFilter)
.filter(RemoveLongFilter::limit(6))
.filter(LowerCaser)
.build();
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "bullet");
assert_eq!(stream.next().unwrap().text, "point");
}
#[test]
fn test_text_analyzer_with_filters_boxed() {
// This test shows how one can build a TextAnalyzer dynamically, by stacking a list
// of parametrizable token filters.
//
// The following enum is the thing that would be serializable.
// Note that token filters can have their own parameters, too, like the RemoveLongFilter
enum SerializableTokenFilterEnum {
LowerCaser(LowerCaser),
RemoveLongFilter(RemoveLongFilter),
}
// Note that everything below is dynamic.
let filters: Vec<SerializableTokenFilterEnum> = vec![
SerializableTokenFilterEnum::LowerCaser(LowerCaser),
SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
];
let mut analyzer_builder: TextAnalyzerBuilder =
TextAnalyzer::builder(SimpleTokenizer::default())
.filter_dynamic(RemoveLongFilter::limit(40))
.filter_dynamic(LowerCaser);
for filter in filters {
analyzer_builder = match filter {
SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
analyzer_builder.filter_dynamic(lower_caser)
}
SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
analyzer_builder.filter_dynamic(remove_long_filter)
}
}
}
let mut analyzer = analyzer_builder.build();
let mut stream = analyzer.token_stream("first bullet point");
let mut analyzer = TextAnalyzer::build(
WhitespaceTokenizer::default(),
vec![
BoxTokenFilter::from(AlphaNumOnlyFilter),
BoxTokenFilter::from(LowerCaser),
BoxTokenFilter::from(RemoveLongFilter::limit(6)),
],
);
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "bullet");
assert_eq!(stream.next().unwrap().text, "point");
}
}

View File

@@ -6,7 +6,6 @@
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};
use serde::{Deserialize, Serialize};
@@ -60,42 +59,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a> TokenStream for BoxTokenStream<'a> {
fn advance(&mut self) -> bool {
self.0.advance()
}
fn token(&self) -> &Token {
self.0.token()
}
fn token_mut(&mut self) -> &mut Token {
self.0.token_mut()
}
}
impl<'a> BoxTokenStream<'a> {
pub fn new<T: TokenStream + 'a>(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -149,7 +112,7 @@ pub trait TokenStream {
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync {
pub trait TokenFilter: 'static + Send + Sync + Clone {
/// The Tokenizer type returned by this filter, typically parametrized by the underlying
/// Tokenizer.
type Tokenizer<T: Tokenizer>: Tokenizer;