Compare commits

...

10 Commits

Author SHA1 Message Date
François Massot
dc783f8328 Remove BoxTokenStream. 2023-06-23 13:33:40 +02:00
François Massot
b82cd08f5d Fix comment. 2023-06-22 09:13:21 +02:00
François Massot
54f43135f2 Use dyn_clone. 2023-06-22 09:13:21 +02:00
François Massot
6c6b97d4ef Clean code and improve docs. 2023-06-22 09:13:20 +02:00
François Massot
ad9b825067 Add boxed token filter to ease the building of TextAnalyzer with a vec of filters. 2023-06-22 09:12:23 +02:00
PSeitz
44850e1036 move fail dep to dev only (#2094)
wasm compilation fails when the `fail` dependency is always pulled in
2023-06-22 06:59:11 +02:00
Adam Reichold
3b0cbf8102 Cosmetic updates to the warmer example. (#2095)
Just some cosmetic tweaks to make the example easier on the eyes, as a colleague
was staring at this for quite some time this week.
2023-06-22 11:25:01 +09:00
Adam Reichold
4aa131c3db Make TextAnalyzerBuilder publicly accessible (#2097)
This way, client code can name the type, e.g. to store it inside structs without
resorting to generics, and its documentation becomes part of the crate
documentation generated by `cargo doc`.
2023-06-22 11:24:21 +09:00
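A minimal sketch of what exporting the builder type enables, not part of the PR: `AnalyzerFactory` is a hypothetical client-side struct, and `SimpleTokenizer` is borrowed from the tokenizer examples further down.

```rust
use tantivy::tokenizer::{SimpleTokenizer, TextAnalyzer, TextAnalyzerBuilder};

// Hypothetical client code: with TextAnalyzerBuilder exported, the field type can
// be spelled out instead of being hidden behind a generic parameter on the struct.
struct AnalyzerFactory {
    builder: TextAnalyzerBuilder<SimpleTokenizer>,
}

impl AnalyzerFactory {
    fn new() -> Self {
        Self {
            builder: TextAnalyzer::builder(SimpleTokenizer::default()),
        }
    }

    // Consumes the factory and produces the finished analyzer.
    fn into_analyzer(self) -> TextAnalyzer {
        self.builder.build()
    }
}
```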
Naveen Aiathurai
59962097d0 fix: #2078 return error when tokenizer not found while indexing (#2093)
* fix: #2078 return error when tokenizer not found while indexing

* chore: formatting issues

* chore: fix review comments
2023-06-16 04:33:55 +02:00
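With this change, committing documents for a field whose tokenizer was never registered surfaces a `SchemaError` instead of silently falling back. A minimal sketch, not from the PR, of registering the missing analyzer up front — `custom_en` is the tokenizer name used in the new test below, and the filter chain is only illustrative:

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
use tantivy::Index;

// Register the "custom_en" analyzer before indexing so that fields configured with
// `.set_tokenizer("custom_en")` can be tokenized and commit() does not error out.
fn register_custom_en(index: &Index) {
    let analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .build();
    index.tokenizers().register("custom_en", analyzer);
}
```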
Adam Reichold
ebc78127f3 Add BytesFilterCollector to support filtering based on a bytes fast field (#2075)
* Do some Clippy- and Cargo-related boy-scouting.

* Add BytesFilterCollector to support filtering based on a bytes fast field

This is basically a copy of the existing FilterCollector but modified and
specialised to work on a bytes fast field.

* Changed semantics of filter collectors to consider multi-valued fields
2023-06-13 14:19:58 +09:00
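A hedged sketch, not part of the PR, of the changed multi-value semantics: a document is kept if at least one of its fast-field values matches the predicate, and documents without any value are filtered out. Field names mirror the `FilterCollector` doctest in the diff below; `AllQuery` is only used to match every document.

```rust
use tantivy::collector::{FilterCollector, TopDocs};
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let price = schema_builder.add_u64_field("price", FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
    // Two prices on one document: 15 fails the predicate but 30 passes, so the doc is kept.
    writer.add_document(doc!(title => "combo pack", price => 15u64, price => 30u64))?;
    // No price value at all: the document is filtered out.
    writer.add_document(doc!(title => "price on request"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let collector = FilterCollector::new(price, |value: u64| value > 20u64, TopDocs::with_limit(10));
    let hits = searcher.search(&AllQuery, &collector)?;
    assert_eq!(hits.len(), 1);
    Ok(())
}
```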
19 changed files with 468 additions and 165 deletions

View File

@@ -19,6 +19,7 @@ oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
dyn-clone = "1.0.11"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
@@ -44,7 +45,7 @@ census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5.0"
fail = { version = "0.5.0", optional = true }
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
@@ -112,7 +113,7 @@ lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail/failpoints"]
failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"]

View File

@@ -1,6 +1,6 @@
use std::ops::RangeInclusive;
#[cfg(any(target_arch = "x86_64"))]
#[cfg(target_arch = "x86_64")]
mod avx2;
mod scalar;

View File

@@ -5,7 +5,7 @@ edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
desciption = "column oriented storage for tantivy"
description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]

View File

@@ -6,12 +6,14 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
SegmentReader, Warmer,
doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
Warmer,
};
// This example shows how warmers can be used to
// load a values from an external sources using the Warmer API.
// load values from an external sources and
// tie their lifecycle to that of the index segments
// using the Warmer API.
//
// In this example, we assume an e-commerce search engine.
@@ -23,9 +25,11 @@ pub trait PriceFetcher: Send + Sync + 'static {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}
type SegmentKey = (SegmentId, Option<Opstamp>);
struct DynamicPriceColumn {
field: String,
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
price_cache: RwLock<HashMap<SegmentKey, Arc<Vec<Price>>>>,
price_fetcher: Box<dyn PriceFetcher>,
}
@@ -46,7 +50,6 @@ impl DynamicPriceColumn {
impl Warmer for DynamicPriceColumn {
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
for segment in searcher.segment_readers() {
let key = (segment.segment_id(), segment.delete_opstamp());
let product_id_reader = segment
.fast_fields()
.u64(&self.field)?
@@ -55,37 +58,40 @@ impl Warmer for DynamicPriceColumn {
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc))
.collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();
for doc in 0..segment.max_doc() {
if segment.is_deleted(doc) {
price_vals.push(0);
} else {
price_vals.push(prices_it.next().unwrap())
}
}
let mut prices = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let prices: Vec<Price> = (0..segment.max_doc())
.map(|doc| {
if !segment.is_deleted(doc) {
prices.next().unwrap()
} else {
0
}
})
.collect();
let key = (segment.segment_id(), segment.delete_opstamp());
self.price_cache
.write()
.unwrap()
.insert(key, Arc::new(price_vals));
.insert(key, Arc::new(prices));
}
Ok(())
}
fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
let mut price_cache_wrt = self.price_cache.write().unwrap();
// let price_cache = std::mem::take(&mut *price_cache_wrt);
// Drain would be nicer here.
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
.into_iter()
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
let live_keys: HashSet<SegmentKey> = live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
self.price_cache
.write()
.unwrap()
.retain(|key, _| live_keys.contains(key));
}
}
@@ -100,17 +106,17 @@ pub struct ExternalPriceTable {
impl ExternalPriceTable {
pub fn update_price(&self, product_id: ProductId, price: Price) {
let mut prices_wrt = self.prices.write().unwrap();
prices_wrt.insert(product_id, price);
self.prices.write().unwrap().insert(product_id, price);
}
}
impl PriceFetcher for ExternalPriceTable {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
let prices_read = self.prices.read().unwrap();
let prices = self.prices.read().unwrap();
product_ids
.iter()
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
.map(|product_id| prices.get(product_id).cloned().unwrap_or(0))
.collect()
}
}
@@ -143,11 +149,8 @@ fn main() -> tantivy::Result<()> {
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
writer.commit()?;
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
)];
let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
reader.reload()?;
let warmers = vec![Arc::downgrade(&price_dynamic_column) as Weak<dyn Warmer>];
let reader = index.reader_builder().warmers(warmers).try_into()?;
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser.parse_query("cooking")?;

View File

@@ -6,32 +6,35 @@
//
// Of course, you can have a look at the tantivy's built-in collectors
// such as the `CountCollector` for more examples.
// ---
// Importing tantivy...
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::Arc;
use columnar::{ColumnValues, DynamicColumn, HasAssociatedColumnType};
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
use crate::collector::{Collector, SegmentCollector};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
use crate::{DocId, Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate.
/// Only the documents for which the predicate returned "true" will be passed on to the next
/// collector.
///
/// Only the documents containing at least one value for which the predicate returns `true`
/// will be passed on to the next collector.
///
/// In other words,
/// - documents with no values are filtered out.
/// - documents with several values are accepted if at least one value matches the predicate.
///
///
/// ```rust
/// use tantivy::collector::{TopDocs, FilterCollector};
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::schema::{Schema, TEXT, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let price = schema_builder.add_u64_field("price", FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
@@ -47,20 +50,24 @@ use crate::{Score, SegmentReader, TantivyError};
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
/// # Ok(())
/// # }
/// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
///
/// Note that this is limited to fast fields which implement the
/// [`FastValue`][crate::fastfield::FastValue] trait, e.g. `u64` but not `&[u8]`.
/// To filter based on a bytes fast field, use a [`BytesFilterCollector`] instead.
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
where TPredicate: 'static + Clone
{
field: Field,
@@ -69,19 +76,15 @@ where TPredicate: 'static + Clone
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate, TPredicateValue: Default>
impl<TCollector, TPredicate, TPredicateValue>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
{
/// Create a new FilterCollector.
pub fn new(
field: Field,
predicate: TPredicate,
collector: TCollector,
) -> FilterCollector<TCollector, TPredicate, TPredicateValue> {
FilterCollector {
/// Create a new `FilterCollector`.
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
collector,
@@ -90,7 +93,7 @@ where
}
}
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
impl<TCollector, TPredicate, TPredicateValue> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
@@ -98,8 +101,6 @@ where
TPredicateValue: HasAssociatedColumnType,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
// That's the type of our result.
// Our standard deviation will be a float.
type Fruit = TCollector::Fruit;
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
@@ -108,7 +109,7 @@ where
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>> {
) -> crate::Result<Self::Child> {
let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
@@ -118,16 +119,16 @@ where
)));
}
let fast_field_reader = segment_reader
let column_opt = segment_reader
.fast_fields()
.column_first_or_default(schema.get_field_name(self.field))?;
.column_opt(field_entry.name())?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(FilterSegmentCollector {
fast_field_reader,
column_opt,
segment_collector,
predicate: self.predicate.clone(),
t_predicate_value: PhantomData,
@@ -146,35 +147,208 @@ where
}
}
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
fast_field_reader: Arc<dyn ColumnValues<TPredicateValue>>,
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue> {
column_opt: Option<Column<TPredicateValue>>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TSegmentCollector, TPredicate, TPredicateValue>
FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicateValue: PartialOrd + Copy + Debug + Send + Sync + 'static,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
{
#[inline]
fn accept_document(&self, doc_id: DocId) -> bool {
if let Some(column) = &self.column_opt {
for val in column.values_for_doc(doc_id) {
if (self.predicate)(val) {
return true;
}
}
}
false
}
}
impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicateValue: HasAssociatedColumnType,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync, /* DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>> */
{
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
let value = self.fast_field_reader.get_val(doc);
if (self.predicate)(value) {
self.segment_collector.collect(doc, score)
if self.accept_document(doc) {
self.segment_collector.collect(doc, score);
}
}
fn harvest(self) -> <TSegmentCollector as SegmentCollector>::Fruit {
fn harvest(self) -> TSegmentCollector::Fruit {
self.segment_collector.harvest()
}
}
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
/// it transparently wraps an inner [`Collector`] but filters documents
/// based on the result of applying the predicate to the bytes fast field.
///
/// A document is accepted if and only if the predicate returns `true` for at least one value.
///
/// In other words,
/// - documents with no values are filtered out.
/// - documents with several values are accepted if at least one value matches the predicate.
///
/// ```rust
/// use tantivy::collector::{TopDocs, BytesFilterCollector};
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let barcode = schema_builder.add_bytes_field("barcode", FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind", barcode => &b"010101"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib", barcode => &b"110011"[..]))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow", barcode => &b"110111"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", barcode => &b"011101"[..]))?;
/// index_writer.add_document(doc!(title => "Bridget Jones's Diary"))?;
/// index_writer.commit()?;
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 3));
/// # Ok(())
/// # }
/// ```
pub struct BytesFilterCollector<TCollector, TPredicate>
where TPredicate: 'static + Clone
{
field: Field,
collector: TCollector,
predicate: TPredicate,
}
impl<TCollector, TPredicate> BytesFilterCollector<TCollector, TPredicate>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
{
/// Create a new `BytesFilterCollector`.
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
collector,
}
}
}
impl<TCollector, TPredicate> Collector for BytesFilterCollector<TCollector, TPredicate>
where
TCollector: Collector + Send + Sync,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync + Clone,
{
type Fruit = TCollector::Fruit;
type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let schema = segment_reader.schema();
let field_name = schema.get_field_name(self.field);
let column_opt = segment_reader.fast_fields().bytes(field_name)?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(BytesFilterSegmentCollector {
column_opt,
segment_collector,
predicate: self.predicate.clone(),
buffer: Vec::new(),
})
}
fn requires_scoring(&self) -> bool {
self.collector.requires_scoring()
}
fn merge_fruits(
&self,
segment_fruits: Vec<<TCollector::Child as SegmentCollector>::Fruit>,
) -> crate::Result<TCollector::Fruit> {
self.collector.merge_fruits(segment_fruits)
}
}
pub struct BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where TPredicate: 'static
{
column_opt: Option<BytesColumn>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
buffer: Vec<u8>,
}
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
{
#[inline]
fn accept_document(&mut self, doc_id: DocId) -> bool {
if let Some(column) = &self.column_opt {
for ord in column.term_ords(doc_id) {
self.buffer.clear();
let found = column.ord_to_bytes(ord, &mut self.buffer).unwrap_or(false);
if found && (self.predicate)(&self.buffer) {
return true;
}
}
}
false
}
}
impl<TSegmentCollector, TPredicate> SegmentCollector
for BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
{
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
if self.accept_document(doc) {
self.segment_collector.collect(doc, score);
}
}
fn harvest(self) -> TSegmentCollector::Fruit {
self.segment_collector.harvest()
}
}

View File

@@ -112,7 +112,7 @@ mod docset_collector;
pub use self::docset_collector::DocSetCollector;
mod filter_collector_wrapper;
pub use self::filter_collector_wrapper::FilterCollector;
pub use self::filter_collector_wrapper::{BytesFilterCollector, FilterCollector};
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.

View File

@@ -2,8 +2,6 @@ use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::{fmt, io};
use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
@@ -151,7 +149,7 @@ impl SegmentReader {
let store_file = segment.open_read(SegmentComponent::Store)?;
fail_point!("SegmentReader::open#middle");
crate::fail_point!("SegmentReader::open#middle");
let postings_file = segment.open_read(SegmentComponent::Postings)?;
let postings_composite = CompositeFile::open(&postings_file)?;

View File

@@ -5,7 +5,6 @@ use std::sync::{Arc, RwLock};
use std::{fmt, result};
use common::HasLen;
use fail::fail_point;
use super::FileHandle;
use crate::core::META_FILEPATH;
@@ -184,7 +183,7 @@ impl Directory for RamDirectory {
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
fail_point!("RamDirectory::delete", |_| {
crate::fail_point!("RamDirectory::delete", |_| {
Err(DeleteError::IoError {
io_error: Arc::new(io::Error::from(io::ErrorKind::Other)),
filepath: path.to_path_buf(),

View File

@@ -6,7 +6,6 @@ use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use fail::fail_point;
use rayon::{ThreadPool, ThreadPoolBuilder};
use super::segment_manager::SegmentManager;
@@ -43,7 +42,7 @@ pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate:
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
crate::fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
std::io::Error::new(
std::io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())

View File

@@ -15,7 +15,7 @@ use crate::postings::{
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Document, Opstamp, SegmentComponent};
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
/// Computes the initial size of the hash table.
///
@@ -98,14 +98,18 @@ impl SegmentWriter {
}
_ => None,
};
text_options
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
tokenizer_manager.get(tokenizer_name)
})
.unwrap_or_default()
let tokenizer_name = text_options
.map(|text_index_option| text_index_option.tokenizer())
.unwrap_or("default");
tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::SchemaError(format!(
"Error getting tokenizer for field: {}",
field_entry.name()
))
})
})
.collect();
.collect::<Result<Vec<_>, _>>()?;
Ok(SegmentWriter {
max_doc: 0,
ctx: IndexingContext::new(table_size),
@@ -205,7 +209,7 @@ impl SegmentWriter {
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
PreTokenizedStream::from(tok_str.clone()).into()
Box::new(PreTokenizedStream::from(tok_str.clone()))
}
Value::Str(ref text) => {
let text_analyzer =
@@ -438,7 +442,9 @@ fn remap_and_write(
#[cfg(test)]
mod tests {
use std::path::Path;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
use super::compute_initial_table_size;
use crate::collector::Count;
@@ -446,7 +452,9 @@ mod tests {
use crate::directory::RamDirectory;
use crate::postings::TermInfo;
use crate::query::PhraseQuery;
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
use crate::schema::{
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
@@ -900,4 +908,32 @@ mod tests {
postings.positions(&mut positions);
assert_eq!(positions, &[4]); //< as opposed to 3 if we had a position length of 1.
}
#[test]
fn test_show_error_when_tokenizer_not_registered() {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("custom_en")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
Index::create_in_dir(&tempdir_path, schema).unwrap();
let index = Index::open_in_dir(tempdir_path).unwrap();
let schema = index.schema();
let mut index_writer = index.writer(50_000_000).unwrap();
let title = schema.get_field("title").unwrap();
let mut document = Document::default();
document.add_text(title, "The Old Man and the Sea");
index_writer.add_document(document).unwrap();
let error = index_writer.commit().unwrap_err();
assert_eq!(
error.to_string(),
"Schema error: 'Error getting tokenizer for field: title'"
);
}
}

View File

@@ -101,6 +101,7 @@ mod test {
use super::Stamper;
#[allow(clippy::redundant_clone)]
#[test]
fn test_stamper() {
let stamper = Stamper::new(7u64);
@@ -116,6 +117,7 @@ mod test {
assert_eq!(stamper.stamp(), 15u64);
}
#[allow(clippy::redundant_clone)]
#[test]
fn test_stamper_revert() {
let stamper = Stamper::new(7u64);

View File

@@ -299,6 +299,35 @@ pub struct DocAddress {
pub doc_id: DocId,
}
#[macro_export]
/// Enable fail_point if feature is enabled.
macro_rules! fail_point {
($name:expr) => {{
#[cfg(feature = "failpoints")]
{
fail::eval($name, |_| {
panic!("Return is not supported for the fail point \"{}\"", $name);
});
}
}};
($name:expr, $e:expr) => {{
#[cfg(feature = "failpoints")]
{
if let Some(res) = fail::eval($name, $e) {
return res;
}
}
}};
($name:expr, $cond:expr, $e:expr) => {{
#[cfg(feature = "failpoints")]
{
if $cond {
fail::fail_point!($name, $e);
}
}
}};
}
#[cfg(test)]
pub mod tests {
use common::{BinarySerializable, FixedSize};
@@ -876,8 +905,8 @@ pub mod tests {
}"#,
)
.unwrap();
let doc = doc!(json_field=>json_val.clone());
let index = Index::create_in_ram(schema.clone());
let doc = doc!(json_field=>json_val);
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
writer.commit().unwrap();

View File

@@ -2,7 +2,6 @@ use std::cmp::Ordering;
use std::io::{self, Write};
use common::{BinarySerializable, CountingWriter, VInt};
use fail::fail_point;
use super::TermInfo;
use crate::core::Segment;
@@ -205,7 +204,7 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
crate::fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
Err(io::Error::new(io::ErrorKind::Other, format!("{msg:?}")))
});
if self.term_open {

View File

@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::{DocAddress, Result, Searcher, TantivyError};
#[derive(Debug, PartialEq)]
@@ -206,8 +204,7 @@ impl MoreLikeThis {
for value in values {
match value {
Value::PreTokStr(tok_str) => {
let mut token_stream: BoxTokenStream =
PreTokenizedStream::from(tok_str.clone()).into();
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);

View File

@@ -139,7 +139,7 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
pub use tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::TextAnalyzer;
pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;

View File

@@ -1,36 +1,105 @@
use dyn_clone::DynClone;
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
use tokenizer_api::{TokenFilter, TokenStream, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
#[derive(Clone)]
pub struct TextAnalyzer {
tokenizer: Box<dyn BoxableTokenizer>,
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
trait BoxableTokenizer: 'static + Send + Sync {
trait BoxableTokenizer: 'static + Send + Sync + DynClone {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a>;
}
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
self.token_stream(text).into()
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
Box::new(self.token_stream(text))
}
}
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
dyn_clone::clone_trait_object!(BoxableTokenizer);
/// A boxed `BoxableTokenizer` which is a `Tokenizer` with its `TokenStream` type erased.
#[derive(Clone)]
struct BoxTokenizer(Box<dyn BoxableTokenizer>);
impl Tokenizer for BoxTokenizer {
type TokenStream<'a> = Box<dyn TokenStream + 'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.0.box_token_stream(text).into()
}
}
/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
trait BoxableTokenFilter: 'static + Send + Sync {
/// Wraps a `BoxedTokenizer` and returns a new one.
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer;
}
impl<T: TokenFilter> BoxableTokenFilter for T {
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer {
let tokenizer = self.clone().transform(tokenizer);
BoxTokenizer(Box::new(tokenizer))
}
}
/// A boxed `BoxableTokenFilter` which is a `TokenFilter` with its `Tokenizer` type erased.
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
impl TextAnalyzer {
/// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
/// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
/// will be more performant and create less boxes.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::build(
/// SimpleTokenizer::default(),
/// vec![
/// BoxTokenFilter::from(RemoveLongFilter::limit(40)),
/// BoxTokenFilter::from(LowerCaser),
/// BoxTokenFilter::from(Stemmer::default()),
/// ]);
/// ```
pub fn build<T: Tokenizer>(
tokenizer: T,
boxed_token_filters: Vec<BoxTokenFilter>,
) -> TextAnalyzer {
let mut boxed_tokenizer = BoxTokenizer(Box::new(tokenizer));
for filter in boxed_token_filters.into_iter() {
boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
}
TextAnalyzer {
tokenizer: boxed_tokenizer.0,
}
}
/// Create a new TextAnalyzerBuilder
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
self.tokenizer.box_token_stream(text)
}
}
@@ -46,20 +115,8 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
}
}
impl TextAnalyzer {
/// Create a new TextAnalyzerBuilder
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
self.tokenizer.box_token_stream(text)
}
}
/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T> {
pub struct TextAnalyzerBuilder<T: Tokenizer> {
tokenizer: T,
}
@@ -90,3 +147,37 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
#[test]
fn test_text_analyzer_builder() {
let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
.filter(AlphaNumOnlyFilter)
.filter(RemoveLongFilter::limit(6))
.filter(LowerCaser)
.build();
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "point");
}
#[test]
fn test_text_analyzer_with_filters_boxed() {
let mut analyzer = TextAnalyzer::build(
WhitespaceTokenizer::default(),
vec![
BoxTokenFilter::from(AlphaNumOnlyFilter),
BoxTokenFilter::from(LowerCaser),
BoxTokenFilter::from(RemoveLongFilter::limit(6)),
],
);
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "point");
}
}

View File

@@ -7,7 +7,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
keywords = ["search", "information", "retrieval", "sstable"]
categories = ["database-implementations", "data-structures", "compression"]
desciption = "sstables for tantivy"
description = "sstables for tantivy"
[dependencies]
common = {version= "0.5", path="../common", package="tantivy-common"}

View File

@@ -44,7 +44,7 @@ pub fn fast_short_slice_copy(src: &[u8], dst: &mut [u8]) {
return;
}
/// The code will use the vmovdqu instruction to copy 32 bytes at a time.
// The code will use the vmovdqu instruction to copy 32 bytes at a time.
#[cfg(target_feature = "avx")]
{
if len <= 64 {

View File

@@ -6,7 +6,6 @@
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};
use serde::{Deserialize, Serialize};
@@ -60,30 +59,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -137,7 +112,7 @@ pub trait TokenStream {
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync {
pub trait TokenFilter: 'static + Send + Sync + Clone {
/// The Tokenizer type returned by this filter, typically parametrized by the underlying
/// Tokenizer.
type Tokenizer<T: Tokenizer>: Tokenizer;