Compare commits


1 commit

Author: Pascal Seitz
SHA1: 4f2e810b83
Message: add From impl to BoxTokenStream, Bump tokenizer-api version
Date: 2023-06-23 18:54:03 +08:00
20 changed files with 172 additions and 469 deletions

View File

@@ -19,7 +19,6 @@ oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
dyn-clone = "1.0.11"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
@@ -45,7 +44,7 @@ census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
fail = "0.5.0"
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
@@ -113,7 +112,7 @@ lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"]
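
Note: the two variants of the `fail` dependency and of the `failpoints` feature above differ only in whether `fail` is optional. As a minimal sketch (not code from this commit; the point name and error text are illustrative), this is how such a fail point is typically declared and then triggered from a test once the crate's `failpoints` feature forwards to `fail/failpoints`:

use fail::fail_point;

fn flaky_write() -> std::io::Result<()> {
    // Expands to a no-op unless the `failpoints` feature of the `fail` crate is enabled.
    fail_point!("flaky_write", |_| Err(std::io::Error::new(
        std::io::ErrorKind::Other,
        "injected failure",
    )));
    Ok(())
}

#[cfg(feature = "failpoints")]
#[test]
fn simulate_io_error() {
    // Activate the point at runtime, exercise the code path, then clean up.
    fail::cfg("flaky_write", "return").unwrap();
    assert!(flaky_write().is_err());
    fail::remove("flaky_write");
}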

View File

@@ -1,6 +1,6 @@
use std::ops::RangeInclusive;
#[cfg(target_arch = "x86_64")]
#[cfg(any(target_arch = "x86_64"))]
mod avx2;
mod scalar;
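
Note: the hunk above only toggles between `#[cfg(target_arch = "x86_64")]` and the equivalent `#[cfg(any(target_arch = "x86_64"))]`. For context, a common way to pair an architecture-gated SIMD module with a scalar fallback looks like the following sketch (illustrative code, not this crate's implementation; whether dispatch here happens at compile time or at runtime is not shown in the hunk):

#[cfg(target_arch = "x86_64")]
mod avx2 {
    pub fn sum(values: &[u32]) -> u64 {
        // Placeholder body; a real implementation would use AVX2 intrinsics.
        values.iter().map(|&v| u64::from(v)).sum()
    }
}

mod scalar {
    pub fn sum(values: &[u32]) -> u64 {
        values.iter().map(|&v| u64::from(v)).sum()
    }
}

pub fn sum(values: &[u32]) -> u64 {
    #[cfg(target_arch = "x86_64")]
    {
        // Runtime check: fall through to the scalar path on CPUs without AVX2.
        if is_x86_feature_detected!("avx2") {
            return avx2::sum(values);
        }
    }
    scalar::sum(values)
}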

View File

@@ -5,7 +5,7 @@ edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
description = "column oriented storage for tantivy"
desciption = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]

View File

@@ -6,14 +6,12 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
Warmer,
doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
SegmentReader, Warmer,
};
// This example shows how warmers can be used to
// load values from an external sources and
// tie their lifecycle to that of the index segments
// using the Warmer API.
// load a values from an external sources using the Warmer API.
//
// In this example, we assume an e-commerce search engine.
@@ -25,11 +23,9 @@ pub trait PriceFetcher: Send + Sync + 'static {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}
type SegmentKey = (SegmentId, Option<Opstamp>);
struct DynamicPriceColumn {
field: String,
price_cache: RwLock<HashMap<SegmentKey, Arc<Vec<Price>>>>,
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
price_fetcher: Box<dyn PriceFetcher>,
}
@@ -50,6 +46,7 @@ impl DynamicPriceColumn {
impl Warmer for DynamicPriceColumn {
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
for segment in searcher.segment_readers() {
let key = (segment.segment_id(), segment.delete_opstamp());
let product_id_reader = segment
.fast_fields()
.u64(&self.field)?
@@ -58,40 +55,37 @@ impl Warmer for DynamicPriceColumn {
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc))
.collect();
let mut prices = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let prices: Vec<Price> = (0..segment.max_doc())
.map(|doc| {
if !segment.is_deleted(doc) {
prices.next().unwrap()
} else {
0
}
})
.collect();
let key = (segment.segment_id(), segment.delete_opstamp());
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();
for doc in 0..segment.max_doc() {
if segment.is_deleted(doc) {
price_vals.push(0);
} else {
price_vals.push(prices_it.next().unwrap())
}
}
self.price_cache
.write()
.unwrap()
.insert(key, Arc::new(prices));
.insert(key, Arc::new(price_vals));
}
Ok(())
}
fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
let live_keys: HashSet<SegmentKey> = live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
let mut price_cache_wrt = self.price_cache.write().unwrap();
// let price_cache = std::mem::take(&mut *price_cache_wrt);
// Drain would be nicer here.
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
.into_iter()
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
.collect();
self.price_cache
.write()
.unwrap()
.retain(|key, _| live_keys.contains(key));
}
}
@@ -106,17 +100,17 @@ pub struct ExternalPriceTable {
impl ExternalPriceTable {
pub fn update_price(&self, product_id: ProductId, price: Price) {
self.prices.write().unwrap().insert(product_id, price);
let mut prices_wrt = self.prices.write().unwrap();
prices_wrt.insert(product_id, price);
}
}
impl PriceFetcher for ExternalPriceTable {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
let prices = self.prices.read().unwrap();
let prices_read = self.prices.read().unwrap();
product_ids
.iter()
.map(|product_id| prices.get(product_id).cloned().unwrap_or(0))
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
.collect()
}
}
@@ -149,8 +143,11 @@ fn main() -> tantivy::Result<()> {
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
writer.commit()?;
let warmers = vec![Arc::downgrade(&price_dynamic_column) as Weak<dyn Warmer>];
let reader = index.reader_builder().warmers(warmers).try_into()?;
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
)];
let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
reader.reload()?;
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser.parse_query("cooking")?;
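
Note: both forms of the warmer registration above are equivalent; warmers are handed to the reader builder as weak references so that dropping the owning `Arc` unregisters them. A minimal sketch of the wiring, assuming some `Arc<dyn Warmer>` built elsewhere:

use std::sync::{Arc, Weak};
use tantivy::{Index, IndexReader, Warmer};

fn reader_with_warmer(index: &Index, warmer: Arc<dyn Warmer>) -> tantivy::Result<IndexReader> {
    // Weak references: the reader does not keep the warmer alive on its own.
    let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(&warmer)];
    let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
    Ok(reader)
}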

View File

@@ -6,35 +6,32 @@
//
// Of course, you can have a look at the tantivy's built-in collectors
// such as the `CountCollector` for more examples.
use std::fmt::Debug;
use std::marker::PhantomData;
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
// ---
// Importing tantivy...
use std::marker::PhantomData;
use std::sync::Arc;
use columnar::{ColumnValues, DynamicColumn, HasAssociatedColumnType};
use crate::collector::{Collector, SegmentCollector};
use crate::schema::Field;
use crate::{DocId, Score, SegmentReader, TantivyError};
use crate::{Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate.
///
/// Only the documents containing at least one value for which the predicate returns `true`
/// will be passed on to the next collector.
///
/// In other words,
/// - documents with no values are filtered out.
/// - documents with several values are accepted if at least one value matches the predicate.
///
/// Only the documents for which the predicate returned "true" will be passed on to the next
/// collector.
///
/// ```rust
/// use tantivy::collector::{TopDocs, FilterCollector};
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT, FAST};
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", FAST);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
@@ -50,24 +47,20 @@ use crate::{DocId, Score, SegmentReader, TantivyError};
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
/// # Ok(())
/// # }
/// ```
///
/// Note that this is limited to fast fields which implement the
/// [`FastValue`][crate::fastfield::FastValue] trait, e.g. `u64` but not `&[u8]`.
/// To filter based on a bytes fast field, use a [`BytesFilterCollector`] instead.
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
where TPredicate: 'static + Clone
{
field: Field,
@@ -76,15 +69,19 @@ where TPredicate: 'static + Clone
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate, TPredicateValue>
impl<TCollector, TPredicate, TPredicateValue: Default>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
{
/// Create a new `FilterCollector`.
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
/// Create a new FilterCollector.
pub fn new(
field: Field,
predicate: TPredicate,
collector: TCollector,
) -> FilterCollector<TCollector, TPredicate, TPredicateValue> {
FilterCollector {
field,
predicate,
collector,
@@ -93,7 +90,7 @@ where
}
}
impl<TCollector, TPredicate, TPredicateValue> Collector
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
@@ -101,6 +98,8 @@ where
TPredicateValue: HasAssociatedColumnType,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
// That's the type of our result.
// Our standard deviation will be a float.
type Fruit = TCollector::Fruit;
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
@@ -109,7 +108,7 @@ where
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>> {
let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
@@ -119,16 +118,16 @@ where
)));
}
let column_opt = segment_reader
let fast_field_reader = segment_reader
.fast_fields()
.column_opt(field_entry.name())?;
.column_first_or_default(schema.get_field_name(self.field))?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(FilterSegmentCollector {
column_opt,
fast_field_reader,
segment_collector,
predicate: self.predicate.clone(),
t_predicate_value: PhantomData,
@@ -147,208 +146,35 @@ where
}
}
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue> {
column_opt: Option<Column<TPredicateValue>>,
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
fast_field_reader: Arc<dyn ColumnValues<TPredicateValue>>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TSegmentCollector, TPredicate, TPredicateValue>
FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicateValue: PartialOrd + Copy + Debug + Send + Sync + 'static,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
{
#[inline]
fn accept_document(&self, doc_id: DocId) -> bool {
if let Some(column) = &self.column_opt {
for val in column.values_for_doc(doc_id) {
if (self.predicate)(val) {
return true;
}
}
}
false
}
}
impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicateValue: HasAssociatedColumnType,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync, /* DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>> */
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
if self.accept_document(doc) {
self.segment_collector.collect(doc, score);
let value = self.fast_field_reader.get_val(doc);
if (self.predicate)(value) {
self.segment_collector.collect(doc, score)
}
}
fn harvest(self) -> TSegmentCollector::Fruit {
self.segment_collector.harvest()
}
}
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
/// it transparently wraps an inner [`Collector`] but filters documents
/// based on the result of applying the predicate to the bytes fast field.
///
/// A document is accepted if and only if the predicate returns `true` for at least one value.
///
/// In other words,
/// - documents with no values are filtered out.
/// - documents with several values are accepted if at least one value matches the predicate.
///
/// ```rust
/// use tantivy::collector::{TopDocs, BytesFilterCollector};
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let barcode = schema_builder.add_bytes_field("barcode", FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind", barcode => &b"010101"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib", barcode => &b"110011"[..]))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow", barcode => &b"110111"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", barcode => &b"011101"[..]))?;
/// index_writer.add_document(doc!(title => "Bridget Jones's Diary"))?;
/// index_writer.commit()?;
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 3));
/// # Ok(())
/// # }
/// ```
pub struct BytesFilterCollector<TCollector, TPredicate>
where TPredicate: 'static + Clone
{
field: Field,
collector: TCollector,
predicate: TPredicate,
}
impl<TCollector, TPredicate> BytesFilterCollector<TCollector, TPredicate>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
{
/// Create a new `BytesFilterCollector`.
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
collector,
}
}
}
impl<TCollector, TPredicate> Collector for BytesFilterCollector<TCollector, TPredicate>
where
TCollector: Collector + Send + Sync,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync + Clone,
{
type Fruit = TCollector::Fruit;
type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let schema = segment_reader.schema();
let field_name = schema.get_field_name(self.field);
let column_opt = segment_reader.fast_fields().bytes(field_name)?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(BytesFilterSegmentCollector {
column_opt,
segment_collector,
predicate: self.predicate.clone(),
buffer: Vec::new(),
})
}
fn requires_scoring(&self) -> bool {
self.collector.requires_scoring()
}
fn merge_fruits(
&self,
segment_fruits: Vec<<TCollector::Child as SegmentCollector>::Fruit>,
) -> crate::Result<TCollector::Fruit> {
self.collector.merge_fruits(segment_fruits)
}
}
pub struct BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where TPredicate: 'static
{
column_opt: Option<BytesColumn>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
buffer: Vec<u8>,
}
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
{
#[inline]
fn accept_document(&mut self, doc_id: DocId) -> bool {
if let Some(column) = &self.column_opt {
for ord in column.term_ords(doc_id) {
self.buffer.clear();
let found = column.ord_to_bytes(ord, &mut self.buffer).unwrap_or(false);
if found && (self.predicate)(&self.buffer) {
return true;
}
}
}
false
}
}
impl<TSegmentCollector, TPredicate> SegmentCollector
for BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
{
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
if self.accept_document(doc) {
self.segment_collector.collect(doc, score);
}
}
fn harvest(self) -> TSegmentCollector::Fruit {
fn harvest(self) -> <TSegmentCollector as SegmentCollector>::Fruit {
self.segment_collector.harvest()
}
}
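
Note: the two `FilterSegmentCollector` variants above differ in how they read the fast field: one fetches a single value per document (`column_first_or_default` plus `get_val`), the other iterates all values per document (`column_opt` plus `values_for_doc`). The multi-value acceptance rule described in the doc comment reduces to the following standalone sketch (same `columnar::Column` API as above; the free function is illustrative):

use columnar::Column;

/// A doc is accepted if any of its fast-field values satisfies the predicate;
/// docs with no value, or segments without the column, are rejected.
fn accept<T>(column: Option<&Column<T>>, predicate: impl Fn(T) -> bool, doc: u32) -> bool
where T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static
{
    match column {
        Some(col) => col.values_for_doc(doc).any(|val| predicate(val)),
        None => false,
    }
}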

View File

@@ -112,7 +112,7 @@ mod docset_collector;
pub use self::docset_collector::DocSetCollector;
mod filter_collector_wrapper;
pub use self::filter_collector_wrapper::{BytesFilterCollector, FilterCollector};
pub use self::filter_collector_wrapper::FilterCollector;
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.

View File

@@ -2,6 +2,8 @@ use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::{fmt, io};
use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
@@ -149,7 +151,7 @@ impl SegmentReader {
let store_file = segment.open_read(SegmentComponent::Store)?;
crate::fail_point!("SegmentReader::open#middle");
fail_point!("SegmentReader::open#middle");
let postings_file = segment.open_read(SegmentComponent::Postings)?;
let postings_composite = CompositeFile::open(&postings_file)?;

View File

@@ -5,6 +5,7 @@ use std::sync::{Arc, RwLock};
use std::{fmt, result};
use common::HasLen;
use fail::fail_point;
use super::FileHandle;
use crate::core::META_FILEPATH;
@@ -183,7 +184,7 @@ impl Directory for RamDirectory {
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
crate::fail_point!("RamDirectory::delete", |_| {
fail_point!("RamDirectory::delete", |_| {
Err(DeleteError::IoError {
io_error: Arc::new(io::Error::from(io::ErrorKind::Other)),
filepath: path.to_path_buf(),

View File

@@ -6,6 +6,7 @@ use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use fail::fail_point;
use rayon::{ThreadPool, ThreadPoolBuilder};
use super::segment_manager::SegmentManager;
@@ -42,7 +43,7 @@ pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate:
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
crate::fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
std::io::Error::new(
std::io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())

View File

@@ -15,7 +15,7 @@ use crate::postings::{
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
use crate::{DocId, Document, Opstamp, SegmentComponent};
/// Computes the initial size of the hash table.
///
@@ -98,18 +98,14 @@ impl SegmentWriter {
}
_ => None,
};
let tokenizer_name = text_options
.map(|text_index_option| text_index_option.tokenizer())
.unwrap_or("default");
tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::SchemaError(format!(
"Error getting tokenizer for field: {}",
field_entry.name()
))
})
text_options
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
tokenizer_manager.get(tokenizer_name)
})
.unwrap_or_default()
})
.collect::<Result<Vec<_>, _>>()?;
.collect();
Ok(SegmentWriter {
max_doc: 0,
ctx: IndexingContext::new(table_size),
@@ -209,7 +205,7 @@ impl SegmentWriter {
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
Box::new(PreTokenizedStream::from(tok_str.clone()))
PreTokenizedStream::from(tok_str.clone()).into()
}
Value::Str(ref text) => {
let text_analyzer =
@@ -442,9 +438,7 @@ fn remap_and_write(
#[cfg(test)]
mod tests {
use std::path::{Path, PathBuf};
use tempfile::TempDir;
use std::path::Path;
use super::compute_initial_table_size;
use crate::collector::Count;
@@ -452,9 +446,7 @@ mod tests {
use crate::directory::RamDirectory;
use crate::postings::TermInfo;
use crate::query::PhraseQuery;
use crate::schema::{
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
};
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
@@ -908,32 +900,4 @@ mod tests {
postings.positions(&mut positions);
assert_eq!(positions, &[4]); //< as opposed to 3 if we had a position length of 1.
}
#[test]
fn test_show_error_when_tokenizer_not_registered() {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("custom_en")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
Index::create_in_dir(&tempdir_path, schema).unwrap();
let index = Index::open_in_dir(tempdir_path).unwrap();
let schema = index.schema();
let mut index_writer = index.writer(50_000_000).unwrap();
let title = schema.get_field("title").unwrap();
let mut document = Document::default();
document.add_text(title, "The Old Man and the Sea");
index_writer.add_document(document).unwrap();
let error = index_writer.commit().unwrap_err();
assert_eq!(
error.to_string(),
"Schema error: 'Error getting tokenizer for field: title'"
);
}
}
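
Note: the segment-writer hunk above switches between two tokenizer-lookup strategies: returning a `SchemaError` when a field's tokenizer is not registered, versus silently falling back to the default analyzer via `unwrap_or_default()`. A standalone sketch of the two strategies (hypothetical helper names; the error message is illustrative):

use tantivy::tokenizer::{TextAnalyzer, TokenizerManager};
use tantivy::TantivyError;

fn resolve_strict(manager: &TokenizerManager, name: &str) -> tantivy::Result<TextAnalyzer> {
    // Surface a schema error if the tokenizer was never registered.
    manager
        .get(name)
        .ok_or_else(|| TantivyError::SchemaError(format!("tokenizer `{name}` is not registered")))
}

fn resolve_lenient(manager: &TokenizerManager, name: &str) -> TextAnalyzer {
    // Fall back to the default `TextAnalyzer` instead of failing.
    manager.get(name).unwrap_or_default()
}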

View File

@@ -101,7 +101,6 @@ mod test {
use super::Stamper;
#[allow(clippy::redundant_clone)]
#[test]
fn test_stamper() {
let stamper = Stamper::new(7u64);
@@ -117,7 +116,6 @@ mod test {
assert_eq!(stamper.stamp(), 15u64);
}
#[allow(clippy::redundant_clone)]
#[test]
fn test_stamper_revert() {
let stamper = Stamper::new(7u64);

View File

@@ -299,35 +299,6 @@ pub struct DocAddress {
pub doc_id: DocId,
}
#[macro_export]
/// Enable fail_point if feature is enabled.
macro_rules! fail_point {
($name:expr) => {{
#[cfg(feature = "failpoints")]
{
fail::eval($name, |_| {
panic!("Return is not supported for the fail point \"{}\"", $name);
});
}
}};
($name:expr, $e:expr) => {{
#[cfg(feature = "failpoints")]
{
if let Some(res) = fail::eval($name, $e) {
return res;
}
}
}};
($name:expr, $cond:expr, $e:expr) => {{
#[cfg(feature = "failpoints")]
{
if $cond {
fail::fail_point!($name, $e);
}
}
}};
}
#[cfg(test)]
pub mod tests {
use common::{BinarySerializable, FixedSize};
@@ -905,8 +876,8 @@ pub mod tests {
}"#,
)
.unwrap();
let doc = doc!(json_field=>json_val);
let index = Index::create_in_ram(schema);
let doc = doc!(json_field=>json_val.clone());
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
writer.commit().unwrap();

View File

@@ -2,6 +2,7 @@ use std::cmp::Ordering;
use std::io::{self, Write};
use common::{BinarySerializable, CountingWriter, VInt};
use fail::fail_point;
use super::TermInfo;
use crate::core::Segment;
@@ -204,7 +205,7 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
crate::fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
Err(io::Error::new(io::ErrorKind::Other, format!("{msg:?}")))
});
if self.term_open {

View File

@@ -4,7 +4,9 @@ use std::collections::{BinaryHeap, HashMap};
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::tokenizer::{
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
};
use crate::{DocAddress, Result, Searcher, TantivyError};
#[derive(Debug, PartialEq)]
@@ -204,7 +206,8 @@ impl MoreLikeThis {
for value in values {
match value {
Value::PreTokStr(tok_str) => {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
let mut token_stream: BoxTokenStream =
PreTokenizedStream::from(tok_str.clone()).into();
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
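
Note: the `more_like_this` hunk above relies on the blanket `From<T: TokenStream>` conversion into `BoxTokenStream` named in the commit message. As a minimal sketch (hypothetical helper function):

use tantivy::tokenizer::{BoxTokenStream, PreTokenizedStream, PreTokenizedString};

fn boxed_stream(pre_tokenized: &PreTokenizedString) -> BoxTokenStream<'static> {
    // `PreTokenizedStream` implements `TokenStream`, so `.into()` boxes it.
    PreTokenizedStream::from(pre_tokenized.clone()).into()
}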

View File

@@ -139,7 +139,7 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
pub use tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};
pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer::TextAnalyzer;
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;

View File

@@ -1,106 +1,37 @@
use dyn_clone::DynClone;
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{TokenFilter, TokenStream, Tokenizer};
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
#[derive(Clone)]
pub struct TextAnalyzer {
tokenizer: Box<dyn BoxableTokenizer>,
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
trait BoxableTokenizer: 'static + Send + Sync + DynClone {
trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a>;
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
/// Clone this tokenizer.
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
Box::new(self.token_stream(text))
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
self.token_stream(text).into()
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())
}
}
dyn_clone::clone_trait_object!(BoxableTokenizer);
/// A boxed `BoxableTokenizer` which is a `Tokenizer` with its `TokenStream` type erased.
#[derive(Clone)]
struct BoxTokenizer(Box<dyn BoxableTokenizer>);
impl Tokenizer for BoxTokenizer {
type TokenStream<'a> = Box<dyn TokenStream + 'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.0.box_token_stream(text).into()
}
}
/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
trait BoxableTokenFilter: 'static + Send + Sync {
/// Wraps a `BoxedTokenizer` and returns a new one.
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer;
}
impl<T: TokenFilter> BoxableTokenFilter for T {
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer {
let tokenizer = self.clone().transform(tokenizer);
BoxTokenizer(Box::new(tokenizer))
}
}
/// A boxed `BoxableTokenFilter` which is a `TokenFilter` with its `Tokenizer` type erased.
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
impl TextAnalyzer {
/// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
/// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
/// will be more performant and create less boxes.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::build(
/// SimpleTokenizer::default(),
/// vec![
/// BoxTokenFilter::from(RemoveLongFilter::limit(40)),
/// BoxTokenFilter::from(LowerCaser),
/// BoxTokenFilter::from(Stemmer::default()),
/// ]);
/// ```
pub fn build<T: Tokenizer>(
tokenizer: T,
boxed_token_filters: Vec<BoxTokenFilter>,
) -> TextAnalyzer {
let mut boxed_tokenizer = BoxTokenizer(Box::new(tokenizer));
for filter in boxed_token_filters.into_iter() {
boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
}
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: boxed_tokenizer.0,
tokenizer: self.tokenizer.box_clone(),
}
}
/// Create a new TextAnalyzerBuilder
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
self.tokenizer.box_token_stream(text)
}
}
impl Default for TextAnalyzer {
@@ -115,8 +46,20 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
}
}
impl TextAnalyzer {
/// Create a new TextAnalyzerBuilder
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
TextAnalyzerBuilder { tokenizer }
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
self.tokenizer.box_token_stream(text)
}
}
/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T: Tokenizer> {
pub struct TextAnalyzerBuilder<T> {
tokenizer: T,
}
@@ -147,37 +90,3 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
#[test]
fn test_text_analyzer_builder() {
let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
.filter(AlphaNumOnlyFilter)
.filter(RemoveLongFilter::limit(6))
.filter(LowerCaser)
.build();
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "point");
}
#[test]
fn test_text_analyzer_with_filters_boxed() {
let mut analyzer = TextAnalyzer::build(
WhitespaceTokenizer::default(),
vec![
BoxTokenFilter::from(AlphaNumOnlyFilter),
BoxTokenFilter::from(LowerCaser),
BoxTokenFilter::from(RemoveLongFilter::limit(6)),
],
);
let mut stream = analyzer.token_stream("- first bullet point");
assert_eq!(stream.next().unwrap().text, "first");
assert_eq!(stream.next().unwrap().text, "point");
}
}
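
Note: the two `BoxableTokenizer` variants above solve the same problem, cloning a type-erased tokenizer, either through the `dyn-clone` crate or through an explicit `box_clone` method. The latter pattern, reduced to a standalone sketch with illustrative names:

trait Boxable: 'static + Send + Sync {
    /// Clone behind the trait object, since `Clone` itself is not object safe.
    fn box_clone(&self) -> Box<dyn Boxable>;
}

impl<T: Clone + Send + Sync + 'static> Boxable for T {
    fn box_clone(&self) -> Box<dyn Boxable> {
        Box::new(self.clone())
    }
}

struct Holder {
    inner: Box<dyn Boxable>,
}

impl Clone for Holder {
    fn clone(&self) -> Self {
        Holder { inner: self.inner.box_clone() }
    }
}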

View File

@@ -7,7 +7,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
keywords = ["search", "information", "retrieval", "sstable"]
categories = ["database-implementations", "data-structures", "compression"]
description = "sstables for tantivy"
desciption = "sstables for tantivy"
[dependencies]
common = {version= "0.5", path="../common", package="tantivy-common"}

View File

@@ -44,7 +44,7 @@ pub fn fast_short_slice_copy(src: &[u8], dst: &mut [u8]) {
return;
}
// The code will use the vmovdqu instruction to copy 32 bytes at a time.
/// The code will use the vmovdqu instruction to copy 32 bytes at a time.
#[cfg(target_feature = "avx")]
{
if len <= 64 {
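
Note: the change above only toggles a `//` comment to a `///` doc comment on the AVX copy path. For context, `#[cfg(target_feature = "avx")]` is a compile-time gate: the branch only exists when the build itself targets AVX (for example via `-C target-cpu=native`), unlike runtime feature detection. A placeholder sketch, not this file's routine:

pub fn copy_short(src: &[u8], dst: &mut [u8]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_feature = "avx")]
    {
        // On AVX builds the compiler may emit 32-byte vmovdqu moves for this copy.
        dst[..src.len()].copy_from_slice(src);
        return;
    }
    #[allow(unreachable_code)]
    dst[..src.len()].copy_from_slice(src);
}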

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-tokenizer-api"
version = "0.1.0"
version = "0.1.1"
license = "MIT"
edition = "2021"
description = "Tokenizer API of tantivy"

View File

@@ -6,6 +6,7 @@
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};
use serde::{Deserialize, Serialize};
@@ -59,6 +60,36 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a> From<BoxTokenStream<'a>> for Box<dyn TokenStream + 'a> {
fn from(token_stream: BoxTokenStream<'a>) -> Self {
token_stream.0
}
}
impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -112,7 +143,7 @@ pub trait TokenStream {
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + Clone {
pub trait TokenFilter: 'static + Send + Sync {
/// The Tokenizer type returned by this filter, typically parametrized by the underlying
/// Tokenizer.
type Tokenizer<T: Tokenizer>: Tokenizer;
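
Note: per the commit message, the `From` impl for `BoxTokenStream` shown above is the headline change, alongside the tokenizer-api version bump; the `Deref`/`DerefMut` impls let the boxed stream be driven through the plain `TokenStream` API. A hedged, standalone sketch of what that buys a caller (`VecTokenStream` is a hypothetical helper, not part of tokenizer-api):

use tantivy::tokenizer::{BoxTokenStream, Token, TokenStream};

struct VecTokenStream {
    tokens: Vec<Token>,
    idx: usize,
}

impl TokenStream for VecTokenStream {
    fn advance(&mut self) -> bool {
        if self.idx < self.tokens.len() {
            self.idx += 1;
            true
        } else {
            false
        }
    }
    // Only valid after `advance()` has returned `true`.
    fn token(&self) -> &Token {
        &self.tokens[self.idx - 1]
    }
    fn token_mut(&mut self) -> &mut Token {
        &mut self.tokens[self.idx - 1]
    }
}

fn collect_texts(tokens: Vec<Token>) -> Vec<String> {
    // The blanket `From<T: TokenStream>` impl makes boxing a plain `.into()`.
    let mut stream: BoxTokenStream<'static> = VecTokenStream { tokens, idx: 0 }.into();
    let mut out = Vec::new();
    while stream.advance() {
        out.push(stream.token().text.clone());
    }
    out
}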