Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-07 17:42:55 +00:00.

Compare commits: `0.20`...`fmassot/ad` (21 commits)

Commits:

- dc783f8328
- b82cd08f5d
- 54f43135f2
- 6c6b97d4ef
- ad9b825067
- 44850e1036
- 3b0cbf8102
- 4aa131c3db
- 59962097d0
- ebc78127f3
- 8199aa7de7
- 657f0cd3bd
- 3a82ef2560
- 3546e7fc63
- 862f367f9e
- 14137d91c4
- 924fc70cb5
- 07023948aa
- 0cb53207ec
- 17c783b4db
- 7220df8a09
@@ -27,6 +27,7 @@ Tantivy 0.20 [Unreleased]
- [**breaking**] Drop JSON support on intermediate agg result (we use postcard as format in `quickwit` to send intermediate results) [#1992](https://github.com/quickwit-oss/tantivy/issues/1992) (@PSeitz)
- Set memory limit in bytes for aggregations after which they abort (Previously there was only the bucket limit) [#1942](https://github.com/quickwit-oss/tantivy/issues/1942)[#1957](https://github.com/quickwit-oss/tantivy/issues/1957)(@PSeitz)
- Add support for u64,i64,f64 fields in term aggregation [#1883](https://github.com/quickwit-oss/tantivy/issues/1883) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- Add count, min, max, and sum aggregations [#1794](https://github.com/quickwit-oss/tantivy/issues/1794) (@guilload)
- Switch to Aggregation without serde_untagged => better deserialization errors. [#2003](https://github.com/quickwit-oss/tantivy/issues/2003) (@PSeitz)
- Switch to ms in histogram for date type (ES compatibility) [#2045](https://github.com/quickwit-oss/tantivy/issues/2045) (@PSeitz)
@@ -39,10 +40,10 @@ Tantivy 0.20 [Unreleased]
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@Denis Bazhenov)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
- Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
- Faster term hash map [#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
- Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
@@ -51,7 +52,8 @@ Tantivy 0.20 [Unreleased]
- Make BM25 scoring more flexible [#1855](https://github.com/quickwit-oss/tantivy/issues/1855) (@alexcole)
- Switch fs2 to fs4 as it is now unmaintained and does not support illumos [#1944](https://github.com/quickwit-oss/tantivy/issues/1944) (@Toasterson)
- Made BooleanWeight and BoostWeight public [#1991](https://github.com/quickwit-oss/tantivy/issues/1991) (@fulmicoton)
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@Yukun Guo)
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@gyk)
- Add stop words for Hungarian language [#2069](https://github.com/quickwit-oss/tantivy/issues/2069) (@tnxbutno)
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.20.0"
version = "0.20.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,12 +12,14 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.62"
exclude = ["benches/*.json", "benches/*.txt"]

[dependencies]
oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
dyn-clone = "1.0.11"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
@@ -43,7 +45,7 @@ census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5.0"
fail = { version = "0.5.0", optional = true }
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
@@ -111,7 +113,7 @@ lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]

failpoints = ["fail/failpoints"]
failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.

quickwit = ["sstable", "futures-util"]
@@ -1,6 +1,6 @@
use std::ops::RangeInclusive;

#[cfg(any(target_arch = "x86_64"))]
#[cfg(target_arch = "x86_64")]
mod avx2;

mod scalar;
@@ -5,7 +5,7 @@ edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
desciption = "column oriented storage for tantivy"
description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]

[dependencies]
@@ -6,12 +6,14 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
    doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
    SegmentReader, Warmer,
    doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
    Warmer,
};

// This example shows how warmers can be used to
// load a values from an external sources using the Warmer API.
// load values from an external sources and
// tie their lifecycle to that of the index segments
// using the Warmer API.
//
// In this example, we assume an e-commerce search engine.

@@ -23,9 +25,11 @@ pub trait PriceFetcher: Send + Sync + 'static {
    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}

type SegmentKey = (SegmentId, Option<Opstamp>);

struct DynamicPriceColumn {
    field: String,
    price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
    price_cache: RwLock<HashMap<SegmentKey, Arc<Vec<Price>>>>,
    price_fetcher: Box<dyn PriceFetcher>,
}

@@ -46,7 +50,6 @@ impl DynamicPriceColumn {
impl Warmer for DynamicPriceColumn {
    fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
        for segment in searcher.segment_readers() {
            let key = (segment.segment_id(), segment.delete_opstamp());
            let product_id_reader = segment
                .fast_fields()
                .u64(&self.field)?
@@ -55,37 +58,40 @@ impl Warmer for DynamicPriceColumn {
                .doc_ids_alive()
                .map(|doc| product_id_reader.get_val(doc))
                .collect();
            let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
            let mut price_vals: Vec<Price> = Vec::new();
            for doc in 0..segment.max_doc() {
                if segment.is_deleted(doc) {
                    price_vals.push(0);
                } else {
                    price_vals.push(prices_it.next().unwrap())
                }
            }

            let mut prices = self.price_fetcher.fetch_prices(&product_ids).into_iter();

            let prices: Vec<Price> = (0..segment.max_doc())
                .map(|doc| {
                    if !segment.is_deleted(doc) {
                        prices.next().unwrap()
                    } else {
                        0
                    }
                })
                .collect();

            let key = (segment.segment_id(), segment.delete_opstamp());
            self.price_cache
                .write()
                .unwrap()
                .insert(key, Arc::new(price_vals));
                .insert(key, Arc::new(prices));
        }

        Ok(())
    }

    fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
        let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
            live_generations
                .iter()
                .flat_map(|gen| gen.segments())
                .map(|(&segment_id, &opstamp)| (segment_id, opstamp))
                .collect();
        let mut price_cache_wrt = self.price_cache.write().unwrap();
        // let price_cache = std::mem::take(&mut *price_cache_wrt);
        // Drain would be nicer here.
        *price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
            .into_iter()
            .filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
        let live_keys: HashSet<SegmentKey> = live_generations
            .iter()
            .flat_map(|gen| gen.segments())
            .map(|(&segment_id, &opstamp)| (segment_id, opstamp))
            .collect();

        self.price_cache
            .write()
            .unwrap()
            .retain(|key, _| live_keys.contains(key));
    }
}

@@ -100,17 +106,17 @@ pub struct ExternalPriceTable {

impl ExternalPriceTable {
    pub fn update_price(&self, product_id: ProductId, price: Price) {
        let mut prices_wrt = self.prices.write().unwrap();
        prices_wrt.insert(product_id, price);
        self.prices.write().unwrap().insert(product_id, price);
    }
}

impl PriceFetcher for ExternalPriceTable {
    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
        let prices_read = self.prices.read().unwrap();
        let prices = self.prices.read().unwrap();

        product_ids
            .iter()
            .map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
            .map(|product_id| prices.get(product_id).cloned().unwrap_or(0))
            .collect()
    }
}
@@ -143,11 +149,8 @@ fn main() -> tantivy::Result<()> {
    writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
    writer.commit()?;

    let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
        &(price_dynamic_column.clone() as Arc<dyn Warmer>),
    )];
    let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
    reader.reload()?;
    let warmers = vec![Arc::downgrade(&price_dynamic_column) as Weak<dyn Warmer>];
    let reader = index.reader_builder().warmers(warmers).try_into()?;

    let query_parser = QueryParser::for_index(&index, vec![text]);
    let query = query_parser.parse_query("cooking")?;
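To make the warmed prices usable from a collector at query time, the cache needs a read-side accessor keyed the same way the `warm` method keys its inserts. A minimal sketch of such a helper is shown below; `prices_for_segment` is a hypothetical name for illustration and is not part of the example diff above.

```rust
impl DynamicPriceColumn {
    /// Hypothetical accessor: returns the warmed price column for a segment,
    /// if `warm` has already run for this (segment id, delete opstamp) pair.
    pub fn prices_for_segment(&self, segment: &SegmentReader) -> Option<Arc<Vec<Price>>> {
        let key = (segment.segment_id(), segment.delete_opstamp());
        self.price_cache.read().unwrap().get(&key).cloned()
    }
}
```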
@@ -60,6 +60,8 @@ impl AggregationLimits {
    /// *bucket_limit*
    /// Limits the maximum number of buckets returned from an aggregation request.
    /// bucket_limit will default to `DEFAULT_BUCKET_LIMIT` (65000)
    ///
    /// Note: The returned instance contains a Arc shared counter to track memory consumption.
    pub fn new(memory_limit: Option<u64>, bucket_limit: Option<u32>) -> Self {
        Self {
            memory_consumption: Default::default(),
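A rough sketch of how these limits might be supplied when running an aggregation. The import paths and the `AggregationCollector::from_aggs` constructor are assumptions based on the 0.20 aggregation API and are not part of this diff; only `AggregationLimits::new(memory_limit, bucket_limit)` is shown above.

```rust
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::{AggregationCollector, AggregationLimits};
use tantivy::query::Query;
use tantivy::{Searcher, TantivyError};

/// Runs `agg_json` with a ~500 MB memory limit and the default bucket limit.
fn run_limited_agg(
    searcher: &Searcher,
    query: &dyn Query,
    agg_json: serde_json::Value,
) -> tantivy::Result<AggregationResults> {
    let agg_req: Aggregations = serde_json::from_value(agg_json)
        .map_err(|e| TantivyError::InvalidArgument(e.to_string()))?;
    // Aggregations abort with an error once they exceed this many bytes.
    let limits = AggregationLimits::new(Some(500_000_000), None);
    let collector = AggregationCollector::from_aggs(agg_req, limits);
    searcher.search(query, &collector)
}
```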
@@ -74,14 +74,14 @@ impl AggregationWithAccessor {
    ColumnType::I64,
    ColumnType::U64,
    ColumnType::F64,
    ColumnType::Bytes,
    ColumnType::Str,
    // ColumnType::Bytes Unsupported
    // ColumnType::Bool Unsupported
    // ColumnType::IpAddr Unsupported
    // ColumnType::DateTime Unsupported
];
let mut columns =
    get_all_ff_reader(reader, field_name, Some(&allowed_column_types))?;
    get_all_ff_reader_or_empty(reader, field_name, Some(&allowed_column_types))?;
let first = columns.pop().unwrap();
accessor2 = columns.pop();
first
@@ -177,7 +177,7 @@ fn get_ff_reader(
/// Get all fast field reader or empty as default.
///
/// Is guaranteed to return at least one column.
fn get_all_ff_reader(
fn get_all_ff_reader_or_empty(
    reader: &SegmentReader,
    field_name: &str,
    allowed_column_types: Option<&[ColumnType]>,
@@ -428,6 +428,12 @@ impl SegmentTermCollector {
    field_type: ColumnType,
    accessor_idx: usize,
) -> crate::Result<Self> {
    if field_type == ColumnType::Bytes || field_type == ColumnType::Bool {
        return Err(TantivyError::InvalidArgument(format!(
            "terms aggregation is not supported for column type {:?}",
            field_type
        )));
    }
    let term_buckets = TermBuckets::default();

    if let Some(custom_order) = req.order.as_ref() {
@@ -1500,4 +1506,41 @@ mod tests {

        Ok(())
    }

    #[test]
    fn terms_aggregation_bytes() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let bytes_field = schema_builder.add_bytes_field("bytes", FAST);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
            index_writer.set_merge_policy(Box::new(NoMergePolicy));
            index_writer.add_document(doc!(
                bytes_field => vec![1,2,3],
            ))?;
            index_writer.commit()?;
        }

        let agg_req: Aggregations = serde_json::from_value(json!({
            "my_texts": {
                "terms": {
                    "field": "bytes"
                },
            }
        }))
        .unwrap();

        let res = exec_request_with_query(agg_req, &index, None)?;

        // TODO: Returning an error would be better instead of an empty result, since this is not a
        // JSON field
        assert_eq!(
            res["my_texts"]["buckets"][0]["key"],
            serde_json::Value::Null
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

        Ok(())
    }
}
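By contrast with the `Bytes` case rejected above, terms aggregations over numeric fast fields are supported in 0.20 (changelog entry #1883). A sketch of such a request in the same style as the test; the `price` u64 fast field is an assumed example, and running it would reuse the `exec_request_with_query` helper internal to this test module.

```rust
// Sketch only: "price" is an assumed u64 FAST field, not part of the diff above.
let agg_req: Aggregations = serde_json::from_value(json!({
    "price_terms": {
        "terms": { "field": "price" }
    }
}))
.unwrap();
// let res = exec_request_with_query(agg_req, &index, None)?;
```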
@@ -161,6 +161,21 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// ]);
/// }
///
/// {
///     let mut facet_collector = FacetCollector::for_field("facet");
///     facet_collector.add_facet("/");
///     let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
///
///     // This lists all of the facet counts
///     let facets: Vec<(&Facet, u64)> = facet_counts
///         .get("/")
///         .collect();
///     assert_eq!(facets, vec![
///         (&Facet::from("/category"), 4),
///         (&Facet::from("/lang"), 4)
///     ]);
/// }
///
/// Ok(())
/// }
/// # assert!(example().is_ok());
@@ -285,6 +300,9 @@ fn is_child_facet(parent_facet: &[u8], possible_child_facet: &[u8]) -> bool {
    if !possible_child_facet.starts_with(parent_facet) {
        return false;
    }
    if parent_facet.is_empty() {
        return true;
    }
    possible_child_facet.get(parent_facet.len()).copied() == Some(0u8)
}

@@ -789,6 +807,15 @@ mod tests {
        );
        Ok(())
    }

    #[test]
    fn is_child_facet() {
        assert!(super::is_child_facet(&b"foo"[..], &b"foo\0bar"[..]));
        assert!(super::is_child_facet(&b""[..], &b"foo\0bar"[..]));
        assert!(super::is_child_facet(&b""[..], &b"foo"[..]));
        assert!(!super::is_child_facet(&b"foo\0bar"[..], &b"foo"[..]));
        assert!(!super::is_child_facet(&b"foo"[..], &b"foobar\0baz"[..]));
    }
}

#[cfg(all(test, feature = "unstable"))]
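The prefix check above relies on facet paths being stored with a `0u8` byte separating path segments, as the `foo\0bar` test cases show. A small standalone sketch of the same check (in tantivy the function is a private module helper, so this copy exists only for illustration):

```rust
// Standalone copy of the check for illustration; not the actual tantivy symbol.
fn is_child_facet(parent_facet: &[u8], possible_child_facet: &[u8]) -> bool {
    if !possible_child_facet.starts_with(parent_facet) {
        return false;
    }
    if parent_facet.is_empty() {
        // The root facet is encoded as an empty byte string and is everyone's parent.
        return true;
    }
    // A true child continues with the `\0` segment separator, which rules out
    // sibling facets that merely share a byte prefix (e.g. "foo" vs "foobar\0baz").
    possible_child_facet.get(parent_facet.len()).copied() == Some(0u8)
}

fn main() {
    assert!(is_child_facet(b"category", b"category\0electronics"));
    assert!(!is_child_facet(b"cat", b"category\0siamese"));
}
```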
@@ -6,32 +6,35 @@
|
||||
//
|
||||
// Of course, you can have a look at the tantivy's built-in collectors
|
||||
// such as the `CountCollector` for more examples.
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::{ColumnValues, DynamicColumn, HasAssociatedColumnType};
|
||||
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
use crate::{DocId, Score, SegmentReader, TantivyError};
|
||||
|
||||
/// The `FilterCollector` filters docs using a fast field value and a predicate.
|
||||
/// Only the documents for which the predicate returned "true" will be passed on to the next
|
||||
/// collector.
|
||||
///
|
||||
/// Only the documents containing at least one value for which the predicate returns `true`
|
||||
/// will be passed on to the next collector.
|
||||
///
|
||||
/// In other words,
|
||||
/// - documents with no values are filtered out.
|
||||
/// - documents with several values are accepted if at least one value matches the predicate.
|
||||
///
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::{TopDocs, FilterCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
|
||||
/// use tantivy::schema::{Schema, TEXT, FAST};
|
||||
/// use tantivy::{doc, DocAddress, Index};
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
|
||||
/// let price = schema_builder.add_u64_field("price", FAST);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
@@ -47,20 +50,24 @@ use crate::{Score, SegmentReader, TantivyError};
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
|
||||
/// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
|
||||
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 1);
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||
///
|
||||
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
|
||||
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
|
||||
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
|
||||
///
|
||||
/// assert_eq!(filtered_top_docs.len(), 0);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
|
||||
///
|
||||
/// Note that this is limited to fast fields which implement the
|
||||
/// [`FastValue`][crate::fastfield::FastValue] trait, e.g. `u64` but not `&[u8]`.
|
||||
/// To filter based on a bytes fast field, use a [`BytesFilterCollector`] instead.
|
||||
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
|
||||
where TPredicate: 'static + Clone
|
||||
{
|
||||
field: Field,
|
||||
@@ -69,19 +76,15 @@ where TPredicate: 'static + Clone
|
||||
t_predicate_value: PhantomData<TPredicateValue>,
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate, TPredicateValue: Default>
|
||||
impl<TCollector, TPredicate, TPredicateValue>
|
||||
FilterCollector<TCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
|
||||
{
|
||||
/// Create a new FilterCollector.
|
||||
pub fn new(
|
||||
field: Field,
|
||||
predicate: TPredicate,
|
||||
collector: TCollector,
|
||||
) -> FilterCollector<TCollector, TPredicate, TPredicateValue> {
|
||||
FilterCollector {
|
||||
/// Create a new `FilterCollector`.
|
||||
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
|
||||
Self {
|
||||
field,
|
||||
predicate,
|
||||
collector,
|
||||
@@ -90,7 +93,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
|
||||
impl<TCollector, TPredicate, TPredicateValue> Collector
|
||||
for FilterCollector<TCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
@@ -98,8 +101,6 @@ where
|
||||
TPredicateValue: HasAssociatedColumnType,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
{
|
||||
// That's the type of our result.
|
||||
// Our standard deviation will be a float.
|
||||
type Fruit = TCollector::Fruit;
|
||||
|
||||
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
|
||||
@@ -108,7 +109,7 @@ where
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>> {
|
||||
) -> crate::Result<Self::Child> {
|
||||
let schema = segment_reader.schema();
|
||||
let field_entry = schema.get_field_entry(self.field);
|
||||
if !field_entry.is_fast() {
|
||||
@@ -118,16 +119,16 @@ where
|
||||
)));
|
||||
}
|
||||
|
||||
let fast_field_reader = segment_reader
|
||||
let column_opt = segment_reader
|
||||
.fast_fields()
|
||||
.column_first_or_default(schema.get_field_name(self.field))?;
|
||||
.column_opt(field_entry.name())?;
|
||||
|
||||
let segment_collector = self
|
||||
.collector
|
||||
.for_segment(segment_local_id, segment_reader)?;
|
||||
|
||||
Ok(FilterSegmentCollector {
|
||||
fast_field_reader,
|
||||
column_opt,
|
||||
segment_collector,
|
||||
predicate: self.predicate.clone(),
|
||||
t_predicate_value: PhantomData,
|
||||
@@ -146,35 +147,208 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TPredicate: 'static,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
{
|
||||
fast_field_reader: Arc<dyn ColumnValues<TPredicateValue>>,
|
||||
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue> {
|
||||
column_opt: Option<Column<TPredicateValue>>,
|
||||
segment_collector: TSegmentCollector,
|
||||
predicate: TPredicate,
|
||||
t_predicate_value: PhantomData<TPredicateValue>,
|
||||
}
|
||||
|
||||
impl<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TPredicateValue: PartialOrd + Copy + Debug + Send + Sync + 'static,
|
||||
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
|
||||
{
|
||||
#[inline]
|
||||
fn accept_document(&self, doc_id: DocId) -> bool {
|
||||
if let Some(column) = &self.column_opt {
|
||||
for val in column.values_for_doc(doc_id) {
|
||||
if (self.predicate)(val) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
|
||||
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TSegmentCollector: SegmentCollector,
|
||||
TPredicateValue: HasAssociatedColumnType,
|
||||
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync, /* DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>> */
|
||||
{
|
||||
type Fruit = TSegmentCollector::Fruit;
|
||||
|
||||
fn collect(&mut self, doc: u32, score: Score) {
|
||||
let value = self.fast_field_reader.get_val(doc);
|
||||
if (self.predicate)(value) {
|
||||
self.segment_collector.collect(doc, score)
|
||||
if self.accept_document(doc) {
|
||||
self.segment_collector.collect(doc, score);
|
||||
}
|
||||
}
|
||||
|
||||
fn harvest(self) -> <TSegmentCollector as SegmentCollector>::Fruit {
|
||||
fn harvest(self) -> TSegmentCollector::Fruit {
|
||||
self.segment_collector.harvest()
|
||||
}
|
||||
}
|
||||
|
||||
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
|
||||
/// it transparently wraps an inner [`Collector`] but filters documents
|
||||
/// based on the result of applying the predicate to the bytes fast field.
|
||||
///
|
||||
/// A document is accepted if and only if the predicate returns `true` for at least one value.
|
||||
///
|
||||
/// In other words,
|
||||
/// - documents with no values are filtered out.
|
||||
/// - documents with several values are accepted if at least one value matches the predicate.
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::{TopDocs, BytesFilterCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::{Schema, TEXT, FAST};
|
||||
/// use tantivy::{doc, DocAddress, Index};
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let barcode = schema_builder.add_bytes_field("barcode", FAST);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind", barcode => &b"010101"[..]))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib", barcode => &b"110011"[..]))?;
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow", barcode => &b"110111"[..]))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", barcode => &b"011101"[..]))?;
|
||||
/// index_writer.add_document(doc!(title => "Bridget Jones's Diary"))?;
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
|
||||
/// let top_docs = searcher.search(&query, &filter_collector)?;
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 1);
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 3));
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct BytesFilterCollector<TCollector, TPredicate>
|
||||
where TPredicate: 'static + Clone
|
||||
{
|
||||
field: Field,
|
||||
collector: TCollector,
|
||||
predicate: TPredicate,
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate> BytesFilterCollector<TCollector, TPredicate>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
|
||||
{
|
||||
/// Create a new `BytesFilterCollector`.
|
||||
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
|
||||
Self {
|
||||
field,
|
||||
predicate,
|
||||
collector,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate> Collector for BytesFilterCollector<TCollector, TPredicate>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync + Clone,
|
||||
{
|
||||
type Fruit = TCollector::Fruit;
|
||||
|
||||
type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let schema = segment_reader.schema();
|
||||
let field_name = schema.get_field_name(self.field);
|
||||
|
||||
let column_opt = segment_reader.fast_fields().bytes(field_name)?;
|
||||
|
||||
let segment_collector = self
|
||||
.collector
|
||||
.for_segment(segment_local_id, segment_reader)?;
|
||||
|
||||
Ok(BytesFilterSegmentCollector {
|
||||
column_opt,
|
||||
segment_collector,
|
||||
predicate: self.predicate.clone(),
|
||||
buffer: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<<TCollector::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<TCollector::Fruit> {
|
||||
self.collector.merge_fruits(segment_fruits)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
|
||||
where TPredicate: 'static
|
||||
{
|
||||
column_opt: Option<BytesColumn>,
|
||||
segment_collector: TSegmentCollector,
|
||||
predicate: TPredicate,
|
||||
buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
|
||||
where
|
||||
TSegmentCollector: SegmentCollector,
|
||||
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
|
||||
{
|
||||
#[inline]
|
||||
fn accept_document(&mut self, doc_id: DocId) -> bool {
|
||||
if let Some(column) = &self.column_opt {
|
||||
for ord in column.term_ords(doc_id) {
|
||||
self.buffer.clear();
|
||||
|
||||
let found = column.ord_to_bytes(ord, &mut self.buffer).unwrap_or(false);
|
||||
|
||||
if found && (self.predicate)(&self.buffer) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl<TSegmentCollector, TPredicate> SegmentCollector
|
||||
for BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
|
||||
where
|
||||
TSegmentCollector: SegmentCollector,
|
||||
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
|
||||
{
|
||||
type Fruit = TSegmentCollector::Fruit;
|
||||
|
||||
fn collect(&mut self, doc: u32, score: Score) {
|
||||
if self.accept_document(doc) {
|
||||
self.segment_collector.collect(doc, score);
|
||||
}
|
||||
}
|
||||
|
||||
fn harvest(self) -> TSegmentCollector::Fruit {
|
||||
self.segment_collector.harvest()
|
||||
}
|
||||
}
|
||||
|
||||
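As a quick complement to the doc examples in the `FilterCollector` / `BytesFilterCollector` diff above: the wrapper works with any inner collector, not just `TopDocs`. A hedged sketch counting hits above a price threshold, using `FilterCollector::new(field, predicate, collector)` exactly as documented above; the `Count` collector and the function wrapper are illustrative assumptions.

```rust
use tantivy::collector::{Count, FilterCollector};
use tantivy::query::Query;
use tantivy::schema::Field;
use tantivy::Searcher;

/// Counts hits whose `price` fast field holds at least one value above `min_price`.
fn count_expensive(
    searcher: &Searcher,
    query: &dyn Query,
    price: Field,
    min_price: u64,
) -> tantivy::Result<usize> {
    // The closure is the predicate; docs with no value are filtered out,
    // docs with several values pass if any value matches.
    let collector = FilterCollector::new(price, move |value: u64| value > min_price, Count);
    searcher.search(query, &collector)
}
```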
@@ -112,7 +112,7 @@ mod docset_collector;
pub use self::docset_collector::DocSetCollector;

mod filter_collector_wrapper;
pub use self::filter_collector_wrapper::FilterCollector;
pub use self::filter_collector_wrapper::{BytesFilterCollector, FilterCollector};

/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
@@ -212,12 +212,12 @@ pub fn convert_to_fast_value_and_get_term(
        DateTime::from_utc(dt_utc),
    ));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
    return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
    return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
    return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
    return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
}
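The reordering above makes JSON term-query values try `i64` before `u64` (the full order being date, i64, u64, f64, bool, as the lib.rs test later in this compare states). A tiny self-contained sketch of that precedence for numeric strings; the `Inferred` enum is illustrative only and mirrors the parse order, not tantivy's actual types.

```rust
#[derive(Debug, PartialEq)]
enum Inferred {
    I64(i64),
    U64(u64),
    F64(f64),
}

fn infer(phrase: &str) -> Option<Inferred> {
    // i64 is tried before u64, so small integers become i64 terms.
    if let Ok(v) = phrase.parse::<i64>() {
        return Some(Inferred::I64(v));
    }
    if let Ok(v) = phrase.parse::<u64>() {
        return Some(Inferred::U64(v));
    }
    phrase.parse::<f64>().ok().map(Inferred::F64)
}

fn main() {
    // Small integers now map to i64 terms...
    assert_eq!(infer("4"), Some(Inferred::I64(4)));
    // ...while values beyond i64::MAX still fall back to u64,
    // matching the "json.titi:10000000000000000000" parser test in this compare.
    assert_eq!(
        infer("10000000000000000000"),
        Some(Inferred::U64(10_000_000_000_000_000_000))
    );
}
```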
@@ -2,8 +2,6 @@ use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::{fmt, io};

use fail::fail_point;

use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
@@ -151,7 +149,7 @@ impl SegmentReader {

let store_file = segment.open_read(SegmentComponent::Store)?;

fail_point!("SegmentReader::open#middle");
crate::fail_point!("SegmentReader::open#middle");

let postings_file = segment.open_read(SegmentComponent::Postings)?;
let postings_composite = CompositeFile::open(&postings_file)?;
@@ -1,10 +1,10 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{self, BufWriter, Read, Seek, Write};
|
||||
use std::ops::Deref;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
use std::{fmt, result};
|
||||
|
||||
use common::StableDeref;
|
||||
use fs4::FileExt;
|
||||
@@ -21,6 +21,7 @@ use crate::directory::{
|
||||
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
|
||||
WatchCallback, WatchHandle, WritePtr,
|
||||
};
|
||||
#[cfg(unix)]
|
||||
use crate::Advice;
|
||||
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
@@ -33,10 +34,7 @@ pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
|
||||
/// Returns `None` iff the file exists, can be read, but is empty (and hence
|
||||
/// cannot be mmapped)
|
||||
fn open_mmap(
|
||||
full_path: &Path,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> result::Result<Option<Mmap>, OpenReadError> {
|
||||
fn open_mmap(full_path: &Path) -> Result<Option<Mmap>, OpenReadError> {
|
||||
let file = File::open(full_path).map_err(|io_err| {
|
||||
if io_err.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.to_path_buf())
|
||||
@@ -59,9 +57,7 @@ fn open_mmap(
|
||||
.map(Some)
|
||||
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
|
||||
}?;
|
||||
if let (Some(mmap), Some(madvice)) = (&mmap_opt, madvice_opt) {
|
||||
let _ = mmap.advise(madvice);
|
||||
}
|
||||
|
||||
Ok(mmap_opt)
|
||||
}
|
||||
|
||||
@@ -83,18 +79,25 @@ pub struct CacheInfo {
|
||||
struct MmapCache {
|
||||
counters: CacheCounters,
|
||||
cache: HashMap<PathBuf, WeakArcBytes>,
|
||||
#[cfg(unix)]
|
||||
madvice_opt: Option<Advice>,
|
||||
}
|
||||
|
||||
impl MmapCache {
|
||||
fn new(madvice_opt: Option<Advice>) -> MmapCache {
|
||||
fn new() -> MmapCache {
|
||||
MmapCache {
|
||||
counters: CacheCounters::default(),
|
||||
cache: HashMap::default(),
|
||||
madvice_opt,
|
||||
#[cfg(unix)]
|
||||
madvice_opt: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
fn set_advice(&mut self, madvice: Advice) {
|
||||
self.madvice_opt = Some(madvice);
|
||||
}
|
||||
|
||||
fn get_info(&self) -> CacheInfo {
|
||||
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
|
||||
CacheInfo {
|
||||
@@ -115,6 +118,16 @@ impl MmapCache {
|
||||
}
|
||||
}
|
||||
|
||||
fn open_mmap_impl(&self, full_path: &Path) -> Result<Option<Mmap>, OpenReadError> {
|
||||
let mmap_opt = open_mmap(full_path)?;
|
||||
#[cfg(unix)]
|
||||
if let (Some(mmap), Some(madvice)) = (mmap_opt.as_ref(), self.madvice_opt) {
|
||||
// We ignore madvise errors.
|
||||
let _ = mmap.advise(madvice);
|
||||
}
|
||||
Ok(mmap_opt)
|
||||
}
|
||||
|
||||
// Returns None if the file exists but as a len of 0 (and hence is not mmappable).
|
||||
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<ArcBytes>, OpenReadError> {
|
||||
if let Some(mmap_weak) = self.cache.get(full_path) {
|
||||
@@ -125,7 +138,7 @@ impl MmapCache {
|
||||
}
|
||||
self.cache.remove(full_path);
|
||||
self.counters.miss += 1;
|
||||
let mmap_opt = open_mmap(full_path, self.madvice_opt)?;
|
||||
let mmap_opt = self.open_mmap_impl(full_path)?;
|
||||
Ok(mmap_opt.map(|mmap| {
|
||||
let mmap_arc: ArcBytes = Arc::new(mmap);
|
||||
let mmap_weak = Arc::downgrade(&mmap_arc);
|
||||
@@ -160,13 +173,9 @@ struct MmapDirectoryInner {
|
||||
}
|
||||
|
||||
impl MmapDirectoryInner {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> MmapDirectoryInner {
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
|
||||
MmapDirectoryInner {
|
||||
mmap_cache: RwLock::new(MmapCache::new(madvice_opt)),
|
||||
mmap_cache: RwLock::new(MmapCache::new()),
|
||||
_temp_directory: temp_directory,
|
||||
watcher: FileWatcher::new(&root_path.join(*META_FILEPATH)),
|
||||
root_path,
|
||||
@@ -185,12 +194,8 @@ impl fmt::Debug for MmapDirectory {
|
||||
}
|
||||
|
||||
impl MmapDirectory {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> MmapDirectory {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory, madvice_opt);
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectory {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory);
|
||||
MmapDirectory {
|
||||
inner: Arc::new(inner),
|
||||
}
|
||||
@@ -206,29 +211,33 @@ impl MmapDirectory {
|
||||
Ok(MmapDirectory::new(
|
||||
tempdir.path().to_path_buf(),
|
||||
Some(tempdir),
|
||||
None,
|
||||
))
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory, with a given access pattern.
|
||||
///
|
||||
/// This is only supported on unix platforms.
|
||||
#[cfg(unix)]
|
||||
pub fn open_with_madvice(
|
||||
directory_path: impl AsRef<Path>,
|
||||
madvice: Advice,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let dir = Self::open_impl_to_avoid_monomorphization(directory_path.as_ref())?;
|
||||
dir.inner.mmap_cache.write().unwrap().set_advice(madvice);
|
||||
Ok(dir)
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory.
|
||||
///
|
||||
/// Returns an error if the `directory_path` does not
|
||||
/// exist or if it is not a directory.
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_with_access_pattern_impl(directory_path.as_ref(), None)
|
||||
pub fn open(directory_path: impl AsRef<Path>) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_impl_to_avoid_monomorphization(directory_path.as_ref())
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory, with a given access pattern.
|
||||
pub fn open_with_madvice<P: AsRef<Path>>(
|
||||
directory_path: P,
|
||||
madvice: Advice,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_with_access_pattern_impl(directory_path.as_ref(), Some(madvice))
|
||||
}
|
||||
|
||||
fn open_with_access_pattern_impl(
|
||||
#[inline(never)]
|
||||
fn open_impl_to_avoid_monomorphization(
|
||||
directory_path: &Path,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
if !directory_path.exists() {
|
||||
return Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
|
||||
@@ -256,7 +265,7 @@ impl MmapDirectory {
|
||||
directory_path,
|
||||
)));
|
||||
}
|
||||
Ok(MmapDirectory::new(canonical_path, None, madvice_opt))
|
||||
Ok(MmapDirectory::new(canonical_path, None))
|
||||
}
|
||||
|
||||
/// Joins a relative_path to the directory `root_path`
|
||||
@@ -365,7 +374,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
|
||||
}
|
||||
|
||||
impl Directory for MmapDirectory {
|
||||
fn get_file_handle(&self, path: &Path) -> result::Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
@@ -388,7 +397,7 @@ impl Directory for MmapDirectory {
|
||||
|
||||
/// Any entry associated with the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
|
||||
let full_path = self.resolve_path(path);
|
||||
fs::remove_file(full_path).map_err(|e| {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
|
||||
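The `MmapDirectory` changes in this compare move the madvise handling into the mmap cache and expose `open_with_madvice` only on unix. A hedged sketch of opening an index with an explicit access-pattern hint; `open_with_madvice` and the `Advice` re-export are taken from this diff, while `Index::open` accepting a directory is assumed standard tantivy API.

```rust
#[cfg(unix)]
fn open_index_with_random_access(path: &std::path::Path) -> tantivy::Result<tantivy::Index> {
    use tantivy::directory::MmapDirectory;
    use tantivy::Advice;

    // Advise the kernel that page access will be random
    // (useful for point lookups on large, cold indexes).
    let dir = MmapDirectory::open_with_madvice(path, Advice::Random)?;
    tantivy::Index::open(dir)
}
```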
@@ -5,7 +5,6 @@ use std::sync::{Arc, RwLock};
use std::{fmt, result};

use common::HasLen;
use fail::fail_point;

use super::FileHandle;
use crate::core::META_FILEPATH;
@@ -184,7 +183,7 @@ impl Directory for RamDirectory {
}

fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
    fail_point!("RamDirectory::delete", |_| {
    crate::fail_point!("RamDirectory::delete", |_| {
        Err(DeleteError::IoError {
            io_error: Arc::new(io::Error::from(io::ErrorKind::Other)),
            filepath: path.to_path_buf(),
@@ -6,7 +6,6 @@ use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};

use fail::fail_point;
use rayon::{ThreadPool, ThreadPoolBuilder};

use super::segment_manager::SegmentManager;
@@ -43,7 +42,7 @@ pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate:
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
crate::fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
    std::io::Error::new(
        std::io::ErrorKind::Other,
        msg.unwrap_or_else(|| "Undefined".to_string())
@@ -15,7 +15,7 @@ use crate::postings::{
|
||||
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
|
||||
use crate::{DocId, Document, Opstamp, SegmentComponent};
|
||||
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
|
||||
|
||||
/// Computes the initial size of the hash table.
|
||||
///
|
||||
@@ -98,14 +98,18 @@ impl SegmentWriter {
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
text_options
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
tokenizer_manager.get(tokenizer_name)
|
||||
})
|
||||
.unwrap_or_default()
|
||||
let tokenizer_name = text_options
|
||||
.map(|text_index_option| text_index_option.tokenizer())
|
||||
.unwrap_or("default");
|
||||
|
||||
tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
|
||||
TantivyError::SchemaError(format!(
|
||||
"Error getting tokenizer for field: {}",
|
||||
field_entry.name()
|
||||
))
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
Ok(SegmentWriter {
|
||||
max_doc: 0,
|
||||
ctx: IndexingContext::new(table_size),
|
||||
@@ -205,7 +209,7 @@ impl SegmentWriter {
|
||||
for value in values {
|
||||
let mut token_stream = match value {
|
||||
Value::PreTokStr(tok_str) => {
|
||||
PreTokenizedStream::from(tok_str.clone()).into()
|
||||
Box::new(PreTokenizedStream::from(tok_str.clone()))
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
let text_analyzer =
|
||||
@@ -438,7 +442,9 @@ fn remap_and_write(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use tempfile::TempDir;
|
||||
|
||||
use super::compute_initial_table_size;
|
||||
use crate::collector::Count;
|
||||
@@ -446,7 +452,9 @@ mod tests {
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::query::PhraseQuery;
|
||||
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
|
||||
use crate::schema::{
|
||||
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
|
||||
};
|
||||
use crate::store::{Compressor, StoreReader, StoreWriter};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
@@ -900,4 +908,32 @@ mod tests {
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(positions, &[4]); //< as opposed to 3 if we had a position length of 1.
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_show_error_when_tokenizer_not_registered() {
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("custom_en")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let index = Index::open_in_dir(tempdir_path).unwrap();
|
||||
let schema = index.schema();
|
||||
let mut index_writer = index.writer(50_000_000).unwrap();
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let mut document = Document::default();
|
||||
document.add_text(title, "The Old Man and the Sea");
|
||||
index_writer.add_document(document).unwrap();
|
||||
let error = index_writer.commit().unwrap_err();
|
||||
assert_eq!(
|
||||
error.to_string(),
|
||||
"Schema error: 'Error getting tokenizer for field: title'"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,6 +101,7 @@ mod test {

    use super::Stamper;

    #[allow(clippy::redundant_clone)]
    #[test]
    fn test_stamper() {
        let stamper = Stamper::new(7u64);
@@ -116,6 +117,7 @@ mod test {
        assert_eq!(stamper.stamp(), 15u64);
    }

    #[allow(clippy::redundant_clone)]
    #[test]
    fn test_stamper_revert() {
        let stamper = Stamper::new(7u64);
120 src/lib.rs
@@ -191,6 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 5;
|
||||
|
||||
#[cfg(unix)]
|
||||
pub use memmap2::Advice;
|
||||
|
||||
/// Structure version for the index.
|
||||
@@ -298,9 +299,39 @@ pub struct DocAddress {
|
||||
pub doc_id: DocId,
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
/// Enable fail_point if feature is enabled.
|
||||
macro_rules! fail_point {
|
||||
($name:expr) => {{
|
||||
#[cfg(feature = "failpoints")]
|
||||
{
|
||||
fail::eval($name, |_| {
|
||||
panic!("Return is not supported for the fail point \"{}\"", $name);
|
||||
});
|
||||
}
|
||||
}};
|
||||
($name:expr, $e:expr) => {{
|
||||
#[cfg(feature = "failpoints")]
|
||||
{
|
||||
if let Some(res) = fail::eval($name, $e) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}};
|
||||
($name:expr, $cond:expr, $e:expr) => {{
|
||||
#[cfg(feature = "failpoints")]
|
||||
{
|
||||
if $cond {
|
||||
fail::fail_point!($name, $e);
|
||||
}
|
||||
}
|
||||
}};
|
||||
}
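The crate-local `fail_point!` wrapper above compiles to nothing unless the `failpoints` feature is enabled, which is what allows `fail` to become an optional dependency in Cargo.toml. A hedged sketch of how a test might trigger one of the fail points referenced elsewhere in this compare; `fail::cfg`/`fail::remove` are the standard `fail` crate API, and the test body is elided.

```rust
// Sketch, assuming the crate is built with `--features failpoints`.
#[cfg(all(test, feature = "failpoints"))]
#[test]
fn save_metas_can_be_forced_to_fail() {
    // Make the "save_metas" fail point (see the segment updater diff in this
    // compare) return an error message instead of succeeding.
    fail::cfg("save_metas", "return(disk full)").unwrap();

    // ... build an index and commit; the commit should now surface the error ...

    // Clear the fail point so other tests are unaffected.
    fail::remove("save_metas");
}
```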
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
|
||||
use rand::distributions::{Bernoulli, Uniform};
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
@@ -856,6 +887,95 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_searcher_on_json_field_with_type_inference() {
|
||||
// When indexing and searching a json value, we infer its type.
|
||||
// This test aims to check that type inference is consistent between indexing and search.
|
||||
// Inference order is date, i64, u64, f64, bool.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
|
||||
r#"{
|
||||
"signed": 2,
|
||||
"float": 2.0,
|
||||
"unsigned": 10000000000000,
|
||||
"date": "1985-04-12T23:20:50.52Z",
|
||||
"bool": true
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
let doc = doc!(json_field=>json_val);
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |user_input_literal: UserInputLiteral| {
|
||||
let query_parser = crate::query::QueryParser::for_index(&index, Vec::new());
|
||||
let query = query_parser
|
||||
.build_query_from_user_input_ast(UserInputAst::from(UserInputLeaf::Literal(
|
||||
user_input_literal,
|
||||
)))
|
||||
.unwrap();
|
||||
searcher
|
||||
.search(&query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.map(|topdocs| topdocs.docs().to_vec())
|
||||
.unwrap()
|
||||
};
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.signed".to_string()),
|
||||
phrase: "2".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.float".to_string()),
|
||||
phrase: "2.0".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.date".to_string()),
|
||||
phrase: "1985-04-12T23:20:50.52Z".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.unsigned".to_string()),
|
||||
phrase: "10000000000000".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.bool".to_string()),
|
||||
phrase: "true".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_macro() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -2,7 +2,6 @@ use std::cmp::Ordering;
use std::io::{self, Write};

use common::{BinarySerializable, CountingWriter, VInt};
use fail::fail_point;

use super::TermInfo;
use crate::core::Segment;
@@ -205,7 +204,7 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
    fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
    crate::fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
        Err(io::Error::new(io::ErrorKind::Other, format!("{msg:?}")))
    });
    if self.term_open {
@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{
    BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::{DocAddress, Result, Searcher, TantivyError};

#[derive(Debug, PartialEq)]
@@ -206,8 +204,7 @@ impl MoreLikeThis {
for value in values {
    match value {
        Value::PreTokStr(tok_str) => {
            let mut token_stream: BoxTokenStream =
                PreTokenizedStream::from(tok_str.clone()).into();
            let mut token_stream = PreTokenizedStream::from(tok_str.clone());
            token_stream.process(&mut |token| {
                if !self.is_noise_word(token.text.clone()) {
                    let term = Term::from_field_text(field, &token.text);
@@ -1203,7 +1203,7 @@ mod test {
|
||||
fn test_json_field_possibly_a_number() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:5",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=U64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
|
||||
r#"(Term(field=14, type=Json, path=titi, type=I64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
@@ -1211,6 +1211,11 @@ mod test {
|
||||
r#"(Term(field=14, type=Json, path=titi, type=I64, -5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-".
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:10000000000000000000",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=U64, 10000000000000000000) Term(field=14, type=Json, path=titi, type=Str, "10000000000000000000"))"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:-5.2",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=F64, -5.2) "[(0, Term(field=14, type=Json, path=titi, type=Str, "5")), (1, Term(field=14, type=Json, path=titi, type=Str, "2"))]")"#,
|
||||
@@ -1260,7 +1265,7 @@ mod test {
|
||||
fn test_json_default() {
|
||||
test_query_to_logical_ast_with_default_json(
|
||||
"titi:4",
|
||||
"(Term(field=14, type=Json, path=titi, type=U64, 4) Term(field=14, type=Json, \
|
||||
"(Term(field=14, type=Json, path=titi, type=I64, 4) Term(field=14, type=Json, \
|
||||
path=titi, type=Str, \"4\"))",
|
||||
false,
|
||||
);
|
||||
@@ -1282,7 +1287,7 @@ mod test {
|
||||
for conjunction in [false, true] {
|
||||
test_query_to_logical_ast_with_default_json(
|
||||
"json:4",
|
||||
r#"(Term(field=14, type=Json, path=, type=U64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
|
||||
r#"(Term(field=14, type=Json, path=, type=I64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
|
||||
conjunction,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -139,7 +139,7 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;

pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
pub use tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};

pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::TextAnalyzer;
pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;
@@ -1,36 +1,105 @@
use dyn_clone::DynClone;
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
use tokenizer_api::{TokenFilter, TokenStream, Tokenizer};

use crate::tokenizer::empty_tokenizer::EmptyTokenizer;

/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
#[derive(Clone)]
pub struct TextAnalyzer {
    tokenizer: Box<dyn BoxableTokenizer>,
}

/// A boxable `Tokenizer`, with its `TokenStream` type erased.
trait BoxableTokenizer: 'static + Send + Sync {
trait BoxableTokenizer: 'static + Send + Sync + DynClone {
    /// Creates a boxed token stream for a given `str`.
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
    /// Clone this tokenizer.
    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a>;
}

impl<T: Tokenizer> BoxableTokenizer for T {
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        self.token_stream(text).into()
    }
    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
        Box::new(self.clone())
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        Box::new(self.token_stream(text))
    }
}

impl Clone for TextAnalyzer {
    fn clone(&self) -> Self {
        TextAnalyzer {
            tokenizer: self.tokenizer.box_clone(),
dyn_clone::clone_trait_object!(BoxableTokenizer);
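The hunk above replaces the hand-written `box_clone` method with the `dyn-clone` crate: `BoxableTokenizer` gains `DynClone` as a supertrait, and `clone_trait_object!` generates `Clone` for `Box<dyn BoxableTokenizer>`, which is what lets `TextAnalyzer` simply `#[derive(Clone)]`. A standalone sketch of that pattern, with simplified names (this is not tantivy code, and it assumes `dyn-clone` is a dependency, as the new import suggests):

```rust
use dyn_clone::DynClone;

/// Every `Animal` must be clonable; `DynClone` makes that requirement object-safe.
trait Animal: DynClone {
    fn speak(&self) -> String;
}

// Generates `impl Clone for Box<dyn Animal>`, delegating to the concrete type's `Clone`.
dyn_clone::clone_trait_object!(Animal);

#[derive(Clone)]
struct Dog;

impl Animal for Dog {
    fn speak(&self) -> String {
        "woof".to_string()
    }
}

/// Owning a boxed trait object no longer prevents `#[derive(Clone)]`.
#[derive(Clone)]
struct Zoo {
    star: Box<dyn Animal>,
}

fn main() {
    let zoo = Zoo { star: Box::new(Dog) };
    let copy = zoo.clone();
    assert_eq!(copy.star.speak(), "woof");
}
```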
/// A boxed `BoxableTokenizer` which is a `Tokenizer` with its `TokenStream` type erased.
#[derive(Clone)]
struct BoxTokenizer(Box<dyn BoxableTokenizer>);

impl Tokenizer for BoxTokenizer {
    type TokenStream<'a> = Box<dyn TokenStream + 'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        self.0.box_token_stream(text).into()
    }
}

/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
trait BoxableTokenFilter: 'static + Send + Sync {
    /// Wraps a `BoxedTokenizer` and returns a new one.
    fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer;
}

impl<T: TokenFilter> BoxableTokenFilter for T {
    fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer {
        let tokenizer = self.clone().transform(tokenizer);
        BoxTokenizer(Box::new(tokenizer))
    }
}

/// A boxed `BoxableTokenFilter` which is a `TokenFilter` with its `Tokenizer` type erased.
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);

impl<T: TokenFilter> From<T> for BoxTokenFilter {
    fn from(tokenizer: T) -> BoxTokenFilter {
        BoxTokenFilter(Box::new(tokenizer))
    }
}
impl TextAnalyzer {
    /// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
    ///
    /// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
    /// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
    /// will be more performant and create less boxes.
    ///
    /// # Example
    ///
    /// ```rust
    /// use tantivy::tokenizer::*;
    ///
    /// let en_stem = TextAnalyzer::build(
    ///     SimpleTokenizer::default(),
    ///     vec![
    ///        BoxTokenFilter::from(RemoveLongFilter::limit(40)),
    ///        BoxTokenFilter::from(LowerCaser),
    ///        BoxTokenFilter::from(Stemmer::default()),
    ///     ]);
    /// ```
    pub fn build<T: Tokenizer>(
        tokenizer: T,
        boxed_token_filters: Vec<BoxTokenFilter>,
    ) -> TextAnalyzer {
        let mut boxed_tokenizer = BoxTokenizer(Box::new(tokenizer));
        for filter in boxed_token_filters.into_iter() {
            boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
        }
        TextAnalyzer {
            tokenizer: boxed_tokenizer.0,
        }
    }

    /// Create a new TextAnalyzerBuilder
    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
        TextAnalyzerBuilder { tokenizer }
    }

    /// Creates a token stream for a given `str`.
    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        self.tokenizer.box_token_stream(text)
    }
}
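The doc comment above recommends the builder over `build` because each `BoxTokenFilter` application re-boxes the tokenizer in `box_transform`, whereas the builder composes the filter chain statically and only boxes the composed tokenizer once at the end. A sketch of the builder form of the same `en_stem` chain, assuming the tokenizer and filters from the doc example are in scope via `tantivy::tokenizer`:

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer};

fn make_en_stem_analyzer() -> TextAnalyzer {
    // Each `.filter(...)` wraps the tokenizer type statically; only the final
    // `.build()` erases the composed type into a `TextAnalyzer`.
    TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(Stemmer::default())
        .build()
}

fn main() {
    let mut en_stem = make_en_stem_analyzer();
    let mut stream = en_stem.token_stream("Searching");
    assert_eq!(stream.next().unwrap().text, "search");
}
```

The boxed `build` path remains useful when the set of filters is only known at runtime, as in the `test_text_analyzer_with_filters_boxed` test below.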
@@ -46,20 +115,8 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
    }
}

impl TextAnalyzer {
    /// Create a new TextAnalyzerBuilder
    pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
        TextAnalyzerBuilder { tokenizer }
    }

    /// Creates a token stream for a given `str`.
    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        self.tokenizer.box_token_stream(text)
    }
}

/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T> {
pub struct TextAnalyzerBuilder<T: Tokenizer> {
    tokenizer: T,
}

@@ -90,3 +147,37 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
        }
    }
}

#[cfg(test)]
mod tests {

    use super::*;
    use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};

    #[test]
    fn test_text_analyzer_builder() {
        let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
            .filter(AlphaNumOnlyFilter)
            .filter(RemoveLongFilter::limit(6))
            .filter(LowerCaser)
            .build();
        let mut stream = analyzer.token_stream("- first bullet point");
        assert_eq!(stream.next().unwrap().text, "first");
        assert_eq!(stream.next().unwrap().text, "point");
    }

    #[test]
    fn test_text_analyzer_with_filters_boxed() {
        let mut analyzer = TextAnalyzer::build(
            WhitespaceTokenizer::default(),
            vec![
                BoxTokenFilter::from(AlphaNumOnlyFilter),
                BoxTokenFilter::from(LowerCaser),
                BoxTokenFilter::from(RemoveLongFilter::limit(6)),
            ],
        );
        let mut stream = analyzer.token_stream("- first bullet point");
        assert_eq!(stream.next().unwrap().text, "first");
        assert_eq!(stream.next().unwrap().text, "point");
    }
}
@@ -7,7 +7,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
keywords = ["search", "information", "retrieval", "sstable"]
categories = ["database-implementations", "data-structures", "compression"]
desciption = "sstables for tantivy"
description = "sstables for tantivy"

[dependencies]
common = {version= "0.5", path="../common", package="tantivy-common"}
@@ -44,7 +44,7 @@ pub fn fast_short_slice_copy(src: &[u8], dst: &mut [u8]) {
        return;
    }

    /// The code will use the vmovdqu instruction to copy 32 bytes at a time.
    // The code will use the vmovdqu instruction to copy 32 bytes at a time.
    #[cfg(target_feature = "avx")]
    {
        if len <= 64 {
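A minimal sketch (not the tantivy implementation) of the overlapping-copy idea that a `len <= 64` branch like the one above typically relies on: two fixed 32-byte block copies, the second anchored to the end of the slice, cover every length from 33 to 64, and with AVX enabled each block copy can lower to a single unaligned `vmovdqu` load/store.

```rust
/// Copies `src` into `dst` for lengths in 33..=64 using two possibly overlapping
/// 32-byte block copies.
fn copy_33_to_64(src: &[u8], dst: &mut [u8]) {
    let len = src.len();
    debug_assert_eq!(len, dst.len());
    debug_assert!((33..=64).contains(&len));
    dst[..32].copy_from_slice(&src[..32]);
    // The second copy starts 32 bytes from the end and overlaps the first when len < 64.
    dst[len - 32..].copy_from_slice(&src[len - 32..]);
}

fn main() {
    let src: Vec<u8> = (0..50).collect();
    let mut dst = vec![0u8; 50];
    copy_33_to_64(&src, &mut dst);
    assert_eq!(src, dst);
}
```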
@@ -6,7 +6,6 @@
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.

use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};

use serde::{Deserialize, Serialize};

@@ -60,30 +59,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
}

/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);

impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
    fn from(token_stream: T) -> BoxTokenStream<'a> {
        BoxTokenStream(Box::new(token_stream))
    }
}

impl<'a> Deref for BoxTokenStream<'a> {
    type Target = dyn TokenStream + 'a;

    fn deref(&self) -> &Self::Target {
        &*self.0
    }
}
impl<'a> DerefMut for BoxTokenStream<'a> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut *self.0
    }
}

impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
    fn advance(&mut self) -> bool {
        let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -137,7 +112,7 @@ pub trait TokenStream {
}

/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync {
pub trait TokenFilter: 'static + Send + Sync + Clone {
    /// The Tokenizer type returned by this filter, typically parametrized by the underlying
    /// Tokenizer.
    type Tokenizer<T: Tokenizer>: Tokenizer;
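The reworked `TokenFilter` trait is now `Clone` and exposes the tokenizer it produces through the `Tokenizer<T>` generic associated type. A hedged sketch of what an implementation might look like: a pass-through filter that wraps a tokenizer and forwards its stream unchanged. The `transform` signature is assumed from its call site in `box_transform` above and may differ in detail; `PassThroughFilter` and `PassThroughTokenizer` are illustrative names, not part of the diff.

```rust
use tantivy::tokenizer::{TokenFilter, TokenStream, Tokenizer, WhitespaceTokenizer};

/// A filter that does nothing; it exists only to show the shape of the new trait.
#[derive(Clone)]
struct PassThroughFilter;

/// The tokenizer type produced by `PassThroughFilter::transform`.
#[derive(Clone)]
struct PassThroughTokenizer<T>(T);

impl TokenFilter for PassThroughFilter {
    type Tokenizer<T: Tokenizer> = PassThroughTokenizer<T>;

    // Signature assumed from the `self.clone().transform(tokenizer)` call site above.
    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
        PassThroughTokenizer(tokenizer)
    }
}

impl<T: Tokenizer> Tokenizer for PassThroughTokenizer<T> {
    type TokenStream<'a> = T::TokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        // No filtering: delegate to the wrapped tokenizer.
        self.0.token_stream(text)
    }
}

fn main() {
    let mut tokenizer = PassThroughFilter.transform(WhitespaceTokenizer::default());
    let mut stream = tokenizer.token_stream("hello world");
    assert_eq!(stream.next().unwrap().text, "hello");
    assert_eq!(stream.next().unwrap().text, "world");
}
```

Real filters such as `LowerCaser` or `RemoveLongFilter` follow the same wrapper shape, with the wrapping tokenizer additionally transforming the tokens it emits.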