Compare commits

44 Commits

Author SHA1 Message Date
Paul Masurel
a106e71b9b Adapted benchmark to the analyzer's Iterator<Item=Token> form 2021-01-06 22:26:58 +09:00
Paul Masurel
ee8e61a062 Wrapping stemmer into an Arc 2021-01-06 20:20:08 +09:00
dcraven
da023f33c0 Fix imports 2021-01-06 19:20:21 +09:00
dcraven
de5a8bfab3 Remove unused imports. 2021-01-06 19:20:21 +09:00
dcraven
0356b7d779 Remove patched rust-stemmers from Cargo.toml 2021-01-06 19:20:21 +09:00
dcraven
50812d0081 Remove TokenStream trait. 2021-01-06 19:20:21 +09:00
dcraven
4a68c8a712 Reorder for more linear reading. 2021-01-06 19:20:21 +09:00
dcraven
ca6fd5effc Fix bug. Cleanup some rough spots. Renamed functions. Fixed tests and docs. 2021-01-06 19:20:21 +09:00
dcraven
4e6b341422 Tests compile. 2021-01-06 19:20:21 +09:00
dcraven
a56330b234 Fix generic types in TokenChainIterator. Fix filter implementations. 2021-01-06 19:20:21 +09:00
dcraven
6af6c11ec2 Formulate more as iterators. 2021-01-06 19:20:21 +09:00
dcraven
9633d2e657 Small changes. 2021-01-06 19:20:21 +09:00
dcraven
39e8739ea5 Reformulate as Iterators, Checkpoint 2. Finished, now bubble up changes. 2021-01-06 19:20:21 +09:00
dcraven
801c82a5e1 Formulate as Iterators, Checkpoint 1. 2021-01-06 19:20:21 +09:00
dcraven
ccd0f3ccc9 Checkpoint converting to Iterators and static dispatch. 2021-01-06 19:20:21 +09:00
dcraven
f1973759ef Remove forgotten code. 2021-01-06 19:20:21 +09:00
dcraven
2bd1d8230c Remove unnecessary lifetime. 2021-01-06 19:20:21 +09:00
dcraven
c7407cc2a7 Simplify control flow. 2021-01-06 19:20:21 +09:00
dcraven
0150b406c5 Remove BoxTokenFilter. 2021-01-06 19:20:21 +09:00
dcraven
f73209a868 Reduced number of allocations. 2021-01-06 19:20:21 +09:00
dcraven
09375decc2 Removed unnecessary lifetimes. 2021-01-06 19:20:21 +09:00
dcraven
b8c7f1fe9c Removed unnecessary trait impls 2021-01-06 19:20:21 +09:00
Paul Masurel
2f14a892ca added a simple bench for the default analyzer 2021-01-06 19:11:26 +09:00
Paul Masurel
9c3cabce40 Updated version of the rand crate. 2021-01-06 18:09:00 +09:00
Paul Masurel
f8d71c2b10 Merge pull request #964 from mosuka/deserializable
Make NamedFieldDocument deserializable
2021-01-06 17:43:53 +09:00
Paul Masurel
394dfb24f1 Merge pull request #965 from lewisdiamond/patch-1
Fix spelling
2021-01-06 13:38:31 +09:00
Lewis Diamond
b0549a229d Fix spelling 2021-01-05 22:34:56 -05:00
Minoru Osuka
670b6eaff6 Make NamedFieldDocument deserializable 2020-12-21 16:51:31 +09:00
Paul Masurel
a4f33d3823 Added comment to f64 conversion to u64.
- Added proptest
- Added comment to Lemire blog post.
2020-12-15 13:40:31 +09:00
Paul Masurel
c7841e3da5 Merge pull request #953 from barrotsteindev/filter-collector-tpredicatevalue
Generic filter collector
2020-12-14 10:35:46 +09:00
barrotsteindev
e7b4a12bba cargo fmt 2020-12-10 14:10:55 +02:00
barrotsteindev
0aaa929d6e Merge branch 'main' into filter-collector-tpredicatevalue 2020-12-10 11:27:19 +02:00
barrotsteindev
1112797c18 added a line to CHANGELOG.md 2020-12-10 11:25:08 +02:00
barrotsteindev
920481e1c1 change unit test 2020-12-10 11:24:53 +02:00
Paul Masurel
55f7b84966 Merge pull request #952 from tantivy-search/bm25-on-onebyte
Encode blockwand on a single byte.
2020-12-10 18:09:31 +09:00
Paul Masurel
09ab4df1fe Encode blockwand on a single byte. 2020-12-10 18:08:52 +09:00
barrotsteindev
0c2cf81b37 cargo fmt 2020-12-10 11:08:35 +02:00
barrotsteindev
d864430bda final edits 2020-12-10 11:08:15 +02:00
Paul Masurel
de60540e06 fixing compilation 2020-12-10 10:36:21 +02:00
Paul Masurel
c3e311e6b8 Removed 'static in compression_lz4. 2020-12-09 15:30:52 +09:00
barrotsteindev
ac704f2f22 WIP generic filter collector 2020-12-08 14:36:52 +02:00
Paul Masurel
be626083a0 Reorganized and added termdict unit tests. 2020-12-07 12:50:36 +09:00
Paul Masurel
b68fcca1e0 Minor changes
- Open{Write,Read}Error::wrap_io_error made public
- Arc<PathBuf> -> Arc<Path> in file_watcher.
2020-12-03 23:31:50 +09:00
Paul Masurel
af6dfa1856 Small refactoring 2020-12-03 14:27:05 +09:00
62 changed files with 4895 additions and 2954 deletions

View File

@@ -9,6 +9,10 @@ Tantivy 0.14.0
- Bugfix in `Query::explain`
- Removed dependency on `notify` #924. Replaced with `FileWatcher` struct that polls meta file every 500ms in background thread. (@halvorboe @guilload)
- Added `FilterCollector`, which wraps another collector and filters docs using a predicate over a fast field (@barrotsteindev)
- Simplified the encoding of the skip reader struct. BlockWAND max tf is now encoded over a single byte. (@pmasurel)
- `FilterCollector` now supports all Fast Field value types (@barrotsteindev)
This version breaks compatibility and requires users to reindex everything.
Tantivy 0.13.2
===================

View File

@@ -26,7 +26,6 @@ snap = "1"
tempfile = {version="3", optional=true}
log = "0.4"
serde = {version="1", features=["derive"]}
serde_cbor = "0.11"
serde_json = "1"
num_cpus = "1"
fs2={version="0.4", optional=true}
@@ -54,10 +53,11 @@ lru = "0.6"
winapi = "0.3"
[dev-dependencies]
rand = "0.7"
rand = "0.8"
maplit = "1"
matches = "0.1.8"
proptest = "0.10"
criterion = "0.3"
[dev-dependencies.fail]
version = "0.4"
@@ -98,3 +98,7 @@ travis-ci = { repository = "tantivy-search/tantivy" }
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["fail/failpoints"]
[[bench]]
name = "analyzer"
harness = false

benches/alice.txt (new file, 3774 lines)

File diff suppressed because it is too large.

benches/analyzer.rs (new file, 22 lines)
View File

@@ -0,0 +1,22 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
const ALICE_TXT: &'static str = include_str!("alice.txt");
pub fn criterion_benchmark(c: &mut Criterion) {
let tokenizer_manager = TokenizerManager::default();
let tokenizer = tokenizer_manager.get("default").unwrap();
c.bench_function("default-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = tokenizer.token_stream(ALICE_TXT);
for token in token_stream {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
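Note on the API shape this benchmark relies on: in this branch `token_stream` hands back a plain `Iterator<Item = Token>` rather than the old `TokenStream` trait with its `advance()`/`token()` calls, so downstream code can use ordinary iterator adapters. A minimal sketch under that assumption (the helper name is made up for illustration):

use tantivy::tokenizer::{Token, TokenizerManager};

// Hypothetical helper: works on anything that yields tokens; no manual
// advance()/token() loop is required.
fn count_long_tokens(tokens: impl Iterator<Item = Token>, min_len: usize) -> usize {
    tokens.filter(|t| t.text.len() >= min_len).count()
}

fn main() {
    let tokenizer = TokenizerManager::default().get("default").unwrap();
    let n = count_long_tokens(
        tokenizer.token_stream("Hello from the token iterator refactor"),
        5,
    );
    println!("{} tokens with at least 5 characters", n);
}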

View File

@@ -17,12 +17,7 @@ use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> {
let mut token_stream = SimpleTokenizer.token_stream(text);
let mut tokens = vec![];
while token_stream.advance() {
tokens.push(token_stream.token().clone());
}
tokens
SimpleTokenizer.token_stream(text).collect()
}
fn main() -> tantivy::Result<()> {

View File

@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;

View File

@@ -50,12 +50,13 @@ fn main() -> tantivy::Result<()> {
// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(LowerCaser)
let tokenizer = analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
]))
.build();
index.tokenizers().register("stoppy", tokenizer);

View File

@@ -9,8 +9,10 @@
// ---
// Importing tantivy...
use std::marker::PhantomData;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastFieldReader;
use crate::fastfield::{FastFieldReader, FastValue};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -41,78 +43,104 @@ use crate::{Score, SegmentReader, TantivyError};
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let no_filter_collector = FilterCollector::new(price, &|value| value > 20_120u64, TopDocs::with_limit(2));
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress(0, 1));
///
/// let filter_all_collector = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
///
/// assert_eq!(filtered_top_docs.len(), 0);
/// ```
pub struct FilterCollector<TCollector, TPredicate>
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
where
TPredicate: 'static,
{
field: Field,
collector: TCollector,
predicate: &'static TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate> FilterCollector<TCollector, TPredicate>
impl<TCollector, TPredicate, TPredicateValue: FastValue>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(u64) -> bool + Send + Sync,
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync,
{
/// Create a new FilterCollector.
pub fn new(
field: Field,
predicate: &'static TPredicate,
collector: TCollector,
) -> FilterCollector<TCollector, TPredicate> {
) -> FilterCollector<TCollector, TPredicate, TPredicateValue> {
FilterCollector {
field,
predicate,
collector,
t_predicate_value: PhantomData,
}
}
}
impl<TCollector, TPredicate> Collector for FilterCollector<TCollector, TPredicate>
impl<TCollector, TPredicate, TPredicateValue: FastValue> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: 'static + Fn(u64) -> bool + Send + Sync,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
TPredicateValue: 'static + FastValue,
{
// That's the type of our result.
// Our standard deviation will be a float.
type Fruit = TCollector::Fruit;
type Child = FilterSegmentCollector<TCollector::Child, TPredicate>;
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate>> {
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>> {
let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
let requested_type = TPredicateValue::to_type();
let field_schema_type = field_entry.field_type().value_type();
if requested_type != field_schema_type {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}!={:?}",
field_entry.name(),
requested_type,
field_schema_type
)));
}
let fast_field_reader = segment_reader
.fast_fields()
.u64(self.field)
.typed_fast_field_reader(self.field)
.ok_or_else(|| {
let field_name = segment_reader.schema().get_field_name(self.field);
TantivyError::SchemaError(format!(
"Field {:?} is not a u64 fast field.",
field_name
"{:?} is not declared as a fast field in the schema.",
self.field
))
})?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(FilterSegmentCollector {
fast_field_reader,
segment_collector,
predicate: self.predicate,
t_predicate_value: PhantomData,
})
}
@@ -128,20 +156,23 @@ where
}
}
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate>
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
TPredicateValue: 'static + FastValue,
{
fast_field_reader: FastFieldReader<u64>,
fast_field_reader: FastFieldReader<TPredicateValue>,
segment_collector: TSegmentCollector,
predicate: &'static TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TSegmentCollector, TPredicate> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate>
impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(u64) -> bool + Send + Sync,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
TPredicateValue: 'static + FastValue,
{
type Fruit = TSegmentCollector::Fruit;

View File

@@ -8,6 +8,13 @@ use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::collector::{FilterCollector, TopDocs};
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, TEXT};
use crate::DateTime;
use crate::{doc, Index};
use std::str::FromStr;
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
compute_score: true,
};
@@ -16,6 +23,54 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
compute_score: true,
};
#[test]
pub fn test_filter_collector() {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let price = schema_builder.add_u64_field("price", FAST);
let date = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary").unwrap();
let filter_some_collector = FilterCollector::new(
price,
&|value: u64| value > 20_120u64,
TopDocs::with_limit(2),
);
let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress(0, 1));
let filter_all_collector: FilterCollector<_, _, u64> =
FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
assert_eq!(filtered_top_docs.len(), 0);
fn date_filter(value: DateTime) -> bool {
(value - DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()).num_weeks() > 0
}
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();
assert_eq!(filtered_date_docs.len(), 2);
}
/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in pr

View File

@@ -115,11 +115,16 @@ pub fn u64_to_i64(val: u64) -> i64 {
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` so that lexical order is preserved.
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`)
/// which would truncate the result
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline(always)]
@@ -148,6 +153,7 @@ pub(crate) mod test {
pub use super::minmax;
pub use super::serialize::test::fixed_size_test;
use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use proptest::prelude::*;
use std::f64;
fn test_i64_converter_helper(val: i64) {
@@ -158,6 +164,15 @@ pub(crate) mod test {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
let left_u64 = f64_to_u64(left);
let right_u64 = f64_to_u64(right);
assert_eq!(left_u64 < right_u64, left < right);
}
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
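For reference, the order-preserving `f64` to `u64` mapping documented above is conventionally implemented with two bit tricks on the IEEE-754 representation, which is also the scheme described in the linked Lemire post: flip only the sign bit for non-negative values, flip every bit for negative ones. A minimal sketch (hypothetical name, so it does not clash with the crate's own `f64_to_u64`, whose exact body may differ in detail):

// Monotone f64 -> u64 sketch. Non-negative floats already sort correctly by
// their raw bits once the sign bit is flipped to lift them above the negative
// range; negative floats sort in reverse bit order, so inverting all bits
// restores ascending order.
fn f64_to_u64_sketch(val: f64) -> u64 {
    let bits = val.to_bits();
    if bits >> 63 == 1 {
        !bits // negative: invert everything
    } else {
        bits ^ (1u64 << 63) // positive or +0.0: flip the sign bit
    }
}

fn main() {
    let values = [f64::MIN, -1.5, -0.0, 0.0, 0.5, 2.0, f64::MAX];
    let mapped: Vec<u64> = values.iter().copied().map(f64_to_u64_sketch).collect();
    // u64 order matches the float order, which is the property the proptest
    // added above checks on random pairs.
    assert!(mapped.windows(2).all(|w| w[0] <= w[1]));
    println!("monotone mapping holds on the sample");
}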

View File

@@ -20,7 +20,7 @@ use crate::reader::IndexReaderBuilder;
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::tokenizer::{TextAnalyzerT, TokenizerManager};
use crate::IndexWriter;
use std::collections::HashSet;
use std::fmt;
@@ -119,13 +119,12 @@ impl Index {
return Index::create(dir, schema);
}
let index = Index::open(dir)?;
if index.schema() == schema {
Ok(index)
} else {
Err(TantivyError::SchemaError(
if index.schema() != schema {
return Err(TantivyError::SchemaError(
"An index exists but the schema does not match.".to_string(),
))
));
}
Ok(index)
}
/// Creates a new index in a temp directory.
@@ -181,11 +180,11 @@ impl Index {
}
/// Helper to access the tokenizer associated to a specific field.
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> {
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<Box<dyn TextAnalyzerT>> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
let tokenizer_name_opt: Option<Box<dyn TextAnalyzerT>> = match field_type {
FieldType::Str(text_options) => text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())

View File

@@ -310,7 +310,7 @@ impl SegmentReader {
}
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
}

View File

@@ -58,7 +58,8 @@ pub enum OpenWriteError {
}
impl OpenWriteError {
pub(crate) fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
Self::IOError { io_error, filepath }
}
}
@@ -143,7 +144,8 @@ pub enum OpenReadError {
}
impl OpenReadError {
pub(crate) fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
Self::IOError { io_error, filepath }
}
}

View File

@@ -3,7 +3,7 @@ use crc32fast::Hasher;
use std::fs;
use std::io;
use std::io::BufRead;
use std::path::PathBuf;
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
@@ -13,15 +13,15 @@ pub const POLLING_INTERVAL: Duration = Duration::from_millis(if cfg!(test) { 1 }
// Watches a file and executes registered callbacks when the file is modified.
pub struct FileWatcher {
path: Arc<PathBuf>,
path: Arc<Path>,
callbacks: Arc<WatchCallbackList>,
state: Arc<AtomicUsize>, // 0: new, 1: runnable, 2: terminated
}
impl FileWatcher {
pub fn new(path: &PathBuf) -> FileWatcher {
pub fn new(path: &Path) -> FileWatcher {
FileWatcher {
path: Arc::new(path.clone()),
path: Arc::from(path),
callbacks: Default::default(),
state: Default::default(),
}
@@ -63,7 +63,7 @@ impl FileWatcher {
handle
}
fn compute_checksum(path: &PathBuf) -> Result<u32, io::Error> {
fn compute_checksum(path: &Path) -> Result<u32, io::Error> {
let reader = match fs::File::open(path) {
Ok(f) => io::BufReader::new(f),
Err(e) => {

View File

@@ -115,6 +115,18 @@ impl Footer {
}
Ok(())
}
VersionedFooter::V3 {
crc32: _crc,
store_compression,
} => {
if &library_version.store_compression != store_compression {
return Err(Incompatibility::CompressionMismatch {
library_compression_format: library_version.store_compression.to_string(),
index_compression_format: store_compression.to_string(),
});
}
Ok(())
}
VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch {
library_version: library_version.clone(),
index_version: self.version.clone(),
@@ -136,24 +148,31 @@ pub enum VersionedFooter {
crc32: CrcHashU32,
store_compression: String,
},
// Block wand max term freq on 1 byte
V3 {
crc32: CrcHashU32,
store_compression: String,
},
}
impl BinarySerializable for VersionedFooter {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
let mut buf = Vec::new();
match self {
VersionedFooter::V2 {
VersionedFooter::V3 {
crc32,
store_compression: compression,
} => {
// Serializes a valid `VersionedFooter` or panics if the version is unknown
// [ version | crc_hash | compression_mode ]
// [ 0..4 | 4..8 | variable ]
BinarySerializable::serialize(&2u32, &mut buf)?;
BinarySerializable::serialize(&3u32, &mut buf)?;
BinarySerializable::serialize(crc32, &mut buf)?;
BinarySerializable::serialize(compression, &mut buf)?;
}
VersionedFooter::V1 { .. } | VersionedFooter::UnknownVersion => {
VersionedFooter::V2 { .. }
| VersionedFooter::V1 { .. }
| VersionedFooter::UnknownVersion => {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Cannot serialize an unknown versioned footer ",
@@ -182,7 +201,7 @@ impl BinarySerializable for VersionedFooter {
reader.read_exact(&mut buf[..])?;
let mut cursor = &buf[..];
let version = u32::deserialize(&mut cursor)?;
if version != 1 && version != 2 {
if version > 3 {
return Ok(VersionedFooter::UnknownVersion);
}
let crc32 = u32::deserialize(&mut cursor)?;
@@ -192,12 +211,17 @@ impl BinarySerializable for VersionedFooter {
crc32,
store_compression,
}
} else {
assert_eq!(version, 2);
} else if version == 2 {
VersionedFooter::V2 {
crc32,
store_compression,
}
} else {
assert_eq!(version, 3);
VersionedFooter::V3 {
crc32,
store_compression,
}
})
}
}
@@ -205,6 +229,7 @@ impl BinarySerializable for VersionedFooter {
impl VersionedFooter {
pub fn crc(&self) -> Option<CrcHashU32> {
match self {
VersionedFooter::V3 { crc32, .. } => Some(*crc32),
VersionedFooter::V2 { crc32, .. } => Some(*crc32),
VersionedFooter::V1 { crc32, .. } => Some(*crc32),
VersionedFooter::UnknownVersion { .. } => None,
@@ -243,7 +268,7 @@ impl<W: TerminatingWrite> Write for FooterProxy<W> {
impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
let crc32 = self.hasher.take().unwrap().finalize();
let footer = Footer::new(VersionedFooter::V2 {
let footer = Footer::new(VersionedFooter::V3 {
crc32,
store_compression: crate::store::COMPRESSION.to_string(),
});
@@ -278,7 +303,7 @@ mod tests {
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
assert!(matches!(
footer.versioned_footer,
VersionedFooter::V2 { store_compression, .. }
VersionedFooter::V3 { store_compression, .. }
if store_compression == crate::store::COMPRESSION
));
assert_eq!(&footer.version, crate::version());
@@ -288,7 +313,7 @@ mod tests {
fn test_serialize_deserialize_footer() {
let mut buffer = Vec::new();
let crc32 = 123456u32;
let footer: Footer = Footer::new(VersionedFooter::V2 {
let footer: Footer = Footer::new(VersionedFooter::V3 {
crc32,
store_compression: "lz4".to_string(),
});
@@ -300,7 +325,7 @@ mod tests {
#[test]
fn footer_length() {
let crc32 = 1111111u32;
let versioned_footer = VersionedFooter::V2 {
let versioned_footer = VersionedFooter::V3 {
crc32,
store_compression: "lz4".to_string(),
};
@@ -321,7 +346,7 @@ mod tests {
// versionned footer length
12 | 128,
// index format version
2,
3,
0,
0,
0,
@@ -340,7 +365,7 @@ mod tests {
let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap();
assert!(cursor.is_empty());
let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
let expected_versioned_footer: VersionedFooter = VersionedFooter::V2 {
let expected_versioned_footer: VersionedFooter = VersionedFooter::V3 {
crc32: expected_crc,
store_compression: "lz4".to_string(),
};

View File

@@ -51,6 +51,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
}
}
pub(crate) fn cast<TFastValue: FastValue>(self) -> FastFieldReader<TFastValue> {
FastFieldReader {
bit_unpacker: self.bit_unpacker,
min_value_u64: self.min_value_u64,
max_value_u64: self.max_value_u64,
_phantom: PhantomData,
}
}
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.

View File

@@ -1,6 +1,6 @@
use crate::common::CompositeFile;
use crate::fastfield::BytesFastFieldReader;
use crate::fastfield::MultiValueIntFastFieldReader;
use crate::fastfield::{BytesFastFieldReader, FastValue};
use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
@@ -201,6 +201,14 @@ impl FastFieldReaders {
None
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> Option<FastFieldReader<TFastValue>> {
self.u64_lenient(field)
.map(|fast_field_reader| fast_field_reader.cast())
}
/// Returns the `i64` fast field reader reader associated to `field`.
///
/// If `field` is not a i64 fast field, this method returns `None`.

View File

@@ -31,7 +31,7 @@ fn test_indexing() {
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 {
let random_val = rng.gen_range(0, 20);
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit().expect("Commit failed");
committed_docs.extend(&uncommitted_docs);

View File

@@ -8,7 +8,7 @@ const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;
/// `LogMergePolicy` tries tries to merge segments that have a similar number of
/// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents.
#[derive(Debug, Clone)]
pub struct LogMergePolicy {

View File

@@ -503,7 +503,6 @@ impl IndexMerger {
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
let mut delta_computer = DeltaComputer::new();
let mut field_term_streams = Vec::new();
let mut max_term_ords: Vec<TermOrdinal> = Vec::new();
let field_readers: Vec<Arc<InvertedIndexReader>> = self
@@ -512,6 +511,7 @@ impl IndexMerger {
.map(|reader| reader.inverted_index(indexed_field))
.collect::<crate::Result<Vec<_>>>()?;
let mut field_term_streams = Vec::new();
for field_reader in &field_readers {
let terms = field_reader.terms();
field_term_streams.push(terms.stream()?);

View File

@@ -10,10 +10,9 @@ use crate::schema::FieldType;
use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::{DynTokenStreamChain, Tokenizer};
use crate::tokenizer::{FacetTokenizer, TextAnalyzerT, Token};
use crate::Opstamp;
use crate::{DocId, SegmentComponent};
@@ -23,7 +22,7 @@ use crate::{DocId, SegmentComponent};
fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
let table_memory_upper_bound = per_thread_memory_budget / 3;
if let Some(limit) = (10..)
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound)
.take_while(|&num_bits| compute_table_size(num_bits) < table_memory_upper_bound)
.last()
{
Ok(limit.min(19)) // we cap it at 2^19 = 512K.
@@ -45,7 +44,8 @@ pub struct SegmentWriter {
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<TextAnalyzer>>,
// TODO: change type
tokenizers: Vec<Option<Box<dyn TextAnalyzerT>>>,
term_buffer: Term,
}
@@ -70,17 +70,17 @@ impl SegmentWriter {
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let tokenizers = schema
.fields()
.map(
|(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
FieldType::Str(ref text_options) => text_options
.map(|(_, field_entry)| match field_entry.field_type() {
FieldType::Str(text_options) => {
text_options
.get_indexing_options()
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
tokenizer_manager.get(tokenizer_name)
}),
_ => None,
},
)
})
}
_ => None,
})
.collect();
Ok(SegmentWriter {
max_doc: 0,
@@ -141,13 +141,13 @@ impl SegmentWriter {
}
let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings);
match *field_entry.field_type() {
match field_entry.field_type() {
FieldType::HierarchicalFacet => {
term_buffer.set_field(field);
let facets =
field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
.flat_map(|field_value| match field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
@@ -157,12 +157,13 @@ impl SegmentWriter {
let mut unordered_term_id_opt = None;
FacetTokenizer
.token_stream(facet_str)
.process(&mut |token| {
.map(|token| {
term_buffer.set_text(&token.text);
let unordered_term_id =
multifield_postings.subscribe(doc_id, &term_buffer);
unordered_term_id_opt = Some(unordered_term_id);
});
})
.count();
if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers
.get_multivalue_writer(field)
@@ -172,37 +173,38 @@ impl SegmentWriter {
}
}
FieldType::Str(_) => {
let mut token_streams: Vec<BoxTokenStream> = vec![];
let mut offsets = vec![];
let mut streams_with_offsets = vec![];
let mut total_offset = 0;
for field_value in field_values {
match field_value.value() {
Value::PreTokStr(tok_str) => {
offsets.push(total_offset);
streams_with_offsets.push((
Box::new(PreTokenizedStream::from(tok_str.clone()))
as Box<dyn Iterator<Item = Token>>,
total_offset,
));
if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to;
}
token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into());
}
Value::Str(ref text) => {
Value::Str(text) => {
if let Some(ref mut tokenizer) =
self.tokenizers[field.field_id() as usize]
{
offsets.push(total_offset);
streams_with_offsets
.push((tokenizer.token_stream(text), total_offset));
total_offset += text.len();
token_streams.push(tokenizer.token_stream(text));
}
}
_ => (),
}
}
let num_tokens = if token_streams.is_empty() {
let num_tokens = if streams_with_offsets.is_empty() {
0
} else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
let mut token_stream = DynTokenStreamChain::from_vec(streams_with_offsets);
multifield_postings.index_text(
doc_id,
field,
@@ -213,71 +215,62 @@ impl SegmentWriter {
self.fieldnorms_writer.record(doc_id, field, num_tokens);
}
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let u64_val = field_value
.value()
.u64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
}
FieldType::U64(int_option) if int_option.is_indexed() => {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let u64_val = field_value
.value()
.u64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let date_val = field_value
.value()
.date_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_i64(date_val.timestamp());
multifield_postings.subscribe(doc_id, &term_buffer);
}
FieldType::Date(int_option) if int_option.is_indexed() => {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let date_val = field_value
.value()
.date_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_i64(date_val.timestamp());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let i64_val = field_value
.value()
.i64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
}
FieldType::I64(int_option) if int_option.is_indexed() => {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let i64_val = field_value
.value()
.i64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
FieldType::F64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let f64_val = field_value
.value()
.f64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
}
FieldType::F64(int_option) if int_option.is_indexed() => {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let f64_val = field_value
.value()
.f64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
FieldType::Bytes(ref option) => {
if option.is_indexed() {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let bytes = field_value
.value()
.bytes_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
self.multifield_postings.subscribe(doc_id, &term_buffer);
}
FieldType::Bytes(option) if option.is_indexed() => {
for field_value in field_values {
term_buffer.set_field(field_value.field());
let bytes = field_value
.value()
.bytes_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
self.multifield_postings.subscribe(doc_id, &term_buffer);
}
}
_ => {}
}
}
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());

View File

@@ -174,7 +174,7 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 2;
const INDEX_FORMAT_VERSION: u32 = 3;
/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]

View File

@@ -132,7 +132,7 @@ impl PositionReader {
"offset arguments should be increasing."
);
let delta_to_block_offset = offset as i64 - self.block_offset as i64;
if delta_to_block_offset < 0 || delta_to_block_offset >= 128 {
if !(0..128).contains(&delta_to_block_offset) {
// The first position is not within the first block.
// We need to decompress the first block.
let delta_to_anchor_offset = offset - self.anchor_offset;

View File

@@ -109,9 +109,9 @@ impl BlockSearcher {
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// .iter()
/// .take_while(|&&val| val < target)
/// .count()
/// ```
///
/// The `start` argument is just used to hint that the response is

View File

@@ -9,7 +9,6 @@ use crate::postings::{FieldSerializer, InvertedIndexSerializer};
use crate::schema::IndexRecordOption;
use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
use crate::termdict::TermOrdinal;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{Token, MAX_TOKEN_LEN};
use crate::DocId;
use fnv::FnvHashMap;
@@ -100,12 +99,10 @@ impl MultiFieldPostingsWriter {
&mut self,
doc: DocId,
field: Field,
token_stream: &mut dyn TokenStream,
token_stream: &mut dyn Iterator<Item = Token>,
term_buffer: &mut Term,
) -> u32 {
let postings_writer =
self.per_field_postings_writers[field.field_id() as usize].deref_mut();
postings_writer.index_text(
self.per_field_postings_writers[field.field_id() as usize].index_text(
&mut self.term_index,
doc,
field,
@@ -217,7 +214,7 @@ pub trait PostingsWriter {
term_index: &mut TermHashMap,
doc_id: DocId,
field: Field,
token_stream: &mut dyn TokenStream,
token_stream: &mut dyn Iterator<Item = Token>,
heap: &mut MemoryArena,
term_buffer: &mut Term,
) -> u32 {
@@ -242,7 +239,7 @@ pub trait PostingsWriter {
);
}
};
token_stream.process(&mut sink)
token_stream.map(|tok| sink(&tok)).count() as u32
}
fn total_num_tokens(&self) -> u64;

View File

@@ -1,32 +1,46 @@
use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable};
use std::convert::TryInto;
use crate::directory::OwnedBytes;
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
use crate::query::BM25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
#[inline(always)]
fn encode_block_wand_max_tf(max_tf: u32) -> u8 {
max_tf.min(u8::MAX as u32) as u8
}
#[inline(always)]
fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 {
if max_tf_code == u8::MAX {
u32::MAX
} else {
max_tf_code as u32
}
}
#[inline(always)]
fn read_u32(data: &[u8]) -> u32 {
u32::from_le_bytes(data[..4].try_into().unwrap())
}
#[inline(always)]
fn write_u32(val: u32, buf: &mut Vec<u8>) {
buf.extend_from_slice(&val.to_le_bytes());
}
pub struct SkipSerializer {
buffer: Vec<u8>,
prev_doc: DocId,
}
impl SkipSerializer {
pub fn new() -> SkipSerializer {
SkipSerializer {
buffer: Vec::new(),
prev_doc: 0u32,
}
SkipSerializer { buffer: Vec::new() }
}
pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) {
assert!(
last_doc > self.prev_doc,
"write_doc(...) called with non-increasing doc ids. \
Did you forget to call clear maybe?"
);
let delta_doc = last_doc - self.prev_doc;
self.prev_doc = last_doc;
delta_doc.serialize(&mut self.buffer).unwrap();
write_u32(last_doc, &mut self.buffer);
self.buffer.push(doc_num_bits);
}
@@ -35,16 +49,13 @@ impl SkipSerializer {
}
pub fn write_total_term_freq(&mut self, tf_sum: u32) {
tf_sum
.serialize(&mut self.buffer)
.expect("Should never fail");
write_u32(tf_sum, &mut self.buffer);
}
pub fn write_blockwand_max(&mut self, fieldnorm_id: u8, term_freq: u32) {
self.buffer.push(fieldnorm_id);
let mut buf = [0u8; 8];
let bytes = serialize_vint_u32(term_freq, &mut buf);
self.buffer.extend_from_slice(bytes);
let block_wand_tf = encode_block_wand_max_tf(term_freq);
self.buffer
.extend_from_slice(&[fieldnorm_id, block_wand_tf]);
}
pub fn data(&self) -> &[u8] {
@@ -52,7 +63,6 @@ impl SkipSerializer {
}
pub fn clear(&mut self) {
self.prev_doc = 0u32;
self.buffer.clear();
}
}
@@ -159,18 +169,13 @@ impl SkipReader {
}
fn read_block_info(&mut self) {
let doc_delta = {
let bytes = self.owned_read.as_slice();
let mut buf = [0; 4];
buf.copy_from_slice(&bytes[..4]);
u32::from_le_bytes(buf)
};
self.last_doc_in_block += doc_delta as DocId;
let doc_num_bits = self.owned_read.as_slice()[4];
let bytes = self.owned_read.as_slice();
let advance_len: usize;
self.last_doc_in_block = read_u32(bytes);
let doc_num_bits = bytes[4];
match self.skip_info {
IndexRecordOption::Basic => {
self.owned_read.advance(5);
advance_len = 5;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits: 0,
@@ -180,11 +185,10 @@ impl SkipReader {
};
}
IndexRecordOption::WithFreqs => {
let bytes = self.owned_read.as_slice();
let tf_num_bits = bytes[5];
let block_wand_fieldnorm_id = bytes[6];
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[7..]);
self.owned_read.advance(7 + num_bytes);
let block_wand_term_freq = decode_block_wand_max_tf(bytes[7]);
advance_len = 8;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
@@ -194,16 +198,11 @@ impl SkipReader {
};
}
IndexRecordOption::WithFreqsAndPositions => {
let bytes = self.owned_read.as_slice();
let tf_num_bits = bytes[5];
let tf_sum = {
let mut buf = [0; 4];
buf.copy_from_slice(&bytes[6..10]);
u32::from_le_bytes(buf)
};
let tf_sum = read_u32(&bytes[6..10]);
let block_wand_fieldnorm_id = bytes[10];
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[11..]);
self.owned_read.advance(11 + num_bytes);
let block_wand_term_freq = decode_block_wand_max_tf(bytes[11]);
advance_len = 12;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
@@ -213,6 +212,7 @@ impl SkipReader {
};
}
}
self.owned_read.advance(advance_len);
}
pub fn block_info(&self) -> BlockInfo {
@@ -274,6 +274,24 @@ mod tests {
use crate::directory::OwnedBytes;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
#[test]
fn test_encode_block_wand_max_tf() {
for tf in 0..255 {
assert_eq!(super::encode_block_wand_max_tf(tf), tf as u8);
}
for &tf in &[255, 256, 1_000_000, u32::MAX] {
assert_eq!(super::encode_block_wand_max_tf(tf), 255);
}
}
#[test]
fn test_decode_block_wand_max_tf() {
for tf in 0..255 {
assert_eq!(super::decode_block_wand_max_tf(tf), tf as u32);
}
assert_eq!(super::decode_block_wand_max_tf(255), u32::MAX);
}
#[test]
fn test_skip_with_freq() {
let buf = {

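A note on why the one-byte BlockWAND encoding above is safe: the encoder saturates at 255 and the decoder expands the saturated code back to `u32::MAX`, so the decoded block-max term frequency can only overestimate the true value. BlockWAND only needs an upper bound to prune blocks, so an overestimate costs a little pruning tightness but never correctness. A minimal round-trip sketch of that invariant (the two functions are copied from the hunk above; the harness is illustrative only):

fn encode_block_wand_max_tf(max_tf: u32) -> u8 {
    max_tf.min(u8::MAX as u32) as u8
}

fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 {
    if max_tf_code == u8::MAX {
        u32::MAX
    } else {
        max_tf_code as u32
    }
}

fn main() {
    for &tf in &[0u32, 1, 254, 255, 256, 1_000_000, u32::MAX] {
        let decoded = decode_block_wand_max_tf(encode_block_wand_max_tf(tf));
        assert!(decoded >= tf); // the decoded value is always an upper bound
        assert!(tf >= 255 || decoded == tf); // and exact below the saturation point
    }
    println!("block-wand max-tf round trip holds");
}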
View File

@@ -1,5 +1,3 @@
use rayon::iter::IntoParallelRefIterator;
use crate::core::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::explanation::does_not_match;
@@ -24,7 +22,7 @@ enum SpecializedScorer {
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<dyn Scorer>>) -> SpecializedScorer
where
TScoreCombiner: ScoreCombiner + Send,
TScoreCombiner: ScoreCombiner,
{
assert!(!scorers.is_empty());
if scorers.len() == 1 {
@@ -54,7 +52,7 @@ where
SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(scorers)))
}
fn into_box_scorer<TScoreCombiner: ScoreCombiner + Send>(scorer: SpecializedScorer) -> Box<dyn Scorer> {
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(scorer: SpecializedScorer) -> Box<dyn Scorer> {
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let union_scorer = Union::<TermScorer, TScoreCombiner>::from(term_scorers);
@@ -82,32 +80,18 @@ impl BooleanWeight {
reader: &SegmentReader,
boost: Score,
) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> {
use rayon::iter::ParallelIterator;
use rayon::iter::IndexedParallelIterator;
let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new();
let mut items_res: Vec<crate::Result<(Occur, Box<dyn Scorer>)>> = Vec::new();
let pool = rayon::ThreadPoolBuilder::new().num_threads(self.weights.len()).build().unwrap();
pool.install(|| {
self.weights.iter()
.collect::<Vec<_>>()
.par_iter()
.map(|(occur, subweight)| {
let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?;
Ok((*occur, sub_scorer))
})
.collect_into_vec(&mut items_res);
});
for item_res in items_res {
let (occur, sub_scorer) = item_res?;
for &(ref occur, ref subweight) in &self.weights {
let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?;
per_occur_scorers
.entry(occur)
.entry(*occur)
.or_insert_with(Vec::new)
.push(sub_scorer);
}
Ok(per_occur_scorers)
}
fn complex_scorer<TScoreCombiner: ScoreCombiner >(
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
&self,
reader: &SegmentReader,
boost: Score,

View File

@@ -289,7 +289,7 @@ impl QueryParser {
let field_name = field_entry.name().to_string();
return Err(QueryParserError::FieldNotIndexed(field_name));
}
match *field_type {
match field_type {
FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
let term = Term::from_field_i64(field, val);
@@ -312,7 +312,7 @@ impl QueryParser {
let term = Term::from_field_u64(field, val);
Ok(vec![(0, term)])
}
FieldType::Str(ref str_options) => {
FieldType::Str(str_options) => {
if let Some(option) = str_options.get_indexing_options() {
let tokenizer =
self.tokenizer_manager
@@ -323,15 +323,14 @@ impl QueryParser {
option.tokenizer().to_string(),
)
})?;
let mut terms: Vec<(usize, Term)> = Vec::new();
let mut token_stream = tokenizer.token_stream(phrase);
token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.text);
terms.push((token.position, term));
});
if terms.is_empty() {
Ok(vec![])
} else if terms.len() == 1 {
let token_stream = tokenizer.token_stream(phrase);
let terms: Vec<_> = token_stream
.map(|token| {
let term = Term::from_field_text(field, &token.text);
(token.position, term)
})
.collect();
if terms.len() <= 1 {
Ok(terms)
} else {
let field_entry = self.schema.get_field_entry(field);
@@ -414,7 +413,7 @@ impl QueryParser {
&self,
given_field: &Option<String>,
) -> Result<Cow<'_, [Field]>, QueryParserError> {
match *given_field {
match given_field {
None => {
if self.default_fields.is_empty() {
Err(QueryParserError::NoDefaultFieldDeclared)
@@ -422,7 +421,7 @@ impl QueryParser {
Ok(Cow::from(&self.default_fields[..]))
}
}
Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
Some(field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
}
}
@@ -574,15 +573,12 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
#[cfg(test)]
mod test {
use super::super::logical_ast::*;
use super::QueryParser;
use super::QueryParserError;
use super::*;
use crate::query::Query;
use crate::schema::Field;
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT};
use crate::tokenizer::{
LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager,
};
use crate::tokenizer::{analyzer_builder, LowerCaser, SimpleTokenizer, StopWordFilter};
use crate::Index;
use matches::assert_matches;
@@ -620,9 +616,10 @@ mod test {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"en_with_stop_words",
TextAnalyzer::from(SimpleTokenizer)
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()])),
analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.filter(StopWordFilter::remove(vec!["the".to_string()]))
.build(),
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}

View File

@@ -302,7 +302,7 @@ mod tests {
let mut rng = rand::thread_rng();
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..3_000 {
let term_freq = rng.gen_range(1, 10000);
let term_freq = rng.gen_range(1..10000);
let words: Vec<&str> = std::iter::repeat("bbbb").take(term_freq).collect();
let text = words.join(" ");
writer.add_document(doc!(text_field=>text));

View File

@@ -1,5 +1,5 @@
use crate::schema::Value;
use serde::Serialize;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
/// Internal representation of a document used for JSON
@@ -8,5 +8,5 @@ use std::collections::BTreeMap;
/// A `NamedFieldDocument` is a simple representation of a document
/// as a `BTreeMap<String, Vec<Value>>`.
///
#[derive(Serialize)]
#[derive(Debug, Deserialize, Serialize)]
pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);

View File

@@ -1,7 +1,7 @@
use crate::query::Query;
use crate::schema::Field;
use crate::schema::Value;
use crate::tokenizer::{TextAnalyzer, Token};
use crate::tokenizer::{TextAnalyzerT, Token};
use crate::Searcher;
use crate::{Document, Score};
use htmlescape::encode_minimal;
@@ -139,9 +139,9 @@ impl Snippet {
///
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
/// has to be a valid string.
fn search_fragments<'a>(
tokenizer: &TextAnalyzer,
text: &'a str,
fn search_fragments(
tokenizer: &dyn TextAnalyzerT,
text: &str,
terms: &BTreeMap<String, Score>,
max_num_chars: usize,
) -> Vec<FragmentCandidate> {
@@ -155,7 +155,7 @@ fn search_fragments<'a>(
};
fragment = FragmentCandidate::new(next.offset_from);
}
fragment.try_add_token(next, &terms);
fragment.try_add_token(&next, &terms);
}
if fragment.score > 0.0 {
fragments.push(fragment)
@@ -249,7 +249,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// ```
pub struct SnippetGenerator {
terms_text: BTreeMap<String, Score>,
tokenizer: TextAnalyzer,
tokenizer: Box<dyn TextAnalyzerT>,
field: Field,
max_num_chars: usize,
}
@@ -297,33 +297,37 @@ impl SnippetGenerator {
///
/// This method extract the text associated to the `SnippetGenerator`'s field
/// and computes a snippet.
pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
pub fn snippet_from_doc(&mut self, doc: &Document) -> Snippet {
let text: String = doc
.get_all(self.field)
.flat_map(Value::text)
.collect::<Vec<&str>>()
.join(" ");
self.snippet(&text)
self.snippet(text.as_ref())
}
/// Generates a snippet for the given text.
pub fn snippet(&self, text: &str) -> Snippet {
let fragment_candidates =
search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars);
select_best_fragment_combination(&fragment_candidates[..], &text)
pub fn snippet(&mut self, text: &str) -> Snippet {
let fragment_candidates = search_fragments(
&mut *self.tokenizer,
text,
&self.terms_text,
self.max_num_chars,
);
select_best_fragment_combination(&fragment_candidates[..], text)
}
}
#[cfg(test)]
mod tests {
use super::{search_fragments, select_best_fragment_combination};
use super::*;
use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::tokenizer::SimpleTokenizer;
use crate::tokenizer::TextAnalyzer;
use crate::Index;
use crate::SnippetGenerator;
use maplit::btreemap;
use std::collections::BTreeMap;
use std::iter::Iterator;
const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by
@@ -346,7 +350,13 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") => 1.0,
String::from("language") => 0.9
};
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
100,
);
assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -373,7 +383,12 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") =>1.0,
String::from("language") => 0.9
};
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
20,
);
{
let first = &fragments[0];
assert_eq!(first.score, 1.0);
@@ -387,7 +402,12 @@ Survey in 2016, 2017, and 2018."#;
String::from("rust") =>0.9,
String::from("language") => 1.0
};
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
TEST_TEXT,
&terms,
20,
);
//assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -406,7 +426,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("c"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 1);
{
@@ -428,7 +453,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("f"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 2);
{
@@ -451,7 +481,12 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("f"), 1.0);
terms.insert(String::from("a"), 0.9);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
7,
);
assert_eq!(fragments.len(), 2);
{
@@ -473,7 +508,12 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("z"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 0);
@@ -487,7 +527,12 @@ Survey in 2016, 2017, and 2018."#;
let text = "a b c d";
let terms = BTreeMap::new();
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(
&Into::<TextAnalyzer<_>>::into(SimpleTokenizer),
&text,
&terms,
3,
);
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], &text);
@@ -572,12 +617,12 @@ Survey in 2016, 2017, and 2018."#;
let mut snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
{
let snippet = snippet_generator.snippet(TEST_TEXT);
let snippet = snippet_generator.snippet(TEST_TEXT.into());
assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
}
{
snippet_generator.set_max_num_chars(90);
let snippet = snippet_generator.snippet(TEST_TEXT);
let snippet = snippet_generator.snippet(TEST_TEXT.into());
assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
}
}

View File

@@ -3,7 +3,7 @@ use std::io::{self, Read, Write};
/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &'static str = "lz4";
pub const COMPRESSION: &str = "lz4";
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();

View File

@@ -35,11 +35,11 @@ struct Layer {
}
impl Layer {
fn cursor<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.cursor_at_offset(0u64)
}
fn cursor_at_offset<'a>(&'a self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + 'a {
fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + '_ {
let data = &self.data.as_slice();
LayerCursor {
remaining: &data[start_offset as usize..],
@@ -59,7 +59,7 @@ pub struct SkipIndex {
}
impl SkipIndex {
pub(crate) fn checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
pub(crate) fn checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.layers
.last()
.into_iter()

View File

@@ -46,7 +46,7 @@ impl StoreReader {
})
}
pub(crate) fn block_checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
pub(crate) fn block_checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.skip_index.checkpoints()
}

View File

@@ -22,10 +22,8 @@ A second datastructure makes it possible to access a [`TermInfo`](../postings/st
use tantivy_fst::automaton::AlwaysMatch;
// mod fst_termdict;
// use fst_termdict as termdict;
mod sstable_termdict;
use sstable_termdict as termdict;
mod fst_termdict;
use fst_termdict as termdict;
mod merger;

View File

@@ -1,148 +0,0 @@
use std::io;
mod sstable;
mod streamer;
mod termdict;
use self::sstable::value::{ValueReader, ValueWriter};
use self::sstable::{BlockReader, SSTable};
use crate::common::VInt;
use crate::postings::TermInfo;
pub use self::streamer::{TermStreamer, TermStreamerBuilder};
pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
pub struct TermSSTable;
impl SSTable for TermSSTable {
type Value = TermInfo;
type Reader = TermInfoReader;
type Writer = TermInfoWriter;
}
#[derive(Default)]
pub struct TermInfoReader {
term_infos: Vec<TermInfo>,
}
impl ValueReader for TermInfoReader {
type Value = TermInfo;
fn value(&self, idx: usize) -> &TermInfo {
&self.term_infos[idx]
}
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()> {
self.term_infos.clear();
let num_els = VInt::deserialize_u64(reader)?;
let mut start_offset = VInt::deserialize_u64(reader)?;
let mut positions_idx = 0;
for _ in 0..num_els {
let doc_freq = VInt::deserialize_u64(reader)? as u32;
let posting_num_bytes = VInt::deserialize_u64(reader)?;
let stop_offset = start_offset + posting_num_bytes;
let delta_positions_idx = VInt::deserialize_u64(reader)?;
positions_idx += delta_positions_idx;
let term_info = TermInfo {
doc_freq,
postings_start_offset: start_offset,
postings_stop_offset: stop_offset,
positions_idx,
};
self.term_infos.push(term_info);
start_offset = stop_offset;
}
Ok(())
}
}
#[derive(Default)]
pub struct TermInfoWriter {
term_infos: Vec<TermInfo>,
}
impl ValueWriter for TermInfoWriter {
type Value = TermInfo;
fn write(&mut self, term_info: &TermInfo) {
self.term_infos.push(term_info.clone());
}
fn write_block(&mut self, buffer: &mut Vec<u8>) {
VInt(self.term_infos.len() as u64).serialize_into_vec(buffer);
if self.term_infos.is_empty() {
return;
}
let mut prev_position_idx = 0u64;
VInt(self.term_infos[0].postings_start_offset).serialize_into_vec(buffer);
for term_info in &self.term_infos {
VInt(term_info.doc_freq as u64).serialize_into_vec(buffer);
VInt(term_info.postings_stop_offset - term_info.postings_start_offset)
.serialize_into_vec(buffer);
VInt(term_info.positions_idx - prev_position_idx).serialize_into_vec(buffer);
prev_position_idx = term_info.positions_idx;
}
self.term_infos.clear();
}
}
#[cfg(test)]
mod tests {
use std::io;
use super::BlockReader;
use crate::directory::OwnedBytes;
use crate::postings::TermInfo;
use crate::termdict::sstable_termdict::sstable::value::{ValueReader, ValueWriter};
use crate::termdict::sstable_termdict::TermInfoReader;
#[test]
fn test_block_terminfos() -> io::Result<()> {
let mut term_info_writer = super::TermInfoWriter::default();
term_info_writer.write(&TermInfo {
doc_freq: 120u32,
postings_start_offset: 17u64,
postings_stop_offset: 45u64,
positions_idx: 10u64,
});
term_info_writer.write(&TermInfo {
doc_freq: 10u32,
postings_start_offset: 45u64,
postings_stop_offset: 450u64,
positions_idx: 104u64,
});
term_info_writer.write(&TermInfo {
doc_freq: 17u32,
postings_start_offset: 450u64,
postings_stop_offset: 462u64,
positions_idx: 210u64,
});
let mut buffer = Vec::new();
term_info_writer.write_block(&mut buffer);
let mut block_reader = make_block_reader(&buffer[..]);
let mut term_info_reader = TermInfoReader::default();
term_info_reader.read(&mut block_reader)?;
assert_eq!(
term_info_reader.value(0),
&TermInfo {
doc_freq: 120u32,
postings_start_offset: 17u64,
postings_stop_offset: 45u64,
positions_idx: 10u64
}
);
assert!(block_reader.buffer().is_empty());
Ok(())
}
fn make_block_reader(data: &[u8]) -> BlockReader {
let mut buffer = (data.len() as u32).to_le_bytes().to_vec();
buffer.extend_from_slice(data);
let owned_bytes = OwnedBytes::new(buffer);
let mut block_reader = BlockReader::new(Box::new(owned_bytes));
block_reader.read_block().unwrap();
block_reader
}
}

View File

@@ -1,84 +0,0 @@
use byteorder::{LittleEndian, ReadBytesExt};
use std::io::{self, Read};
pub struct BlockReader<'a> {
buffer: Vec<u8>,
reader: Box<dyn io::Read + 'a>,
offset: usize,
}
impl<'a> BlockReader<'a> {
pub fn new(reader: Box<dyn io::Read + 'a>) -> BlockReader<'a> {
BlockReader {
buffer: Vec::new(),
reader,
offset: 0,
}
}
pub fn deserialize_u64(&mut self) -> u64 {
let (num_bytes, val) = super::vint::deserialize_read(self.buffer());
self.advance(num_bytes);
val
}
#[inline(always)]
pub fn buffer_from_to(&self, start: usize, end: usize) -> &[u8] {
&self.buffer[start..end]
}
pub fn buffer_from(&self, start: usize) -> &[u8] {
&self.buffer[start..]
}
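// A block is framed as a little-endian u32 length prefix followed by that
// many bytes. `Ok(false)` signals a clean end of the stream: either EOF
// while reading the length, or a zero-length sentinel block.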
pub fn read_block(&mut self) -> io::Result<bool> {
self.offset = 0;
let block_len_res = self.reader.read_u32::<LittleEndian>();
if let Err(err) = &block_len_res {
if err.kind() == io::ErrorKind::UnexpectedEof {
return Ok(false);
}
}
let block_len = block_len_res?;
if block_len == 0u32 {
self.buffer.clear();
return Ok(false);
}
self.buffer.resize(block_len as usize, 0u8);
self.reader.read_exact(&mut self.buffer[..])?;
Ok(true)
}
pub fn offset(&self) -> usize {
self.offset
}
pub fn advance(&mut self, num_bytes: usize) {
self.offset += num_bytes;
}
pub fn buffer(&self) -> &[u8] {
&self.buffer[self.offset..]
}
}
impl<'a> io::Read for BlockReader<'a> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let len = self.buffer().read(buf)?;
self.advance(len);
Ok(len)
}
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
let len = self.buffer().len();
buf.extend_from_slice(self.buffer());
self.advance(len);
Ok(len)
}
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
self.buffer().read_exact(buf)?;
self.advance(buf.len());
Ok(())
}
}

View File

@@ -1,203 +0,0 @@
use std::io::{self, BufWriter, Write};
use crate::common::CountingWriter;
use super::value::ValueWriter;
use super::{value, vint, BlockReader};
const FOUR_BIT_LIMITS: usize = 1 << 4;
const VINT_MODE: u8 = 1u8;
const BLOCK_LEN: usize = 256_000;
pub struct DeltaWriter<W, TValueWriter>
where
W: io::Write,
{
block: Vec<u8>,
write: CountingWriter<BufWriter<W>>,
value_writer: TValueWriter,
}
impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
where
W: io::Write,
TValueWriter: ValueWriter,
{
pub fn new(wrt: W) -> Self {
DeltaWriter {
block: Vec::with_capacity(BLOCK_LEN * 2),
write: CountingWriter::wrap(BufWriter::new(wrt)),
value_writer: TValueWriter::default(),
}
}
}
impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
where
W: io::Write,
TValueWriter: value::ValueWriter,
{
pub fn flush_block(&mut self) -> io::Result<Option<(u64, u64)>> {
if self.block.is_empty() {
return Ok(None);
}
let start_offset = self.write.written_bytes();
// TODO avoid buffer allocation
let mut buffer = Vec::new();
self.value_writer.write_block(&mut buffer);
let block_len = buffer.len() + self.block.len();
self.write.write_all(&(block_len as u32).to_le_bytes())?;
self.write.write_all(&buffer[..])?;
self.write.write_all(&mut self.block[..])?;
let end_offset = self.write.written_bytes();
self.block.clear();
Ok(Some((start_offset, end_offset)))
}
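// `keep_len`/`add_len` are packed into one byte (keep in the low nibble,
// add in the high nibble) when both fit; otherwise a VINT_MODE marker byte
// is emitted, followed by the two lengths as varints. The reader tests for
// VINT_MODE first; this is unambiguous because the packed byte only equals
// VINT_MODE for (keep_len, add_len) == (1, 0), a case the strictly
// increasing key invariant never produces.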
fn encode_keep_add(&mut self, keep_len: usize, add_len: usize) {
if keep_len < FOUR_BIT_LIMITS && add_len < FOUR_BIT_LIMITS {
let b = (keep_len | add_len << 4) as u8;
self.block.extend_from_slice(&[b])
} else {
let mut buf = [VINT_MODE; 20];
let mut len = 1 + vint::serialize(keep_len as u64, &mut buf[1..]);
len += vint::serialize(add_len as u64, &mut buf[len..]);
self.block.extend_from_slice(&mut buf[..len])
}
}
pub(crate) fn write_suffix(&mut self, common_prefix_len: usize, suffix: &[u8]) {
let keep_len = common_prefix_len;
let add_len = suffix.len();
self.encode_keep_add(keep_len, add_len);
self.block.extend_from_slice(suffix);
}
pub(crate) fn write_value(&mut self, value: &TValueWriter::Value) {
self.value_writer.write(value);
}
pub fn write_delta(
&mut self,
common_prefix_len: usize,
suffix: &[u8],
value: &TValueWriter::Value,
) {
self.write_suffix(common_prefix_len, suffix);
self.write_value(value);
}
pub fn flush_block_if_required(&mut self) -> io::Result<Option<(u64, u64)>> {
if self.block.len() > BLOCK_LEN {
return self.flush_block();
}
Ok(None)
}
pub fn finalize(mut self) -> CountingWriter<BufWriter<W>> {
self.write
}
}
pub struct DeltaReader<'a, TValueReader> {
common_prefix_len: usize,
suffix_start: usize,
suffix_end: usize,
value_reader: TValueReader,
block_reader: BlockReader<'a>,
idx: usize,
}
impl<'a, TValueReader> DeltaReader<'a, TValueReader>
where
TValueReader: value::ValueReader,
{
pub fn new<R: io::Read + 'a>(reader: R) -> Self {
DeltaReader {
idx: 0,
common_prefix_len: 0,
suffix_start: 0,
suffix_end: 0,
value_reader: TValueReader::default(),
block_reader: BlockReader::new(Box::new(reader)),
}
}
fn deserialize_vint(&mut self) -> u64 {
self.block_reader.deserialize_u64()
}
fn read_keep_add(&mut self) -> Option<(usize, usize)> {
let b = {
let buf = &self.block_reader.buffer();
if buf.is_empty() {
return None;
}
buf[0]
};
self.block_reader.advance(1);
match b {
VINT_MODE => {
let keep = self.deserialize_vint() as usize;
let add = self.deserialize_vint() as usize;
Some((keep, add))
}
b => {
let keep = (b & 0b1111) as usize;
let add = (b >> 4) as usize;
Some((keep, add))
}
}
}
fn read_delta_key(&mut self) -> bool {
if let Some((keep, add)) = self.read_keep_add() {
self.common_prefix_len = keep;
self.suffix_start = self.block_reader.offset();
self.suffix_end = self.suffix_start + add;
self.block_reader.advance(add);
true
} else {
false
}
}
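// Advances to the next (key, value) pair. When the current block is
// exhausted, the next block is loaded and its values are decoded eagerly
// by the value reader, while key suffixes are decoded lazily per call.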
pub fn advance(&mut self) -> io::Result<bool> {
if self.block_reader.buffer().is_empty() {
if !self.block_reader.read_block()? {
return Ok(false);
}
self.value_reader.read(&mut self.block_reader)?;
self.idx = 0;
} else {
self.idx += 1;
}
if !self.read_delta_key() {
return Ok(false);
}
Ok(true)
}
pub fn common_prefix_len(&self) -> usize {
self.common_prefix_len
}
pub fn suffix(&self) -> &[u8] {
&self
.block_reader
.buffer_from_to(self.suffix_start, self.suffix_end)
}
pub fn suffix_from(&self, offset: usize) -> &[u8] {
&self.block_reader.buffer_from_to(
self.suffix_start
.wrapping_add(offset)
.wrapping_sub(self.common_prefix_len),
self.suffix_end,
)
}
pub fn value(&self) -> &TValueReader::Value {
self.value_reader.value(self.idx)
}
}

View File

@@ -1,72 +0,0 @@
use crate::termdict::sstable_termdict::sstable::{Reader, SSTable, Writer};
use super::SingleValueMerger;
use super::ValueMerger;
use std::cmp::Ordering;
use std::collections::binary_heap::PeekMut;
use std::collections::BinaryHeap;
use std::io;
struct HeapItem<B: AsRef<[u8]>>(B);
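// `BinaryHeap` is a max-heap; the reversed comparisons below turn it into a
// min-heap so that the reader with the smallest current key sits on top.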
impl<B: AsRef<[u8]>> Ord for HeapItem<B> {
fn cmp(&self, other: &Self) -> Ordering {
other.0.as_ref().cmp(self.0.as_ref())
}
}
impl<B: AsRef<[u8]>> PartialOrd for HeapItem<B> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(other.0.as_ref().cmp(self.0.as_ref()))
}
}
impl<B: AsRef<[u8]>> Eq for HeapItem<B> {}
impl<B: AsRef<[u8]>> PartialEq for HeapItem<B> {
fn eq(&self, other: &Self) -> bool {
self.0.as_ref() == other.0.as_ref()
}
}
pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
readers: Vec<Reader<SST::Reader>>,
mut writer: Writer<W, SST::Writer>,
mut merger: M,
) -> io::Result<()> {
let mut heap: BinaryHeap<HeapItem<Reader<SST::Reader>>> =
BinaryHeap::with_capacity(readers.len());
for mut reader in readers {
if reader.advance()? {
heap.push(HeapItem(reader));
}
}
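// Each iteration of this loop writes exactly one merged entry: the heap
// head supplies the smallest pending key, and every other reader currently
// positioned on an equal key is folded into the same value via the merger.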
loop {
let len = heap.len();
let mut value_merger;
if let Some(mut head) = heap.peek_mut() {
writer.write_key(head.0.key());
value_merger = merger.new_value(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
}
} else {
break;
}
for _ in 0..len - 1 {
if let Some(mut head) = heap.peek_mut() {
if head.0.key() == writer.current_key() {
value_merger.add(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
}
continue;
}
}
break;
}
let value = value_merger.finish();
writer.write_value(&value);
writer.flush_block_if_required()?;
}
writer.finalize()?;
Ok(())
}

View File

@@ -1,184 +0,0 @@
mod heap_merge;
pub use self::heap_merge::merge_sstable;
pub trait SingleValueMerger<V> {
fn add(&mut self, v: &V);
fn finish(self) -> V;
}
pub trait ValueMerger<V> {
type TSingleValueMerger: SingleValueMerger<V>;
fn new_value(&mut self, v: &V) -> Self::TSingleValueMerger;
}
#[derive(Default)]
pub struct KeepFirst;
pub struct FirstVal<V>(V);
impl<V: Clone> ValueMerger<V> for KeepFirst {
type TSingleValueMerger = FirstVal<V>;
fn new_value(&mut self, v: &V) -> FirstVal<V> {
FirstVal(v.clone())
}
}
impl<V> SingleValueMerger<V> for FirstVal<V> {
fn add(&mut self, _: &V) {}
fn finish(self) -> V {
self.0
}
}
pub struct VoidMerge;
impl ValueMerger<()> for VoidMerge {
type TSingleValueMerger = ();
fn new_value(&mut self, _: &()) -> () {
()
}
}
pub struct U64Merge;
impl ValueMerger<u64> for U64Merge {
type TSingleValueMerger = u64;
fn new_value(&mut self, val: &u64) -> u64 {
*val
}
}
impl SingleValueMerger<u64> for u64 {
fn add(&mut self, val: &u64) {
*self += *val;
}
fn finish(self) -> u64 {
self
}
}
impl SingleValueMerger<()> for () {
fn add(&mut self, _: &()) {}
fn finish(self) -> () {
()
}
}
#[cfg(test)]
mod tests {
use super::super::SSTable;
use super::super::{SSTableMonotonicU64, VoidSSTable};
use super::U64Merge;
use super::VoidMerge;
use std::collections::{BTreeMap, BTreeSet};
use std::str;
fn write_sstable(keys: &[&'static str]) -> Vec<u8> {
let mut buffer: Vec<u8> = vec![];
{
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
for &key in keys {
assert!(sstable_writer.write(key.as_bytes(), &()).is_ok());
}
assert!(sstable_writer.finalize().is_ok());
}
dbg!(&buffer);
buffer
}
fn write_sstable_u64(keys: &[(&'static str, u64)]) -> Vec<u8> {
let mut buffer: Vec<u8> = vec![];
{
let mut sstable_writer = SSTableMonotonicU64::writer(&mut buffer);
for (key, val) in keys {
assert!(sstable_writer.write(key.as_bytes(), val).is_ok());
}
assert!(sstable_writer.finalize().is_ok());
}
buffer
}
fn merge_test_aux(arrs: &[&[&'static str]]) {
let sstables = arrs.iter().cloned().map(write_sstable).collect::<Vec<_>>();
let sstables_ref: Vec<&[u8]> = sstables.iter().map(|s| s.as_ref()).collect();
let mut merged = BTreeSet::new();
for &arr in arrs.iter() {
for &s in arr {
merged.insert(s.to_string());
}
}
let mut w = Vec::new();
assert!(VoidSSTable::merge(sstables_ref, &mut w, VoidMerge).is_ok());
let mut reader = VoidSSTable::reader(&w[..]);
for k in merged {
assert!(reader.advance().unwrap());
assert_eq!(reader.key(), k.as_bytes());
}
assert!(!reader.advance().unwrap());
}
fn merge_test_u64_monotonic_aux(arrs: &[&[(&'static str, u64)]]) {
let sstables = arrs
.iter()
.cloned()
.map(write_sstable_u64)
.collect::<Vec<_>>();
let sstables_ref: Vec<&[u8]> = sstables.iter().map(|s| s.as_ref()).collect();
let mut merged = BTreeMap::new();
for &arr in arrs.iter() {
for (key, val) in arr {
let entry = merged.entry(key.to_string()).or_insert(0u64);
*entry += val;
}
}
let mut w = Vec::new();
assert!(SSTableMonotonicU64::merge(sstables_ref, &mut w, U64Merge).is_ok());
let mut reader = SSTableMonotonicU64::reader(&w[..]);
for (k, v) in merged {
assert!(reader.advance().unwrap());
assert_eq!(reader.key(), k.as_bytes());
assert_eq!(reader.value(), &v);
}
assert!(!reader.advance().unwrap());
}
#[test]
fn test_merge_simple_reproduce() {
let sstable_data = write_sstable(&["a"]);
let mut reader = VoidSSTable::reader(&sstable_data[..]);
assert!(reader.advance().unwrap());
assert_eq!(reader.key(), b"a");
assert!(!reader.advance().unwrap());
}
#[test]
fn test_merge() {
merge_test_aux(&[]);
merge_test_aux(&[&["a"]]);
merge_test_aux(&[&["a", "b"], &["ab"]]); // a, ab, b
merge_test_aux(&[&["a", "b"], &["a", "b"]]);
merge_test_aux(&[
&["happy", "hello", "payer", "tax"],
&["habitat", "hello", "zoo"],
&[],
&["a"],
]);
merge_test_aux(&[&["a"]]);
merge_test_aux(&[&["a", "b"], &["ab"]]);
merge_test_aux(&[&["a", "b"], &["a", "b"]]);
}
#[test]
fn test_merge_u64() {
merge_test_u64_monotonic_aux(&[]);
merge_test_u64_monotonic_aux(&[&[("a", 1u64)]]);
merge_test_u64_monotonic_aux(&[&[("a", 1u64), ("b", 3u64)], &[("ab", 2u64)]]); // a, ab, b
merge_test_u64_monotonic_aux(&[&[("a", 1u64), ("b", 2u64)], &[("a", 16u64), ("b", 23u64)]]);
}
}

View File

@@ -1,365 +0,0 @@
use merge::ValueMerger;
use std::io::{self, Write};
use std::usize;
mod delta;
pub mod merge;
pub mod value;
pub(crate) mod sstable_index;
pub(crate) use self::sstable_index::{SSTableIndex, SSTableIndexBuilder};
pub(crate) mod vint;
mod block_reader;
pub use self::delta::DeltaReader;
use self::delta::DeltaWriter;
use self::value::{U64MonotonicReader, U64MonotonicWriter, ValueReader, ValueWriter};
pub use self::block_reader::BlockReader;
pub use self::merge::VoidMerge;
const DEFAULT_KEY_CAPACITY: usize = 50;
pub(crate) fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
left.iter()
.cloned()
.zip(right.iter().cloned())
.take_while(|(left, right)| left == right)
.count()
}
pub trait SSTable: Sized {
type Value;
type Reader: ValueReader<Value = Self::Value>;
type Writer: ValueWriter<Value = Self::Value>;
fn delta_writer<W: io::Write>(write: W) -> DeltaWriter<W, Self::Writer> {
DeltaWriter::new(write)
}
fn writer<W: io::Write>(write: W) -> Writer<W, Self::Writer> {
Writer {
previous_key: Vec::with_capacity(DEFAULT_KEY_CAPACITY),
num_terms: 0u64,
index_builder: SSTableIndexBuilder::default(),
delta_writer: Self::delta_writer(write),
first_ordinal_of_the_block: 0u64,
}
}
fn delta_reader<'a, R: io::Read + 'a>(reader: R) -> DeltaReader<'a, Self::Reader> {
DeltaReader::new(reader)
}
fn reader<'a, R: io::Read + 'a>(reader: R) -> Reader<'a, Self::Reader> {
Reader {
key: Vec::with_capacity(DEFAULT_KEY_CAPACITY),
delta_reader: Self::delta_reader(reader),
}
}
fn merge<R: io::Read, W: io::Write, M: ValueMerger<Self::Value>>(
io_readers: Vec<R>,
w: W,
merger: M,
) -> io::Result<()> {
let readers: Vec<_> = io_readers.into_iter().map(Self::reader).collect();
let writer = Self::writer(w);
merge::merge_sstable::<Self, _, _>(readers, writer, merger)
}
}
pub struct VoidSSTable;
impl SSTable for VoidSSTable {
type Value = ();
type Reader = value::VoidReader;
type Writer = value::VoidWriter;
}
pub struct SSTableMonotonicU64;
impl SSTable for SSTableMonotonicU64 {
type Value = u64;
type Reader = U64MonotonicReader;
type Writer = U64MonotonicWriter;
}
pub struct Reader<'a, TValueReader> {
key: Vec<u8>,
delta_reader: DeltaReader<'a, TValueReader>,
}
impl<'a, TValueReader> Reader<'a, TValueReader>
where
TValueReader: ValueReader,
{
pub fn advance(&mut self) -> io::Result<bool> {
if !self.delta_reader.advance()? {
return Ok(false);
}
let common_prefix_len = self.delta_reader.common_prefix_len();
let suffix = self.delta_reader.suffix();
let new_len = self.delta_reader.common_prefix_len() + suffix.len();
self.key.resize(new_len, 0u8);
self.key[common_prefix_len..].copy_from_slice(suffix);
Ok(true)
}
pub fn key(&self) -> &[u8] {
&self.key
}
pub fn value(&self) -> &TValueReader::Value {
self.delta_reader.value()
}
pub(crate) fn into_delta_reader(self) -> DeltaReader<'a, TValueReader> {
assert!(self.key.is_empty());
self.delta_reader
}
}
impl<'a, TValueReader> AsRef<[u8]> for Reader<'a, TValueReader> {
fn as_ref(&self) -> &[u8] {
&self.key
}
}
pub struct Writer<W, TValueWriter>
where
W: io::Write,
{
previous_key: Vec<u8>,
index_builder: SSTableIndexBuilder,
delta_writer: DeltaWriter<W, TValueWriter>,
num_terms: u64,
first_ordinal_of_the_block: u64,
}
impl<W, TValueWriter> Writer<W, TValueWriter>
where
W: io::Write,
TValueWriter: value::ValueWriter,
{
pub(crate) fn current_key(&self) -> &[u8] {
&self.previous_key[..]
}
pub fn write_key(&mut self, key: &[u8]) {
let keep_len = common_prefix_len(&self.previous_key, key);
let add_len = key.len() - keep_len;
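// Keys must be written in strictly increasing order: either the new key
// strictly extends the previous one, or it diverges at `keep_len` with a
// larger byte (an empty previous key accepts anything).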
let increasing_keys = add_len > 0 && (self.previous_key.len() == keep_len)
|| self.previous_key.is_empty()
|| self.previous_key[keep_len] < key[keep_len];
assert!(
increasing_keys,
"Keys should be increasing. ({:?} > {:?})",
self.previous_key, key
);
self.previous_key.resize(key.len(), 0u8);
self.previous_key[keep_len..].copy_from_slice(&key[keep_len..]);
self.delta_writer.write_suffix(keep_len, &key[keep_len..]);
}
pub(crate) fn into_delta_writer(self) -> DeltaWriter<W, TValueWriter> {
self.delta_writer
}
pub fn write(&mut self, key: &[u8], value: &TValueWriter::Value) -> io::Result<()> {
self.write_key(key);
self.write_value(value)?;
Ok(())
}
pub fn write_value(&mut self, value: &TValueWriter::Value) -> io::Result<()> {
self.delta_writer.write_value(value);
self.num_terms += 1u64;
self.flush_block_if_required()
}
pub fn flush_block_if_required(&mut self) -> io::Result<()> {
if let Some((start_offset, end_offset)) = self.delta_writer.flush_block_if_required()? {
self.index_builder.add_block(
&self.previous_key[..],
start_offset,
end_offset,
self.first_ordinal_of_the_block,
);
self.first_ordinal_of_the_block = self.num_terms;
self.previous_key.clear();
}
Ok(())
}
pub fn finalize(mut self) -> io::Result<W> {
if let Some((start_offset, end_offset)) = self.delta_writer.flush_block()? {
self.index_builder.add_block(
&self.previous_key[..],
start_offset,
end_offset,
self.first_ordinal_of_the_block,
);
self.first_ordinal_of_the_block = self.num_terms;
}
let mut wrt = self.delta_writer.finalize();
wrt.write_all(&0u32.to_le_bytes())?;
let offset = wrt.written_bytes();
self.index_builder.serialize(&mut wrt)?;
wrt.write_all(&offset.to_le_bytes())?;
wrt.write_all(&self.num_terms.to_le_bytes())?;
let wrt = wrt.finish();
Ok(wrt.into_inner()?)
}
}
#[cfg(test)]
mod test {
use std::io;
use super::SSTable;
use super::VoidMerge;
use super::VoidSSTable;
use super::{common_prefix_len, SSTableMonotonicU64};
fn aux_test_common_prefix_len(left: &str, right: &str, expect_len: usize) {
assert_eq!(
common_prefix_len(left.as_bytes(), right.as_bytes()),
expect_len
);
assert_eq!(
common_prefix_len(right.as_bytes(), left.as_bytes()),
expect_len
);
}
#[test]
fn test_common_prefix_len() {
aux_test_common_prefix_len("a", "ab", 1);
aux_test_common_prefix_len("", "ab", 0);
aux_test_common_prefix_len("ab", "abc", 2);
aux_test_common_prefix_len("abde", "abce", 2);
}
#[test]
fn test_long_key_diff() {
let long_key = (0..1_024).map(|x| (x % 255) as u8).collect::<Vec<_>>();
let long_key2 = (1..300).map(|x| (x % 255) as u8).collect::<Vec<_>>();
let mut buffer = vec![];
{
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
assert!(sstable_writer.write(&long_key[..], &()).is_ok());
assert!(sstable_writer.write(&[0, 3, 4], &()).is_ok());
assert!(sstable_writer.write(&long_key2[..], &()).is_ok());
assert!(sstable_writer.finalize().is_ok());
}
let mut sstable_reader = VoidSSTable::reader(&buffer[..]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &long_key[..]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &[0, 3, 4]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &long_key2[..]);
assert!(!sstable_reader.advance().unwrap());
}
#[test]
fn test_simple_sstable() {
let mut buffer = vec![];
{
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
assert!(sstable_writer.write(&[17u8], &()).is_ok());
assert!(sstable_writer.write(&[17u8, 18u8, 19u8], &()).is_ok());
assert!(sstable_writer.write(&[17u8, 20u8], &()).is_ok());
assert!(sstable_writer.finalize().is_ok());
}
assert_eq!(
&buffer,
&[
// block len
7u8, 0u8, 0u8, 0u8, // keep 0 push 1 | ""
16u8, 17u8, // keep 1 push 2 | 18 19
33u8, 18u8, 19u8, // keep 1 push 1 | 20
17u8, 20u8, 0u8, 0u8, 0u8, 0u8, // no more blocks
// index
161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 104, 108, 97, 115, 116, 95, 107,
101, 121, 130, 17, 20, 106, 98, 108, 111, 99, 107, 95, 97, 100, 100, 114, 163, 108,
115, 116, 97, 114, 116, 95, 111, 102, 102, 115, 101, 116, 0, 106, 101, 110, 100,
95, 111, 102, 102, 115, 101, 116, 11, 109, 102, 105, 114, 115, 116, 95, 111, 114,
100, 105, 110, 97, 108, 0, 15, 0, 0, 0, 0, 0, 0, 0, // offset for the index
3u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8 // num terms
]
);
let mut sstable_reader = VoidSSTable::reader(&buffer[..]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &[17u8]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &[17u8, 18u8, 19u8]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &[17u8, 20u8]);
assert!(!sstable_reader.advance().unwrap());
}
#[test]
#[should_panic]
fn test_simple_sstable_non_increasing_key() {
let mut buffer = vec![];
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
assert!(sstable_writer.write(&[17u8], &()).is_ok());
assert!(sstable_writer.write(&[16u8], &()).is_ok());
}
#[test]
fn test_merge_abcd_abe() {
let mut buffer = Vec::new();
{
let mut writer = VoidSSTable::writer(&mut buffer);
writer.write(b"abcd", &()).unwrap();
writer.write(b"abe", &()).unwrap();
writer.finalize().unwrap();
}
let mut output = Vec::new();
assert!(VoidSSTable::merge(vec![&buffer[..], &buffer[..]], &mut output, VoidMerge).is_ok());
assert_eq!(&output[..], &buffer[..]);
}
#[test]
fn test_sstable() {
let mut buffer = Vec::new();
{
let mut writer = VoidSSTable::writer(&mut buffer);
writer.write(b"abcd", &()).unwrap();
writer.write(b"abe", &()).unwrap();
writer.finalize().unwrap();
}
let mut output = Vec::new();
assert!(VoidSSTable::merge(vec![&buffer[..], &buffer[..]], &mut output, VoidMerge).is_ok());
assert_eq!(&output[..], &buffer[..]);
}
#[test]
fn test_sstable_u64() -> io::Result<()> {
let mut buffer = Vec::new();
let mut writer = SSTableMonotonicU64::writer(&mut buffer);
writer.write(b"abcd", &1u64)?;
writer.write(b"abe", &4u64)?;
writer.write(b"gogo", &4324234234234234u64)?;
writer.finalize()?;
let mut reader = SSTableMonotonicU64::reader(&buffer[..]);
assert!(reader.advance()?);
assert_eq!(reader.key(), b"abcd");
assert_eq!(reader.value(), &1u64);
assert!(reader.advance()?);
assert_eq!(reader.key(), b"abe");
assert_eq!(reader.value(), &4u64);
assert!(reader.advance()?);
assert_eq!(reader.key(), b"gogo");
assert_eq!(reader.value(), &4324234234234234u64);
assert!(!reader.advance()?);
Ok(())
}
}

View File

@@ -1,90 +0,0 @@
use std::io;
use serde;
use serde::{Deserialize, Serialize};
#[derive(Default, Debug, Serialize, Deserialize)]
pub struct SSTableIndex {
blocks: Vec<BlockMeta>,
}
impl SSTableIndex {
pub fn load(data: &[u8]) -> SSTableIndex {
// TODO
serde_cbor::de::from_slice(data).unwrap()
}
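// Blocks are ordered by `last_key`, so the first block whose last key is
// >= `key` is the only block that may contain it.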
pub fn search(&self, key: &[u8]) -> Option<BlockAddr> {
self.blocks
.iter()
.find(|block| &block.last_key[..] >= &key)
.map(|block| block.block_addr)
}
}
#[derive(Clone, Eq, PartialEq, Debug, Copy, Serialize, Deserialize)]
pub struct BlockAddr {
pub start_offset: u64,
pub end_offset: u64,
pub first_ordinal: u64,
}
#[derive(Debug, Serialize, Deserialize)]
struct BlockMeta {
pub last_key: Vec<u8>,
pub block_addr: BlockAddr,
}
#[derive(Default)]
pub struct SSTableIndexBuilder {
index: SSTableIndex,
}
impl SSTableIndexBuilder {
pub fn add_block(
&mut self,
last_key: &[u8],
start_offset: u64,
stop_offset: u64,
first_ordinal: u64,
) {
self.index.blocks.push(BlockMeta {
last_key: last_key.to_vec(),
block_addr: BlockAddr {
start_offset,
end_offset: stop_offset,
first_ordinal,
},
})
}
pub fn serialize(&self, wrt: &mut dyn io::Write) -> io::Result<()> {
serde_cbor::ser::to_writer(wrt, &self.index).unwrap();
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::{BlockAddr, SSTableIndex, SSTableIndexBuilder};
#[test]
fn test_sstable_index() {
let mut sstable_builder = SSTableIndexBuilder::default();
sstable_builder.add_block(b"aaa", 10u64, 20u64, 0u64);
sstable_builder.add_block(b"bbbbbbb", 20u64, 30u64, 564);
sstable_builder.add_block(b"ccc", 30u64, 40u64, 10u64);
sstable_builder.add_block(b"dddd", 40u64, 50u64, 15u64);
let mut buffer: Vec<u8> = Vec::new();
sstable_builder.serialize(&mut buffer).unwrap();
let sstable = SSTableIndex::load(&buffer[..]);
assert_eq!(
sstable.search(b"bbbde"),
Some(BlockAddr {
first_ordinal: 10u64,
start_offset: 30u64,
end_offset: 40u64
})
);
}
}

View File

@@ -1,94 +0,0 @@
use super::{vint, BlockReader};
use std::io;
pub trait ValueReader: Default {
type Value;
fn value(&self, idx: usize) -> &Self::Value;
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()>;
}
pub trait ValueWriter: Default {
type Value;
fn write(&mut self, val: &Self::Value);
fn write_block(&mut self, writer: &mut Vec<u8>);
}
#[derive(Default)]
pub struct VoidReader;
impl ValueReader for VoidReader {
type Value = ();
fn value(&self, _idx: usize) -> &() {
&()
}
fn read(&mut self, _reader: &mut BlockReader) -> io::Result<()> {
Ok(())
}
}
#[derive(Default)]
pub struct VoidWriter;
impl ValueWriter for VoidWriter {
type Value = ();
fn write(&mut self, _val: &()) {}
fn write_block(&mut self, _writer: &mut Vec<u8>) {}
}
#[derive(Default)]
pub struct U64MonotonicWriter {
vals: Vec<u64>,
}
impl ValueWriter for U64MonotonicWriter {
type Value = u64;
fn write(&mut self, val: &Self::Value) {
self.vals.push(*val);
}
fn write_block(&mut self, writer: &mut Vec<u8>) {
let mut prev_val = 0u64;
vint::serialize_into_vec(self.vals.len() as u64, writer);
for &val in &self.vals {
let delta = val - prev_val;
vint::serialize_into_vec(delta, writer);
prev_val = val;
}
self.vals.clear();
}
}
#[derive(Default)]
pub struct U64MonotonicReader {
vals: Vec<u64>,
}
impl ValueReader for U64MonotonicReader {
type Value = u64;
fn value(&self, idx: usize) -> &Self::Value {
&self.vals[idx]
}
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()> {
let len = reader.deserialize_u64() as usize;
self.vals.clear();
let mut prev_val = 0u64;
for _ in 0..len {
let delta = reader.deserialize_u64() as u64;
let val = prev_val + delta;
self.vals.push(val);
prev_val = val;
}
Ok(())
}
}

View File

@@ -1,74 +0,0 @@
use super::BlockReader;
const CONTINUE_BIT: u8 = 128u8;
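// Classic LEB128-style varint: each byte carries 7 bits of the value,
// least-significant group first; the high bit marks that more bytes follow.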
pub fn serialize(mut val: u64, buffer: &mut [u8]) -> usize {
for (i, b) in buffer.iter_mut().enumerate() {
let next_byte: u8 = (val & 127u64) as u8;
val = val >> 7;
if val == 0u64 {
*b = next_byte;
return i + 1;
} else {
*b = next_byte | CONTINUE_BIT;
}
}
10 //< actually unreachable
}
pub fn serialize_into_vec(val: u64, buffer: &mut Vec<u8>) {
let mut buf = [0u8; 10];
let num_bytes = serialize(val, &mut buf[..]);
buffer.extend_from_slice(&buf[..num_bytes]);
}
// super slow but we don't care
pub fn deserialize_read(buf: &[u8]) -> (usize, u64) {
let mut result = 0u64;
let mut shift = 0u64;
let mut consumed = 0;
for &b in buf {
consumed += 1;
result |= u64::from(b % 128u8) << shift;
if b < CONTINUE_BIT {
break;
}
shift += 7;
}
(consumed, result)
}
pub fn deserialize_from_block(block: &mut BlockReader) -> u64 {
let (num_bytes, val) = deserialize_read(block.buffer());
block.advance(num_bytes);
val
}
#[cfg(test)]
mod tests {
use super::{deserialize_read, serialize};
use std::u64;
fn aux_test_int(val: u64, expect_len: usize) {
let mut buffer = [0u8; 14];
assert_eq!(serialize(val, &mut buffer[..]), expect_len);
assert_eq!(deserialize_read(&buffer), (expect_len, val));
}
#[test]
fn test_vint() {
aux_test_int(0u64, 1);
aux_test_int(17u64, 1);
aux_test_int(127u64, 1);
aux_test_int(128u64, 2);
aux_test_int(123423418u64, 4);
for i in 1..63 {
let power_of_two = 1u64 << i;
aux_test_int(power_of_two + 1, (i / 7) + 1);
aux_test_int(power_of_two, (i / 7) + 1);
aux_test_int(power_of_two - 1, ((i - 1) / 7) + 1);
}
aux_test_int(u64::MAX, 10);
}
}

View File

@@ -1,227 +0,0 @@
use super::TermDictionary;
use crate::postings::TermInfo;
use crate::termdict::sstable_termdict::TermInfoReader;
use crate::termdict::TermOrdinal;
use std::io;
use std::ops::Bound;
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::Automaton;
/// `TermStreamerBuilder` is a helper object used to define
/// a range of terms that should be streamed.
pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
where
A: Automaton,
A::State: Clone,
{
term_dict: &'a TermDictionary,
automaton: A,
lower: Bound<Vec<u8>>,
upper: Bound<Vec<u8>>,
}
impl<'a, A> TermStreamerBuilder<'a, A>
where
A: Automaton,
A::State: Clone,
{
pub(crate) fn new(term_dict: &'a TermDictionary, automaton: A) -> Self {
TermStreamerBuilder {
term_dict,
automaton,
lower: Bound::Unbounded,
upper: Bound::Unbounded,
}
}
/// Limit the range to terms greater or equal to the bound
pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.lower = Bound::Included(bound.as_ref().to_owned());
self
}
/// Limit the range to terms strictly greater than the bound
pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.lower = Bound::Excluded(bound.as_ref().to_owned());
self
}
/// Limit the range to terms lesser or equal to the bound
pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.upper = Bound::Included(bound.as_ref().to_owned());
self
}
/// Limit the range to terms strictly lesser than the bound
pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.upper = Bound::Excluded(bound.as_ref().to_owned());
self
}
pub fn backward(mut self) -> Self {
unimplemented!()
}
/// Creates the stream corresponding to the range
/// of terms defined using the `TermStreamerBuilder`.
pub fn into_stream(self) -> io::Result<TermStreamer<'a, A>> {
let start_state = self.automaton.start();
let delta_reader = self.term_dict.sstable_delta_reader()?;
Ok(TermStreamer {
automaton: self.automaton,
states: vec![start_state],
delta_reader,
key: Vec::new(),
term_ord: 0u64,
})
}
}
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
/// Terms are guaranteed to be sorted.
pub struct TermStreamer<'a, A = AlwaysMatch>
where
A: Automaton,
A::State: Clone,
{
automaton: A,
states: Vec<A::State>,
delta_reader: super::sstable::DeltaReader<'a, TermInfoReader>,
key: Vec<u8>,
term_ord: TermOrdinal,
}
impl<'a, A> TermStreamer<'a, A>
where
A: Automaton,
A::State: Clone,
{
/// Advances the stream to the next item.
/// Before the first call to `.advance()`, the stream
/// is in an uninitialized state.
pub fn advance(&mut self) -> bool {
while self.delta_reader.advance().unwrap() {
self.term_ord += 1u64;
let common_prefix_len = self.delta_reader.common_prefix_len();
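// `states[i]` caches the automaton state reached after feeding key[..i],
// so only the new suffix has to be stepped through the automaton.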
self.states.truncate(common_prefix_len + 1);
self.key.truncate(common_prefix_len);
let mut state: A::State = self.states.last().unwrap().clone();
for &b in self.delta_reader.suffix() {
state = self.automaton.accept(&state, b);
self.states.push(state.clone());
}
self.key.extend_from_slice(self.delta_reader.suffix());
if self.automaton.is_match(&state) {
return true;
}
}
false
}
/// Returns the `TermOrdinal` of the current term.
///
/// May panic if `.advance()` has never
/// been called before.
pub fn term_ord(&self) -> TermOrdinal {
self.term_ord
}
/// Accesses the current key.
///
/// `.key()` should return the key that was returned
/// by the `.next()` method.
///
/// If the end of the stream has been reached, and `.next()`
/// has been called and returned `None`, `.key()` keeps
/// returning the last key encountered.
///
/// Before any call to `.next()`, `.key()` returns an empty array.
pub fn key(&self) -> &[u8] {
&self.key
}
/// Accesses the current value.
///
/// Calling `.value()` after the end of the stream will return the
/// last `.value()` encountered.
///
/// # Panics
///
/// Calling `.value()` before the first call to `.advance()`
/// may panic.
pub fn value(&self) -> &TermInfo {
self.delta_reader.value()
}
/// Return the next `(key, value)` pair.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
if self.advance() {
Some((self.key(), self.value()))
} else {
None
}
}
}
#[cfg(test)]
mod tests {
use super::super::TermDictionary;
use crate::directory::OwnedBytes;
use crate::postings::TermInfo;
fn make_term_info(i: u64) -> TermInfo {
TermInfo {
doc_freq: 1000u32 + i as u32,
positions_idx: i * 500,
postings_start_offset: (i + 10) * (i * 10),
postings_stop_offset: ((i + 1) + 10) * ((i + 1) * 10),
}
}
fn create_test_term_dictionary() -> crate::Result<TermDictionary> {
let mut term_dict_builder = super::super::TermDictionaryBuilder::create(Vec::new())?;
term_dict_builder.insert(b"abaisance", &make_term_info(0u64))?;
term_dict_builder.insert(b"abalation", &make_term_info(1u64))?;
term_dict_builder.insert(b"abalienate", &make_term_info(2u64))?;
term_dict_builder.insert(b"abandon", &make_term_info(3u64))?;
let buffer = term_dict_builder.finish()?;
let owned_bytes = OwnedBytes::new(buffer);
TermDictionary::from_bytes(owned_bytes)
}
#[test]
fn test_sstable_stream() -> crate::Result<()> {
let term_dict = create_test_term_dictionary()?;
let mut term_streamer = term_dict.stream()?;
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abaisance");
assert_eq!(term_streamer.value().doc_freq, 1000u32);
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abalation");
assert_eq!(term_streamer.value().doc_freq, 1001u32);
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abalienate");
assert_eq!(term_streamer.value().doc_freq, 1002u32);
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abandon");
assert_eq!(term_streamer.value().doc_freq, 1003u32);
assert!(!term_streamer.advance());
Ok(())
}
#[test]
fn test_sstable_search() -> crate::Result<()> {
let term_dict = create_test_term_dictionary()?;
let ptn = tantivy_fst::Regex::new("ab.*t.*").unwrap();
let mut term_streamer = term_dict.search(ptn).into_stream()?;
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abalation");
assert_eq!(term_streamer.value().doc_freq, 1001u32);
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abalienate");
assert_eq!(term_streamer.value().doc_freq, 1002u32);
assert!(!term_streamer.advance());
Ok(())
}
}

View File

@@ -1,228 +0,0 @@
use std::io;
use crate::common::BinarySerializable;
use crate::directory::{FileSlice, OwnedBytes};
use crate::postings::TermInfo;
use crate::termdict::sstable_termdict::sstable::sstable_index::BlockAddr;
use crate::termdict::sstable_termdict::sstable::Writer;
use crate::termdict::sstable_termdict::sstable::{DeltaReader, SSTable};
use crate::termdict::sstable_termdict::sstable::{Reader, SSTableIndex};
use crate::termdict::sstable_termdict::{
TermInfoReader, TermInfoWriter, TermSSTable, TermStreamer, TermStreamerBuilder,
};
use crate::termdict::TermOrdinal;
use crate::HasLen;
use once_cell::sync::Lazy;
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::Automaton;
pub struct TermInfoSSTable;
impl SSTable for TermInfoSSTable {
type Value = TermInfo;
type Reader = TermInfoReader;
type Writer = TermInfoWriter;
}
pub struct TermDictionaryBuilder<W: io::Write> {
sstable_writer: Writer<W, TermInfoWriter>,
}
impl<W: io::Write> TermDictionaryBuilder<W> {
/// Creates a new `TermDictionaryBuilder`
pub fn create(w: W) -> io::Result<Self> {
let sstable_writer = TermSSTable::writer(w);
Ok(TermDictionaryBuilder { sstable_writer })
}
/// Inserts a `(key, value)` pair in the term dictionary.
///
/// *Keys have to be inserted in order.*
pub fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
let key = key_ref.as_ref();
self.insert_key(key)?;
self.insert_value(value)?;
Ok(())
}
/// # Warning
/// Horribly dangerous internal API
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
/// Prefer using `.insert(key, value)`
pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
self.sstable_writer.write_key(key);
Ok(())
}
/// # Warning
///
/// Horribly dangerous internal API. See `.insert_key(...)`.
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
self.sstable_writer.write_value(term_info);
Ok(())
}
/// Finalize writing the builder, and returns the underlying
/// `Write` object.
pub fn finish(self) -> io::Result<W> {
self.sstable_writer.finalize()
}
}
static EMPTY_TERM_DICT_FILE: Lazy<FileSlice> = Lazy::new(|| {
let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new())
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");
FileSlice::from(term_dictionary_data)
});
/// The term dictionary contains all of the terms of a
/// tantivy index, in sorted order.
///
/// It is implemented as an SSTable: terms are stored in sorted
/// blocks together with their `TermInfo`, and a small block index
/// makes it possible to locate the block containing a given term.
pub struct TermDictionary {
sstable_slice: FileSlice,
sstable_index: SSTableIndex,
num_terms: u64,
}
impl TermDictionary {
pub(crate) fn sstable_reader(&self) -> io::Result<Reader<'static, TermInfoReader>> {
let data = self.sstable_slice.read_bytes()?;
Ok(TermInfoSSTable::reader(data))
}
pub(crate) fn sstable_reader_block(
&self,
block_addr: BlockAddr,
) -> io::Result<Reader<'static, TermInfoReader>> {
let data = self.sstable_slice.read_bytes_slice(
block_addr.start_offset as usize,
block_addr.end_offset as usize,
)?;
Ok(TermInfoSSTable::reader(data))
}
pub(crate) fn sstable_delta_reader(&self) -> io::Result<DeltaReader<'static, TermInfoReader>> {
let data = self.sstable_slice.read_bytes()?;
Ok(TermInfoSSTable::delta_reader(data))
}
/// Opens a `TermDictionary`.
pub fn open(term_dictionary_file: FileSlice) -> crate::Result<Self> {
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(16);
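// The 16-byte footer stores the offset at which the serialized index
// starts (u64, little-endian) followed by the total number of terms (u64).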
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
let index_offset = u64::deserialize(&mut footer_len_bytes)?;
let num_terms = u64::deserialize(&mut footer_len_bytes)?;
let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
let sstable_index_bytes = index_slice.read_bytes()?;
let sstable_index = SSTableIndex::load(sstable_index_bytes.as_slice());
Ok(TermDictionary {
sstable_slice,
sstable_index,
num_terms,
})
}
pub fn from_bytes(owned_bytes: OwnedBytes) -> crate::Result<TermDictionary> {
TermDictionary::open(FileSlice::new(Box::new(owned_bytes)))
}
/// Creates an empty term dictionary which contains no terms.
pub fn empty() -> Self {
TermDictionary::open(EMPTY_TERM_DICT_FILE.clone()).unwrap()
}
/// Returns the number of terms in the dictionary.
/// Term ordinals range from 0 to `num_terms() - 1`.
pub fn num_terms(&self) -> usize {
self.num_terms as usize
}
/// Returns the ordinal associated to a given term.
pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
let mut term_ord = 0u64;
let key_bytes = key.as_ref();
let mut sstable_reader = self.sstable_reader()?;
while sstable_reader.advance().unwrap_or(false) {
if sstable_reader.key() == key_bytes {
return Ok(Some(term_ord));
}
term_ord += 1;
}
Ok(None)
}
/// Returns the term associated to a given term ordinal.
///
/// Term ordinals are defined as the position of the term in
/// the sorted list of terms.
///
/// Returns true iff the term has been found.
///
/// Regardless of whether the term is found or not,
/// the buffer may be modified.
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
let mut sstable_reader = self.sstable_reader()?;
bytes.clear();
for _ in 0..(ord + 1) {
if !sstable_reader.advance().unwrap_or(false) {
return Ok(false);
}
}
bytes.extend_from_slice(sstable_reader.key());
Ok(true)
}
/// Returns the `TermInfo` associated with the given `TermOrdinal`.
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<TermInfo> {
let mut sstable_reader = self.sstable_reader()?;
for _ in 0..(term_ord + 1) {
if !sstable_reader.advance().unwrap_or(false) {
return Ok(TermInfo::default());
}
}
Ok(sstable_reader.value().clone())
}
/// Lookups the value corresponding to the key.
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermInfo>> {
if let Some(block_addr) = self.sstable_index.search(key.as_ref()) {
let mut sstable_reader = self.sstable_reader_block(block_addr)?;
let key_bytes = key.as_ref();
while sstable_reader.advance().unwrap_or(false) {
if sstable_reader.key() == key_bytes {
let term_info = sstable_reader.value().clone();
return Ok(Some(term_info));
}
}
}
Ok(None)
}
/// Returns a range builder, to stream all of the terms
/// within an interval.
pub fn range(&self) -> TermStreamerBuilder<'_> {
TermStreamerBuilder::new(self, AlwaysMatch)
}
/// A stream of all the sorted terms.
pub fn stream(&self) -> io::Result<TermStreamer<'_>> {
self.range().into_stream()
}
/// Returns a search builder, to stream all of the terms
/// matching the given `Automaton`.
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A>
where
A::State: Clone,
{
TermStreamerBuilder::<A>::new(self, automaton)
}
}

View File

@@ -249,8 +249,7 @@ fn test_empty_string() -> crate::Result<()> {
Ok(())
}
#[test]
fn test_stream_range_boundaries() -> crate::Result<()> {
fn stream_range_test_dict() -> crate::Result<TermDictionary> {
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
for i in 0u8..10u8 {
@@ -260,84 +259,96 @@ fn test_stream_range_boundaries() -> crate::Result<()> {
term_dictionary_builder.finish()?
};
let file = FileSlice::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
TermDictionary::open(file)
}
let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| {
#[test]
fn test_stream_range_boundaries_forward() -> crate::Result<()> {
let term_dictionary = stream_range_test_dict()?;
let value_list = |mut streamer: TermStreamer<'_>| {
let mut res: Vec<u32> = vec![];
while let Some((_, ref v)) = streamer.next() {
res.push(v.doc_freq);
}
if backwards {
res.reverse();
}
res
};
{
let range = term_dictionary.range().backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().ge([2u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().ge([2u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
value_list(range),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
value_list(range),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
);
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]);
}
{
let range = term_dictionary.range().le([6u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary.range().le([6u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
value_list(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream()?;
assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
}
Ok(())
}
#[test]
fn test_stream_range_boundaries_backward() -> crate::Result<()> {
let term_dictionary = stream_range_test_dict()?;
let value_list_backward = |mut streamer: TermStreamer<'_>| {
let mut res: Vec<u32> = vec![];
while let Some((_, ref v)) = streamer.next() {
res.push(v.doc_freq);
}
res.reverse();
res
};
{
let range = term_dictionary.range().backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().ge([2u8]).backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
);
}
{
let range = term_dictionary.range().le([6u8]).backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary
@@ -346,11 +357,38 @@ fn test_stream_range_boundaries() -> crate::Result<()> {
.lt([5u8])
.backward()
.into_stream()?;
assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
assert_eq!(
value_list_backward(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32]
);
}
Ok(())
}
#[test]
fn test_ord_to_term() -> crate::Result<()> {
let termdict = stream_range_test_dict()?;
let mut bytes = vec![];
for b in 0u8..10u8 {
termdict.ord_to_term(b as u64, &mut bytes)?;
assert_eq!(&bytes, &[b]);
}
Ok(())
}
#[test]
fn test_stream_term_ord() -> crate::Result<()> {
let termdict = stream_range_test_dict()?;
let mut stream = termdict.stream()?;
for b in 0u8..10u8 {
assert!(stream.advance());
assert_eq!(stream.term_ord(), b as u64);
assert_eq!(stream.key(), &[b]);
}
assert!(!stream.advance());
Ok(())
}
#[test]
fn test_automaton_search() -> crate::Result<()> {
use crate::query::DFAWrapper;

View File

@@ -2,16 +2,16 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(RawTokenizer)
//! .filter(AlphaNumOnlyFilter);
//! let tokenizer = analyzer_builder(RawTokenizer)
//! .filter(AlphaNumOnlyFilter).build();
//!
//! let mut stream = tokenizer.token_stream("hello there");
//! // is none because the raw tokenizer emits a single token that
//! // contains a space, which the filter rejects
//! assert!(stream.next().is_none());
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter);
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter).build();
//!
//! let mut stream = tokenizer.token_stream("hello there 💣");
//! assert!(stream.next().is_some());
@@ -19,45 +19,18 @@
//! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none());
//! ```
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter};
/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.
#[derive(Clone)]
#[derive(Clone, Debug, Default)]
pub struct AlphaNumOnlyFilter;
pub struct AlphaNumOnlyFilterStream<'a> {
tail: BoxTokenStream<'a>,
}
impl<'a> AlphaNumOnlyFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
token.text.chars().all(|c| c.is_ascii_alphanumeric())
}
}
impl TokenFilter for AlphaNumOnlyFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream })
}
}
impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
fn transform(&mut self, token: Token) -> Option<Token> {
if token.text.chars().all(|c| c.is_ascii_alphanumeric()) {
return Some(token);
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
None
}
}

View File
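With the iterator-style API shown above, a token filter boils down to a single `transform(Token) -> Option<Token>` function. The following is a minimal sketch of a custom filter written against the trait shape introduced in this branch; the `MinCharsFilter` name and its `min_chars` field are illustrative only, and the sketch assumes `transform` is the only required method:

use super::{Token, TokenFilter};

/// Hypothetical filter dropping tokens shorter than `min_chars` characters.
#[derive(Clone, Debug, Default)]
pub struct MinCharsFilter {
    min_chars: usize,
}

impl TokenFilter for MinCharsFilter {
    fn transform(&mut self, token: Token) -> Option<Token> {
        if token.text.chars().count() >= self.min_chars {
            Some(token) // keep the token as-is
        } else {
            None // drop it from the stream
        }
    }
}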

@@ -1,45 +1,31 @@
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter};
use std::mem;
/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
#[derive(Clone)]
pub struct AsciiFoldingFilter;
impl TokenFilter for AsciiFoldingFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
From::from(AsciiFoldingFilterTokenStream {
tail: token_stream,
buffer: String::with_capacity(100),
})
}
}
pub struct AsciiFoldingFilterTokenStream<'a> {
#[derive(Clone, Debug, Default)]
pub struct AsciiFolding {
buffer: String,
tail: BoxTokenStream<'a>,
}
impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
impl AsciiFolding {
/// Construct a new `AsciiFolding` filter.
pub fn new() -> Self {
Self {
buffer: String::with_capacity(100),
}
if !self.token_mut().text.is_ascii() {
}
}
impl TokenFilter for AsciiFolding {
fn transform(&mut self, mut token: Token) -> Option<Token> {
if !token.text.is_ascii() {
// only fold when the text is not pure ASCII
to_ascii(&mut self.tail.token_mut().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
to_ascii(&token.text, &mut self.buffer);
mem::swap(&mut token.text, &mut self.buffer);
}
true
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
Some(token)
}
}
@@ -1526,7 +1512,7 @@ fn fold_non_ascii_char(c: char) -> Option<&'static str> {
}
// https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java#L187
fn to_ascii(text: &mut String, output: &mut String) {
fn to_ascii(text: &String, output: &mut String) {
output.clear();
for c in text.chars() {
@@ -1540,11 +1526,8 @@ fn to_ascii(text: &mut String, output: &mut String) {
#[cfg(test)]
mod tests {
use super::to_ascii;
use crate::tokenizer::AsciiFoldingFilter;
use crate::tokenizer::RawTokenizer;
use crate::tokenizer::SimpleTokenizer;
use crate::tokenizer::TextAnalyzer;
use super::super::*;
use super::*;
use std::iter;
#[test]
@@ -1560,22 +1543,22 @@ mod tests {
}
fn folding_helper(text: &str) -> Vec<String> {
let mut tokens = Vec::new();
TextAnalyzer::from(SimpleTokenizer)
.filter(AsciiFoldingFilter)
let tokens = analyzer_builder(SimpleTokenizer)
.filter(AsciiFolding::new())
.build()
.token_stream(text)
.process(&mut |token| {
tokens.push(token.text.clone());
});
.map(|token| token.text.clone())
.collect();
tokens
}
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
let mut token_stream = TextAnalyzer::from(RawTokenizer)
.filter(AsciiFoldingFilter)
let mut token_stream = analyzer_builder(RawTokenizer)
.filter(AsciiFolding::new())
.build()
.token_stream(text);
token_stream.advance();
token_stream.token().text.clone()
let Token { text, .. } = token_stream.next().unwrap();
text
}
#[test]
@@ -1626,9 +1609,9 @@ mod tests {
#[test]
fn test_to_ascii() {
let mut input = "Rámon".to_string();
let input = "Rámon".to_string();
let mut buffer = String::new();
to_ascii(&mut input, &mut buffer);
to_ascii(&input, &mut buffer);
assert_eq!("Ramon", buffer);
}

View File

@@ -1,4 +1,4 @@
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
use super::{Token, Tokenizer};
use crate::schema::FACET_SEP_BYTE;
/// The `FacetTokenizer` process a `Facet` binary representation
@@ -9,72 +9,63 @@ use crate::schema::FACET_SEP_BYTE;
/// - `/america/north_america/canada`
/// - `/america/north_america`
/// - `/america`
#[derive(Clone)]
#[derive(Clone, Debug, Default)]
pub struct FacetTokenizer;
#[derive(Debug)]
#[derive(Clone, Debug)]
enum State {
RootFacetNotEmitted,
UpToPosition(usize), //< we already emitted facet prefix up to &text[..cursor]
Terminated,
}
pub struct FacetTokenStream<'a> {
text: &'a str,
#[derive(Clone, Debug)]
pub struct FacetTokenStream {
text: String,
state: State,
token: Token,
}
impl Tokenizer for FacetTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
type Iter = FacetTokenStream;
fn token_stream(&self, text: &str) -> Self::Iter {
FacetTokenStream {
text,
text: text.to_string(),
state: State::RootFacetNotEmitted, //< the root facet has not been emitted yet.
token: Token::default(),
}
.into()
}
}
impl<'a> TokenStream for FacetTokenStream<'a> {
fn advance(&mut self) -> bool {
match self.state {
impl Iterator for FacetTokenStream {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
self.state = match self.state {
State::RootFacetNotEmitted => {
self.state = if self.text.is_empty() {
if self.text.is_empty() {
State::Terminated
} else {
State::UpToPosition(0)
};
true
}
}
State::UpToPosition(cursor) => {
let bytes: &[u8] = self.text.as_bytes();
if let Some(next_sep_pos) = bytes[cursor + 1..]
if let Some(next_sep_pos) = self.text.as_bytes()[cursor + 1..]
.iter()
.cloned()
.position(|b| b == FACET_SEP_BYTE)
.position(|&b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos)
{
let facet_part = &self.text[cursor..next_sep_pos];
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
State::UpToPosition(next_sep_pos)
} else {
let facet_part = &self.text[cursor..];
self.token.text.push_str(facet_part);
self.state = State::Terminated;
State::Terminated
}
true
}
State::Terminated => false,
}
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
State::Terminated => return None,
};
Some(self.token.clone())
}
}
@@ -83,21 +74,19 @@ mod tests {
use super::FacetTokenizer;
use crate::schema::Facet;
use crate::tokenizer::{Token, Tokenizer};
use crate::tokenizer::Tokenizer;
#[test]
fn test_facet_tokenizer() {
let facet = Facet::from_path(vec!["top", "a", "b"]);
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(facet.encoded_str())
.process(&mut add_token);
}
let tokens: Vec<_> = FacetTokenizer
.token_stream(facet.encoded_str())
.map(|token| {
Facet::from_encoded(token.text.as_bytes().to_owned())
.unwrap()
.to_string()
})
.collect();
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0], "/");
assert_eq!(tokens[1], "/top");
@@ -108,16 +97,14 @@ mod tests {
#[test]
fn test_facet_tokenizer_root_facets() {
let facet = Facet::root();
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(facet.encoded_str()) // ok test
.process(&mut add_token);
}
let tokens: Vec<_> = FacetTokenizer
.token_stream(facet.encoded_str())
.map(|token| {
Facet::from_encoded(token.text.as_bytes().to_owned())
.unwrap()
.to_string()
})
.collect();
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], "/");
}

View File

@@ -1,27 +1,36 @@
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use super::{Token, TokenFilter};
use std::mem;
impl TokenFilter for LowerCaser {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(LowerCaserTokenStream {
tail: token_stream,
buffer: String::with_capacity(100),
})
fn transform(&mut self, mut token: Token) -> Option<Token> {
if token.text.is_ascii() {
// fast track for ascii.
token.text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&token.text, &mut self.buffer);
mem::swap(&mut token.text, &mut self.buffer);
}
Some(token)
}
}
/// Token filter that lowercases terms.
#[derive(Clone)]
pub struct LowerCaser;
pub struct LowerCaserTokenStream<'a> {
#[derive(Clone, Debug, Default)]
pub struct LowerCaser {
buffer: String,
tail: BoxTokenStream<'a>,
}
impl LowerCaser {
/// Initialize the `LowerCaser`
pub fn new() -> Self {
LowerCaser {
buffer: String::with_capacity(100),
}
}
}
// writes a lowercased version of text into output.
fn to_lowercase_unicode(text: &mut String, output: &mut String) {
fn to_lowercase_unicode(text: &String, output: &mut String) {
output.clear();
for c in text.chars() {
// Contrary to the std, we do not take care of sigma special case.
@@ -30,57 +39,31 @@ fn to_lowercase_unicode(text: &mut String, output: &mut String) {
}
}
impl<'a> TokenStream for LowerCaserTokenStream<'a> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
if self.token_mut().text.is_ascii() {
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
#[cfg(test)]
mod tests {
use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
use super::*;
use crate::tokenizer::{analyzer_builder, LowerCaser, SimpleTokenizer, TextAnalyzerT};
#[test]
fn test_to_lower_case() {
assert_eq!(
lowercase_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]
);
assert_eq!(lowercase_helper("Русский текст"), vec!["русский", "текст"]);
}
fn lowercase_helper(text: &str) -> Vec<String> {
let mut tokens = vec![];
let mut token_stream = TextAnalyzer::from(SimpleTokenizer)
.filter(LowerCaser)
.token_stream(text);
while token_stream.advance() {
let token_text = token_stream.token().text.clone();
tokens.push(token_text);
}
tokens
analyzer_builder(SimpleTokenizer)
.filter(LowerCaser::new())
.build()
.token_stream(text)
.map(|token| {
let Token { text, .. } = token;
text
})
.collect()
}
#[test]
fn test_lowercaser() {
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
assert_eq!(lowercase_helper("Tree"), vec!["tree"]);
assert_eq!(lowercase_helper("Русский"), vec!["русский"]);
}
}
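
With the `TokenStream` wrapper gone, `LowerCaser` becomes a plain per-token `transform` that keeps a reusable buffer for the non-ASCII path. A standalone sketch of the same idea (names are illustrative, operating on a bare `String` rather than a `Token`):

```rust
use std::mem;

// A filter owns a scratch buffer, lowercases ASCII text in place, and falls
// back to a char-by-char pass for non-ASCII text, swapping the buffer in to
// avoid a fresh allocation per token.
struct Lowercase {
    buffer: String,
}

impl Lowercase {
    fn transform(&mut self, mut text: String) -> String {
        if text.is_ascii() {
            // fast track for ASCII
            text.make_ascii_lowercase();
        } else {
            self.buffer.clear();
            for c in text.chars() {
                // Like the filter above, this ignores the sigma special case.
                self.buffer.extend(c.to_lowercase());
            }
            mem::swap(&mut text, &mut self.buffer);
        }
        text
    }
}

fn main() {
    let mut lower = Lowercase { buffer: String::with_capacity(100) };
    assert_eq!(lower.transform("Tree".to_string()), "tree");
    assert_eq!(lower.transform("Русский".to_string()), "русский");
}
```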


@@ -64,10 +64,10 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
//! let en_stem = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser)
//! .filter(Stemmer::new(Language::English));
//! .filter(LowerCaser::new())
//! .filter(Stemmer::new(Language::English)).build();
//! ```
//!
//! Once your tokenizer is defined, you need to
@@ -109,9 +109,9 @@
//! let index = Index::create_in_ram(schema);
//!
//! // We need to register our tokenizer :
//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! let custom_en_tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser);
//! .filter(LowerCaser::new()).build();
//! index
//! .tokenizers()
//! .register("custom_en", custom_en_tokenizer);
@@ -133,7 +133,7 @@ mod tokenizer;
mod tokenizer_manager;
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
pub use self::ascii_folding_filter::AsciiFolding;
pub use self::facet_tokenizer::FacetTokenizer;
pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer;
@@ -142,11 +142,11 @@ pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain};
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, Tokenizer,
};
pub use self::tokenizer_manager::TokenizerManager;
@@ -160,10 +160,7 @@ pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
#[cfg(test)]
pub mod tests {
use super::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
};
use crate::tokenizer::TextAnalyzer;
use super::*;
/// This is a function that can be used in tests and doc tests
/// to assert a token's correctness.
@@ -190,15 +187,9 @@ pub mod tests {
fn test_raw_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("raw").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
let tokens: Vec<Token> = en_tokenizer
.token_stream("Hello, happy tax payer!")
.collect();
assert_eq!(tokens.len(), 1);
assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
}
@@ -208,15 +199,9 @@ pub mod tests {
let tokenizer_manager = TokenizerManager::default();
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
let tokens: Vec<Token> = en_tokenizer
.token_stream("Hello, happy tax payer!")
.collect();
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "hello", 0, 5);
@@ -230,21 +215,16 @@ pub mod tests {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::from(SimpleTokenizer)
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::Greek)),
.filter(LowerCaser::new())
.filter(Stemmer::new(Language::Greek))
.build(),
);
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
.process(&mut add_token);
}
let tokens: Vec<Token> = en_tokenizer
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
.collect();
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "καλημερ", 0, 16);
@@ -256,25 +236,9 @@ pub mod tests {
fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
{
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
{
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
let tokens: Vec<Token> = en_tokenizer.token_stream(" ").collect();
assert!(tokens.is_empty());
let tokens: Vec<Token> = en_tokenizer.token_stream(" ").collect();
assert!(tokens.is_empty());
}
}


@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use crate::tokenizer::BoxTokenStream;
use super::{Token, Tokenizer};
/// Tokenize the text by splitting words into n-grams of the given size(s)
///
@@ -79,7 +78,7 @@ use crate::tokenizer::BoxTokenStream;
/// }
/// assert!(stream.next().is_none());
/// ```
#[derive(Clone)]
#[derive(Clone, Debug, Default)]
pub struct NgramTokenizer {
/// min size of the n-gram
min_gram: usize,
@@ -119,54 +118,48 @@ impl NgramTokenizer {
}
/// TokenStream associated with the `NgramTokenizer`
pub struct NgramTokenStream<'a> {
pub struct NgramTokenStream {
/// parameters
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers>,
/// true if the NgramTokenStream is in prefix mode.
prefix_only: bool,
/// input
text: &'a str,
text: String,
/// output
token: Token,
}
impl Tokenizer for NgramTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
From::from(NgramTokenStream {
type Iter = NgramTokenStream;
fn token_stream(&self, text: &str) -> Self::Iter {
NgramTokenStream {
ngram_charidx_iterator: StutteringIterator::new(
CodepointFrontiers::for_str(text),
self.min_gram,
self.max_gram,
),
prefix_only: self.prefix_only,
text,
text: text.to_string(),
token: Token::default(),
})
}
}
}
impl<'a> TokenStream for NgramTokenStream<'a> {
fn advance(&mut self) -> bool {
impl Iterator for NgramTokenStream {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
if self.prefix_only && offset_from > 0 {
return false;
return None;
}
self.token.position = 0;
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.clear();
self.token.text.push_str(&self.text[offset_from..offset_to]);
true
} else {
false
}
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
return Some(self.token.clone());
};
None
}
}
@@ -257,21 +250,21 @@ where
/// or a codepoint ends.
///
/// By convention, we emit [0] for the empty string.
struct CodepointFrontiers<'a> {
s: &'a str,
struct CodepointFrontiers {
s: String,
next_el: Option<usize>,
}
impl<'a> CodepointFrontiers<'a> {
fn for_str(s: &'a str) -> Self {
impl CodepointFrontiers {
fn for_str(s: &str) -> Self {
CodepointFrontiers {
s,
s: s.to_string(),
next_el: Some(0),
}
}
}
impl<'a> Iterator for CodepointFrontiers<'a> {
impl Iterator for CodepointFrontiers {
type Item = usize;
fn next(&mut self) -> Option<usize> {
@@ -280,7 +273,7 @@ impl<'a> Iterator for CodepointFrontiers<'a> {
self.next_el = None;
} else {
let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
self.s = &self.s[first_codepoint_width..];
self.s = (&self.s[first_codepoint_width..]).to_string();
self.next_el = Some(offset + first_codepoint_width);
}
offset
@@ -301,20 +294,8 @@ fn utf8_codepoint_width(b: u8) -> usize {
#[cfg(test)]
mod tests {
use super::utf8_codepoint_width;
use super::CodepointFrontiers;
use super::NgramTokenizer;
use super::StutteringIterator;
use super::*;
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::tokenizer::Tokenizer;
use crate::tokenizer::{BoxTokenStream, Token};
fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![];
tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
tokens
}
#[test]
fn test_utf8_codepoint_width() {
@@ -351,7 +332,9 @@ mod tests {
#[test]
fn test_ngram_tokenizer_1_2_false() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
.token_stream("hello")
.collect();
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2);
@@ -366,7 +349,9 @@ mod tests {
#[test]
fn test_ngram_tokenizer_min_max_equal() {
let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(3, 3)
.token_stream("hello")
.collect();
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 0, "ell", 1, 4);
@@ -375,7 +360,9 @@ mod tests {
#[test]
fn test_ngram_tokenizer_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
.token_stream("frankenstein")
.collect();
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3);
@@ -385,7 +372,9 @@ mod tests {
#[test]
fn test_ngram_non_ascii_1_2() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
.token_stream("hεllo")
.collect();
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "hε", 0, 3);
@@ -400,7 +389,9 @@ mod tests {
#[test]
fn test_ngram_non_ascii_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
.token_stream("hεllo")
.collect();
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "hε", 0, 3);
assert_token(&tokens[1], 0, "hεl", 0, 4);
@@ -410,16 +401,16 @@ mod tests {
#[test]
fn test_ngram_empty() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 5).token_stream("").collect();
assert!(tokens.is_empty());
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(2, 5).token_stream("").collect();
assert!(tokens.is_empty());
}
#[test]
#[should_panic(expected = "min_gram must be greater than 0")]
fn test_ngram_min_max_interval_empty() {
test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss");
}
#[test]


@@ -1,17 +1,17 @@
use super::{Token, TokenStream, Tokenizer};
use crate::tokenizer::BoxTokenStream;
use super::{Token, Tokenizer};
/// For each value of the field, emit a single unprocessed token.
#[derive(Clone)]
#[derive(Clone, Debug, Default)]
pub struct RawTokenizer;
#[derive(Clone, Debug)]
pub struct RawTokenStream {
token: Token,
has_token: bool,
token: Option<Token>,
}
impl Tokenizer for RawTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
type Iter = RawTokenStream;
fn token_stream(&self, text: &str) -> Self::Iter {
let token = Token {
offset_from: 0,
offset_to: text.len(),
@@ -19,26 +19,13 @@ impl Tokenizer for RawTokenizer {
text: text.to_string(),
position_length: 1,
};
RawTokenStream {
token,
has_token: true,
}
.into()
RawTokenStream { token: Some(token) }
}
}
impl TokenStream for RawTokenStream {
fn advance(&mut self) -> bool {
let result = self.has_token;
self.has_token = false;
result
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
impl Iterator for RawTokenStream {
type Item = Token;
fn next(&mut self) -> Option<Token> {
self.token.take()
}
}
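
The new `RawTokenStream` replaces the `has_token` flag with an `Option<Token>` that is `take()`n on the first call. A tiny standalone sketch of that one-shot-iterator idiom (illustrative names):

```rust
// An iterator that holds a single pre-built item and yields it exactly once.
struct OneShot<T> {
    item: Option<T>,
}

impl<T> Iterator for OneShot<T> {
    type Item = T;
    fn next(&mut self) -> Option<T> {
        // `take()` moves the value out and leaves `None`, so a second call ends the stream.
        self.item.take()
    }
}

fn main() {
    let mut stream = OneShot { item: Some("Hello, happy tax payer!") };
    assert_eq!(stream.next(), Some("Hello, happy tax payer!"));
    assert_eq!(stream.next(), None);
}
```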


@@ -2,8 +2,8 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5));
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5)).build();
//!
//! let mut stream = tokenizer.token_stream("toolong nice");
//! // because `toolong` is more than 5 characters, it is filtered
@@ -12,61 +12,30 @@
//! assert!(stream.next().is_none());
//! ```
//!
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use super::{Token, TokenFilter};
/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
///
/// It is especially useful when indexing unconstrained content.
/// e.g. Mail containing base-64 encoded pictures etc.
#[derive(Clone)]
#[derive(Clone, Debug)]
pub struct RemoveLongFilter {
length_limit: usize,
limit: usize,
}
impl RemoveLongFilter {
/// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
pub fn limit(length_limit: usize) -> RemoveLongFilter {
RemoveLongFilter { length_limit }
}
}
impl<'a> RemoveLongFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
token.text.len() < self.token_length_limit
pub fn limit(limit: usize) -> RemoveLongFilter {
RemoveLongFilter { limit }
}
}
impl TokenFilter for RemoveLongFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(RemoveLongFilterStream {
token_length_limit: self.length_limit,
tail: token_stream,
})
}
}
pub struct RemoveLongFilterStream<'a> {
token_length_limit: usize,
tail: BoxTokenStream<'a>,
}
impl<'a> TokenStream for RemoveLongFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
fn transform(&mut self, token: Token) -> Option<Token> {
if token.text.len() >= self.limit {
return None;
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
Some(token)
}
}
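
Under the new `TokenFilter` trait, a filter such as `RemoveLongFilter` is just a predicate expressed as `transform`: return `None` to drop the token, `Some(token)` to pass it on. A standalone sketch, with an illustrative local `Token` type:

```rust
#[derive(Debug, PartialEq)]
struct Token {
    text: String,
}

// Drops any token whose UTF-8 length reaches the limit, forwards the rest.
struct RemoveLong {
    limit: usize,
}

impl RemoveLong {
    fn transform(&mut self, token: Token) -> Option<Token> {
        if token.text.len() >= self.limit {
            return None; // dropped: the surrounding filtering iterator just skips it
        }
        Some(token)
    }
}

fn main() {
    let mut filter = RemoveLong { limit: 5 };
    assert_eq!(filter.transform(Token { text: "toolong".into() }), None);
    assert!(filter.transform(Token { text: "nice".into() }).is_some());
}
```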


@@ -1,59 +1,74 @@
use super::BoxTokenStream;
use super::{Token, TokenStream, Tokenizer};
use std::str::CharIndices;
use super::{Token, Tokenizer};
/// Tokenize the text by splitting on whitespaces and punctuation.
#[derive(Clone)]
#[derive(Clone, Debug)]
pub struct SimpleTokenizer;
pub struct SimpleTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: Token,
}
impl Tokenizer for SimpleTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::from(SimpleTokenStream {
text,
chars: text.char_indices(),
token: Token::default(),
})
type Iter = SimpleTokenizerStream;
fn token_stream(&self, text: &str) -> Self::Iter {
let vec: Vec<_> = text.char_indices().collect();
SimpleTokenizerStream {
text: text.to_string(),
chars: vec.into_iter(),
position: usize::max_value(),
}
}
}
impl<'a> SimpleTokenStream<'a> {
#[derive(Clone, Debug)]
pub struct SimpleTokenizerStream {
text: String,
chars: std::vec::IntoIter<(usize, char)>,
position: usize,
}
impl SimpleTokenizerStream {
// search for the end of the current token.
fn search_token_end(&mut self) -> usize {
(&mut self.chars)
.filter(|&(_, ref c)| !c.is_alphanumeric())
.filter(|&(_, c)| !c.is_alphanumeric())
.map(|(offset, _)| offset)
.next()
.unwrap_or_else(|| self.text.len())
}
}
impl<'a> TokenStream for SimpleTokenStream<'a> {
fn advance(&mut self) -> bool {
self.token.text.clear();
self.token.position = self.token.position.wrapping_add(1);
impl Iterator for SimpleTokenizerStream {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
self.position = self.position.wrapping_add(1);
while let Some((offset_from, c)) = self.chars.next() {
if c.is_alphanumeric() {
let offset_to = self.search_token_end();
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.push_str(&self.text[offset_from..offset_to]);
return true;
let token = Token {
text: self.text[offset_from..offset_to].into(),
offset_from,
offset_to,
position: self.position,
..Default::default()
};
return Some(token);
}
}
false
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
None
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty() {
let mut empty = SimpleTokenizer.token_stream("");
assert_eq!(empty.next(), None);
}
#[test]
fn simple_tokenizer() {
let mut simple = SimpleTokenizer.token_stream("tokenizer hello world");
assert_eq!(simple.next().unwrap().text, "tokenizer");
assert_eq!(simple.next().unwrap().text, "hello");
assert_eq!(simple.next().unwrap().text, "world");
}
}
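
`SimpleTokenizerStream` now owns its data: the text is copied and the borrowed `CharIndices` is replaced by a collected `Vec<(usize, char)>`, trading one allocation per call for a stream without lifetimes. A standalone sketch of that shape, including the `usize::MAX` + `wrapping_add(1)` trick that makes the first token's position 0 (illustrative names, yielding plain strings instead of `Token`s):

```rust
struct Words {
    text: String,
    chars: std::vec::IntoIter<(usize, char)>,
    position: usize,
}

impl Words {
    fn new(text: &str) -> Self {
        Words {
            text: text.to_string(),
            chars: text.char_indices().collect::<Vec<_>>().into_iter(),
            position: usize::MAX, // wraps to 0 on the first token
        }
    }

    // Advance the shared char iterator until the current token ends.
    fn search_token_end(&mut self) -> usize {
        (&mut self.chars)
            .find(|&(_, c)| !c.is_alphanumeric())
            .map(|(offset, _)| offset)
            .unwrap_or(self.text.len())
    }
}

impl Iterator for Words {
    type Item = (usize, String);
    fn next(&mut self) -> Option<(usize, String)> {
        self.position = self.position.wrapping_add(1);
        while let Some((offset_from, c)) = self.chars.next() {
            if c.is_alphanumeric() {
                let offset_to = self.search_token_end();
                return Some((self.position, self.text[offset_from..offset_to].to_string()));
            }
        }
        None
    }
}

fn main() {
    let words: Vec<_> = Words::new("tokenizer hello world").collect();
    assert_eq!(words[0], (0, "tokenizer".to_string()));
    assert_eq!(words[1], (1, "hello".to_string()));
    assert_eq!(words[2], (2, "world".to_string()));
}
```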


@@ -1,5 +1,6 @@
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use std::sync::Arc;
use super::{Token, TokenFilter};
use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};
@@ -58,14 +59,15 @@ impl Language {
/// Tokens are expected to be lowercased beforehand.
#[derive(Clone)]
pub struct Stemmer {
stemmer_algorithm: Algorithm,
stemmer: Arc<rust_stemmers::Stemmer>,
}
impl Stemmer {
/// Creates a new Stemmer `TokenFilter` for a given language algorithm.
pub fn new(language: Language) -> Stemmer {
let stemmer = rust_stemmers::Stemmer::create(language.algorithm());
Stemmer {
stemmer_algorithm: language.algorithm(),
stemmer: Arc::new(stemmer),
}
}
}
@@ -78,37 +80,12 @@ impl Default for Stemmer {
}
impl TokenFilter for Stemmer {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
BoxTokenStream::from(StemmerTokenStream {
tail: token_stream,
stemmer: inner_stemmer,
})
}
}
pub struct StemmerTokenStream<'a> {
tail: BoxTokenStream<'a>,
stemmer: rust_stemmers::Stemmer,
}
impl<'a> TokenStream for StemmerTokenStream<'a> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
fn transform(&mut self, mut token: Token) -> Option<Token> {
// TODO remove allocation
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
self.token_mut().text.clear();
self.token_mut().text.push_str(&stemmed_str);
true
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
let stemmed_str: String = self.stemmer.stem(&token.text).into_owned();
// TODO remove clear
token.text.clear();
token.text.push_str(&stemmed_str);
Some(token)
}
}
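
Wrapping the `rust_stemmers::Stemmer` in an `Arc` lets the filter stay `Clone` without rebuilding the stemming tables for every clone of the analyzer. A standalone sketch of that sharing pattern, with a cheap lookup table standing in for the real stemmer (illustrative names):

```rust
use std::collections::HashMap;
use std::sync::Arc;

#[derive(Clone)]
struct Stemmer {
    // Stand-in for Arc<rust_stemmers::Stemmer>: built once, shared by every clone.
    table: Arc<HashMap<String, String>>,
}

impl Stemmer {
    fn new() -> Stemmer {
        // Pretend this construction is expensive, like loading stemming tables.
        let mut table = HashMap::new();
        table.insert("continuation".to_string(), "continu".to_string());
        Stemmer { table: Arc::new(table) }
    }

    fn transform(&mut self, mut text: String) -> Option<String> {
        if let Some(stemmed) = self.table.get(&text) {
            // Mirror the clear + push_str shape of the filter above.
            text.clear();
            text.push_str(stemmed);
        }
        Some(text)
    }
}

fn main() {
    let mut a = Stemmer::new();
    let mut b = a.clone(); // shares the same Arc'd table, no rebuild
    assert_eq!(a.transform("continuation".to_string()), Some("continu".to_string()));
    assert_eq!(b.transform("hello".to_string()), Some("hello".to_string()));
    assert_eq!(Arc::strong_count(&a.table), 2);
}
```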


@@ -2,16 +2,15 @@
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
//! let tokenizer = analyzer_builder(SimpleTokenizer)
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])).build();
//!
//! let mut stream = tokenizer.token_stream("the fox is crafty");
//! assert_eq!(stream.next().unwrap().text, "fox");
//! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use super::{Token, TokenFilter};
use fnv::FnvHasher;
use std::collections::HashSet;
use std::hash::BuildHasherDefault;
@@ -49,42 +48,12 @@ impl StopWordFilter {
}
}
pub struct StopWordFilterStream<'a> {
words: StopWordHashSet,
tail: BoxTokenStream<'a>,
}
impl TokenFilter for StopWordFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(StopWordFilterStream {
words: self.words.clone(),
tail: token_stream,
})
}
}
impl<'a> StopWordFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
!self.words.contains(&token.text)
}
}
impl<'a> TokenStream for StopWordFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
fn transform(&mut self, token: Token) -> Option<Token> {
if self.words.contains(&token.text) {
return None;
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
Some(token)
}
}
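
Like `RemoveLongFilter`, the stop-word filter reduces to a set-membership test inside `transform`. A standalone sketch using `std`'s `HashSet` in place of the fnv-hashed set (illustrative names, plain strings instead of `Token`s):

```rust
use std::collections::HashSet;

struct StopWords {
    words: HashSet<String>,
}

impl StopWords {
    fn remove(words: Vec<String>) -> StopWords {
        StopWords { words: words.into_iter().collect() }
    }

    // Drop the token if it is a stop word, otherwise forward it untouched.
    fn transform(&mut self, text: String) -> Option<String> {
        if self.words.contains(&text) {
            return None;
        }
        Some(text)
    }
}

fn main() {
    let mut filter = StopWords::remove(vec!["the".to_string(), "is".to_string()]);
    let kept: Vec<String> = ["the", "fox", "is", "crafty"]
        .iter()
        .filter_map(|w| filter.transform(w.to_string()))
        .collect();
    assert_eq!(kept, vec!["fox", "crafty"]);
}
```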


@@ -1,95 +1,121 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream};
use std::ops::DerefMut;
use crate::tokenizer::Token;
const POSITION_GAP: usize = 2;
pub(crate) struct TokenStreamChain<'a> {
offsets: Vec<usize>,
token_streams: Vec<BoxTokenStream<'a>>,
pub(crate) struct TokenStreamChain<Inner, Outer> {
streams_with_offsets: Outer,
current: Option<(Inner, usize)>,
position: usize,
position_shift: usize,
stream_idx: usize,
token: Token,
}
impl<'a> TokenStreamChain<'a> {
pub fn new(
offsets: Vec<usize>,
token_streams: Vec<BoxTokenStream<'a>>,
) -> TokenStreamChain<'a> {
impl<'a, Inner, Outer> TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
Outer: Iterator<Item = (Inner, usize)>,
{
pub fn new(mut streams_with_offsets: Outer) -> TokenStreamChain<Inner, Outer> {
let current = streams_with_offsets.next();
TokenStreamChain {
offsets,
stream_idx: 0,
token_streams,
streams_with_offsets,
current,
position: usize::max_value(),
position_shift: 0,
token: Token::default(),
}
}
}
impl<'a> TokenStream for TokenStreamChain<'a> {
fn advance(&mut self) -> bool {
while self.stream_idx < self.token_streams.len() {
let token_stream = self.token_streams[self.stream_idx].deref_mut();
if token_stream.advance() {
let token = token_stream.token();
let offset_offset = self.offsets[self.stream_idx];
self.token.offset_from = token.offset_from + offset_offset;
self.token.offset_to = token.offset_to + offset_offset;
self.token.position = token.position + self.position_shift;
self.token.text.clear();
self.token.text.push_str(token.text.as_str());
return true;
} else {
self.stream_idx += 1;
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
Outer: Iterator<Item = (Inner, usize)>,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some((ref mut token_stream, offset_offset)) = self.current {
if let Some(mut token) = token_stream.next() {
token.offset_from += offset_offset;
token.offset_to += offset_offset;
token.position += self.position_shift;
self.position = token.position;
return Some(token);
}
self.position_shift = self.position.wrapping_add(POSITION_GAP);
self.current = self.streams_with_offsets.next();
}
false
None
}
}
fn token(&self) -> &Token {
assert!(
self.stream_idx <= self.token_streams.len(),
"You called .token(), after the end of the token stream has been reached"
);
&self.token
impl DynTokenStreamChain {
pub fn from_vec(
streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
) -> impl Iterator<Item = Token> {
DynTokenStreamChain {
streams_with_offsets,
idx: 0,
position: usize::max_value(),
position_shift: 0,
}
}
}
fn token_mut(&mut self) -> &mut Token {
assert!(
self.stream_idx <= self.token_streams.len(),
"You called .token(), after the end of the token stream has been reached"
);
&mut self.token
pub(crate) struct DynTokenStreamChain {
streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
idx: usize,
position: usize,
position_shift: usize,
}
impl Iterator for DynTokenStreamChain {
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some((token_stream, offset_offset)) = self.streams_with_offsets.get_mut(self.idx)
{
if let Some(mut token) = token_stream.next() {
token.offset_from += *offset_offset;
token.offset_to += *offset_offset;
token.position += self.position_shift;
self.position = token.position;
return Some(token);
}
self.idx += 1;
self.position_shift = self.position.wrapping_add(POSITION_GAP);
}
None
}
}
#[cfg(test)]
mod tests {
use super::super::{SimpleTokenizer, TokenStream, Tokenizer};
use super::TokenStreamChain;
use super::POSITION_GAP;
use super::super::tokenizer::Tokenizer;
use super::super::SimpleTokenizer;
use super::*;
#[test]
fn test_chain_first_emits_no_tokens() {
let token_streams = vec![
SimpleTokenizer.token_stream(""),
SimpleTokenizer.token_stream("hello world"),
(SimpleTokenizer.token_stream(""), 0),
(SimpleTokenizer.token_stream("hello world"), 0),
];
let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams);
let mut token_chain = TokenStreamChain::new(token_streams.into_iter());
let token = token_chain.next();
assert!(token_chain.advance());
assert_eq!(token_chain.token().text, "hello");
assert_eq!(token_chain.token().offset_from, 0);
assert_eq!(token_chain.token().offset_to, 5);
assert_eq!(token_chain.token().position, POSITION_GAP - 1);
let expect = Token {
offset_from: 0,
offset_to: 5,
position: POSITION_GAP - 1,
text: "hello".into(),
..Token::default()
};
assert_eq!(token.unwrap(), expect);
assert!(token_chain.advance());
assert_eq!(token_chain.token().text, "world");
assert_eq!(token_chain.token().offset_from, 6);
assert_eq!(token_chain.token().offset_to, 11);
assert_eq!(token_chain.token().position, POSITION_GAP);
let token = token_chain.next().unwrap();
assert_eq!(token.text, "world");
assert_eq!(token.offset_from, 6);
assert_eq!(token.offset_to, 11);
assert_eq!(token.position, POSITION_GAP);
assert!(!token_chain.advance());
assert!(token_chain.next().is_none());
}
}
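
The chained stream keeps two counters: `position`, the last emitted (already shifted) position, and `position_shift`, bumped by `POSITION_GAP` each time an inner stream runs dry; starting `position` at `usize::MAX` is what makes the first token land at `POSITION_GAP - 1` when the leading stream is empty, as the test above checks. A standalone sketch of just that arithmetic (positions only, offsets omitted):

```rust
const POSITION_GAP: usize = 2;

// Each inner Vec holds the local positions of one stream's tokens; the result
// holds the shifted positions the chain would emit.
fn chain_positions(streams: Vec<Vec<usize>>) -> Vec<usize> {
    let mut out = Vec::new();
    let mut position = usize::MAX;
    let mut position_shift = 0;
    for stream in streams {
        for local_position in stream {
            position = local_position + position_shift;
            out.push(position);
        }
        // stream exhausted: the next one starts POSITION_GAP after the last token
        position_shift = position.wrapping_add(POSITION_GAP);
    }
    out
}

fn main() {
    // first stream is empty, second has two tokens ("hello", "world")
    assert_eq!(
        chain_positions(vec![vec![], vec![0, 1]]),
        vec![POSITION_GAP - 1, POSITION_GAP]
    );
    // two non-empty streams: two tokens, then one token -> positions 0, 1, then 3
    assert_eq!(chain_positions(vec![vec![0, 1], vec![0]]), vec![0, 1, 3]);
}
```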


@@ -1,4 +1,4 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
use crate::tokenizer::{Token, TokenStreamChain};
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
@@ -26,14 +26,14 @@ impl PartialOrd for PreTokenizedString {
/// TokenStream implementation which wraps PreTokenizedString
pub struct PreTokenizedStream {
tokenized_string: PreTokenizedString,
current_token: i64,
current_token: usize,
}
impl From<PreTokenizedString> for PreTokenizedStream {
fn from(s: PreTokenizedString) -> PreTokenizedStream {
PreTokenizedStream {
tokenized_string: s,
current_token: -1,
current_token: 0,
}
}
}
@@ -41,49 +41,28 @@ impl From<PreTokenizedString> for PreTokenizedStream {
impl PreTokenizedStream {
/// Creates a TokenStream from PreTokenizedString array
pub fn chain_tokenized_strings<'a>(
tok_strings: &'a [&'a PreTokenizedString],
) -> BoxTokenStream {
if tok_strings.len() == 1 {
PreTokenizedStream::from((*tok_strings[0]).clone()).into()
} else {
let mut offsets = vec![];
let mut total_offset = 0;
for &tok_string in tok_strings {
offsets.push(total_offset);
if let Some(last_token) = tok_string.tokens.last() {
total_offset += last_token.offset_to;
}
tok_strings: &'a [&PreTokenizedString],
) -> impl Iterator<Item = Token> + 'a {
let streams_with_offsets = tok_strings.iter().scan(0, |total_offset, tok_string| {
let next = Some((
PreTokenizedStream::from((*tok_string).to_owned()),
*total_offset,
));
if let Some(last_token) = tok_string.tokens.last() {
*total_offset += last_token.offset_to;
}
// TODO remove the string cloning.
let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
.iter()
.map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
.collect();
TokenStreamChain::new(offsets, token_streams).into()
}
next
});
TokenStreamChain::new(streams_with_offsets)
}
}
impl TokenStream for PreTokenizedStream {
fn advance(&mut self) -> bool {
impl Iterator for PreTokenizedStream {
type Item = Token;
fn next(&mut self) -> Option<Token> {
let token = self.tokenized_string.tokens.get(self.current_token)?;
self.current_token += 1;
self.current_token < self.tokenized_string.tokens.len() as i64
}
fn token(&self) -> &Token {
assert!(
self.current_token >= 0,
"TokenStream not initialized. You should call advance() at least once."
);
&self.tokenized_string.tokens[self.current_token as usize]
}
fn token_mut(&mut self) -> &mut Token {
assert!(
self.current_token >= 0,
"TokenStream not initialized. You should call advance() at least once."
);
&mut self.tokenized_string.tokens[self.current_token as usize]
Some(token.clone())
}
}
@@ -119,10 +98,9 @@ mod tests {
let mut token_stream = PreTokenizedStream::from(tok_text.clone());
for expected_token in tok_text.tokens {
assert!(token_stream.advance());
assert_eq!(token_stream.token(), &expected_token);
assert_eq!(token_stream.next().unwrap(), expected_token);
}
assert!(!token_stream.advance());
assert!(token_stream.next().is_none());
}
#[test]
@@ -183,9 +161,8 @@ mod tests {
];
for expected_token in expected_tokens {
assert!(token_stream.advance());
assert_eq!(token_stream.token(), &expected_token);
assert_eq!(token_stream.next().unwrap(), expected_token);
}
assert!(!token_stream.advance());
assert!(token_stream.next().is_none());
}
}
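
`chain_tokenized_strings` now threads the running byte offset through `Iterator::scan` instead of precomputing an `offsets` vector. A standalone sketch of that idiom on plain strings (illustrative names):

```rust
fn main() {
    // Pair every field with the byte offset at which it would start in the
    // concatenation of all fields, carrying the running total through `scan`.
    let fields = ["hello world", "tax payer"];
    let with_offsets: Vec<(&str, usize)> = fields
        .iter()
        .scan(0, |total_offset, &text| {
            let start = *total_offset;
            *total_offset += text.len();
            Some((text, start))
        })
        .collect();
    assert_eq!(with_offsets, vec![("hello world", 0), ("tax payer", 11)]);
}
```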


@@ -2,8 +2,23 @@ use crate::tokenizer::TokenStreamChain;
use serde::{Deserialize, Serialize};
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};
pub trait TextAnalyzerClone {
fn box_clone(&self) -> Box<dyn TextAnalyzerT>;
}
/// 'Top-level' trait hiding concrete types, below which static dispatch occurs.
pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
/// 'Top-level' dynamic dispatch function hiding concrete types of the statically
/// dispatched `token_stream` from the `Tokenizer` trait.
fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>>;
}
impl Clone for Box<dyn TextAnalyzerT> {
fn clone(&self) -> Self {
(**self).box_clone()
}
}
/// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
@@ -35,35 +50,116 @@ impl Default for Token {
}
}
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + Clone {
/// Take a `Token` and transform it or return `None` if it's to be removed
/// from the output stream.
fn transform(&mut self, token: Token) -> Option<Token>;
}
/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
///
/// See the [module documentation](./index.html) for more detail.
pub trait Tokenizer: 'static + Send + Sync + Clone {
/// An iteratable type is returned.
type Iter: Iterator<Item = Token>;
/// Creates a token stream for a given `str`.
fn token_stream(&self, text: &str) -> Self::Iter;
/// Tokenize an array`&str`
///
/// The resulting `Token` stream is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent an accidental `PhraseQuery` from matching across two terms.
fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn Iterator<Item = Token> + 'a> {
let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| {
let temp = *total_offset;
*total_offset += text.len();
Some((self.token_stream(text), temp))
});
Box::new(TokenStreamChain::new(streams_with_offsets))
}
}
/// `TextAnalyzer` wraps the tokenization of an input text and its modification by any filters applied to it.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer {
tokenizer: Box<dyn Tokenizer>,
token_filters: Vec<BoxTokenFilter>,
}
#[derive(Clone, Debug, Default)]
pub struct TextAnalyzer<T>(T);
impl<T: Tokenizer> From<T> for TextAnalyzer {
fn from(tokenizer: T) -> Self {
TextAnalyzer::new(tokenizer, Vec::new())
impl<T: Tokenizer> From<T> for TextAnalyzer<T> {
fn from(src: T) -> TextAnalyzer<T> {
TextAnalyzer(src)
}
}
impl TextAnalyzer {
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
TextAnalyzer {
tokenizer: Box::new(tokenizer),
token_filters,
impl<T: Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
fn box_clone(&self) -> Box<dyn TextAnalyzerT> {
Box::new(TextAnalyzer(self.0.clone()))
}
}
impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>> {
Box::new(self.0.token_stream(text))
}
}
/// Identity `TokenFilter`
#[derive(Clone, Debug, Default)]
pub struct Identity;
impl TokenFilter for Identity {
fn transform(&mut self, token: Token) -> Option<Token> {
Some(token)
}
}
/// `Filter` is a wrapper around a `Token` stream and a `TokenFilter` which modifies it.
#[derive(Clone, Default, Debug)]
pub struct Filter<I, F> {
iter: I,
f: F,
}
impl<I, F> Iterator for Filter<I, F>
where
I: Iterator<Item = Token>,
F: TokenFilter,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while let Some(token) = self.iter.next() {
if let Some(tok) = self.f.transform(token) {
return Some(tok);
}
}
None
}
}
#[derive(Clone, Debug, Default)]
pub struct AnalyzerBuilder<T, F> {
tokenizer: T,
f: F,
}
/// Construct an `AnalyzerBuilder` on which to apply `TokenFilter`.
pub fn analyzer_builder<T: Tokenizer>(tokenizer: T) -> AnalyzerBuilder<T, Identity> {
AnalyzerBuilder {
tokenizer,
f: Identity,
}
}
impl<T, F> AnalyzerBuilder<T, F>
where
T: Tokenizer,
F: TokenFilter,
{
/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current `TokenStream` and returns a
/// The method consumes the current `Token` and returns a
/// new one.
///
/// # Example
@@ -71,248 +167,35 @@ impl TextAnalyzer {
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::from(SimpleTokenizer)
/// let en_stem = analyzer_builder(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser)
/// .filter(Stemmer::default());
/// .filter(LowerCaser::new())
/// .filter(Stemmer::default()).build();
/// ```
///
pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
self.token_filters.push(token_filter.into());
self
pub fn filter<G: TokenFilter>(self, f: G) -> AnalyzerBuilder<AnalyzerBuilder<T, F>, G> {
AnalyzerBuilder { tokenizer: self, f }
}
/// Finalize the build process.
pub fn build(self) -> TextAnalyzer<AnalyzerBuilder<T, F>> {
TextAnalyzer(self)
}
}
/// Tokenize an array`&str`
///
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent an accidental `PhraseQuery` from matching across two terms.
pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> {
assert!(!texts.is_empty());
if texts.len() == 1 {
self.token_stream(texts[0])
} else {
let mut offsets = vec![];
let mut total_offset = 0;
for &text in texts {
offsets.push(total_offset);
total_offset += text.len();
}
let token_streams: Vec<BoxTokenStream<'a>> = texts
.iter()
.cloned()
.map(|text| self.token_stream(text))
.collect();
From::from(TokenStreamChain::new(offsets, token_streams))
impl<T: Tokenizer, F: TokenFilter> Tokenizer for AnalyzerBuilder<T, F> {
type Iter = Filter<T::Iter, F>;
fn token_stream(&self, text: &str) -> Self::Iter {
Filter {
iter: self.tokenizer.token_stream(text),
f: self.f.clone(),
}
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let mut token_stream = self.tokenizer.token_stream(text);
for token_filter in &self.token_filters {
token_stream = token_filter.transform(token_stream);
}
token_stream
}
}
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
token_filters: self
.token_filters
.iter()
.map(|token_filter| token_filter.box_clone())
.collect(),
}
}
}
/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
///
/// See the [module documentation](./index.html) for more detail.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}
pub trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
}
impl<T: Tokenizer + Clone> TokenizerClone for T {
fn box_clone(&self) -> Box<dyn Tokenizer> {
Box::new(self.clone())
}
}
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
token_stream.advance()
}
fn token<'b>(&'b self) -> &'b Token {
let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
token_stream.token()
}
fn token_mut<'b>(&'b mut self) -> &'b mut Token {
let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
token_stream.token_mut()
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See `TokenStream` for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where
T: TokenStream + 'a,
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See `TokenStream` for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;
fn deref(&self) -> &dyn TokenFilter {
&*self.0
}
}
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
/// `TokenStream` is the result of the tokenization.
///
/// It consists consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// ```
///
pub trait TokenStream {
/// Advance to the next token
///
/// Returns false if there are no other tokens.
fn advance(&mut self) -> bool;
/// Returns a reference to the current token.
fn token(&self) -> &Token;
/// Returns a mutable reference to the current token.
fn token_mut(&mut self) -> &mut Token;
/// Helper to iterate over tokens. It
/// simply combines a call to `.advance()`
/// and `.token()`.
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// while let Some(token) = token_stream.next() {
/// println!("Token {:?}", token.text);
/// }
/// ```
fn next(&mut self) -> Option<&Token> {
if self.advance() {
Some(self.token())
} else {
None
}
}
/// Helper function to consume the entire `TokenStream`
/// and push the tokens to a sink function.
///
/// Remove this.
fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
let mut num_tokens_pushed = 0u32;
while self.advance() {
sink(self.token());
num_tokens_pushed += 1u32;
}
num_tokens_pushed
}
}
pub trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter;
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}
impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> BoxTokenFilter {
BoxTokenFilter::from(self.clone())
}
}
#[cfg(test)]
mod test {
use super::Token;
use super::*;
use crate::tokenizer::SimpleTokenizer;
#[test]
fn clone() {
@@ -330,4 +213,15 @@ mod test {
assert_eq!(t1.offset_to, t2.offset_to);
assert_eq!(t1.text, t2.text);
}
#[test]
fn text_analyzer() {
let mut stream = SimpleTokenizer.token_stream("tokenizer hello world");
dbg!(stream.next());
dbg!(stream.next());
dbg!(stream.next());
dbg!(stream.next());
dbg!(stream.next());
dbg!(stream.next());
}
}
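
Each `.filter()` call wraps the previous `AnalyzerBuilder`, so the finished analyzer's type nests one level per filter and every `transform` is dispatched statically; only the outer `TextAnalyzerT` object is boxed. A self-contained sketch of that composition under simplified, illustrative traits (not tantivy's actual API; the `build()`/`TextAnalyzer` wrapper and `Send + Sync` bounds are omitted):

```rust
trait Tokenizer: Clone {
    type Iter: Iterator<Item = String>;
    fn token_stream(&self, text: &str) -> Self::Iter;
}

trait TokenFilter: Clone {
    fn transform(&mut self, token: String) -> Option<String>;
}

#[derive(Clone)]
struct Identity;
impl TokenFilter for Identity {
    fn transform(&mut self, token: String) -> Option<String> {
        Some(token)
    }
}

#[derive(Clone)]
struct Lowercase;
impl TokenFilter for Lowercase {
    fn transform(&mut self, token: String) -> Option<String> {
        Some(token.to_lowercase())
    }
}

// Splits on whitespace; stands in for SimpleTokenizer.
#[derive(Clone)]
struct SimpleWords;
impl Tokenizer for SimpleWords {
    type Iter = std::vec::IntoIter<String>;
    fn token_stream(&self, text: &str) -> Self::Iter {
        text.split_whitespace()
            .map(str::to_string)
            .collect::<Vec<_>>()
            .into_iter()
    }
}

// The filtering iterator returned by a built analyzer.
struct Filtered<I, F> {
    iter: I,
    f: F,
}
impl<I: Iterator<Item = String>, F: TokenFilter> Iterator for Filtered<I, F> {
    type Item = String;
    fn next(&mut self) -> Option<String> {
        while let Some(token) = self.iter.next() {
            if let Some(token) = self.f.transform(token) {
                return Some(token);
            }
        }
        None
    }
}

// A builder is itself a Tokenizer, which is what lets `.filter()` nest.
#[derive(Clone)]
struct Builder<T, F> {
    tokenizer: T,
    f: F,
}

fn analyzer_builder<T: Tokenizer>(tokenizer: T) -> Builder<T, Identity> {
    Builder { tokenizer, f: Identity }
}

impl<T: Tokenizer, F: TokenFilter> Builder<T, F> {
    fn filter<G: TokenFilter>(self, f: G) -> Builder<Builder<T, F>, G> {
        Builder { tokenizer: self, f }
    }
}

impl<T: Tokenizer, F: TokenFilter> Tokenizer for Builder<T, F> {
    type Iter = Filtered<T::Iter, F>;
    fn token_stream(&self, text: &str) -> Self::Iter {
        Filtered {
            iter: self.tokenizer.token_stream(text),
            f: self.f.clone(),
        }
    }
}

fn main() {
    let analyzer = analyzer_builder(SimpleWords).filter(Lowercase);
    let tokens: Vec<String> = analyzer.token_stream("Hello Happy Tax Payer").collect();
    assert_eq!(tokens, vec!["hello", "happy", "tax", "payer"]);
}
```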


@@ -1,5 +1,5 @@
use crate::tokenizer::stemmer::Language;
use crate::tokenizer::tokenizer::TextAnalyzer;
use crate::tokenizer::tokenizer::{analyzer_builder, TextAnalyzer, TextAnalyzerT, Tokenizer};
use crate::tokenizer::LowerCaser;
use crate::tokenizer::RawTokenizer;
use crate::tokenizer::RemoveLongFilter;
@@ -22,24 +22,23 @@ use std::sync::{Arc, RwLock};
/// search engine.
#[derive(Clone)]
pub struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
tokenizers: Arc<RwLock<HashMap<String, Box<dyn TextAnalyzerT>>>>,
}
impl TokenizerManager {
/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
pub fn register<U: Tokenizer, T>(&self, tokenizer_name: &str, tokenizer: T)
where
TextAnalyzer: From<T>,
T: Into<TextAnalyzer<U>>,
{
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
self.tokenizers
.write()
.expect("Acquiring the lock should never fail")
.insert(tokenizer_name.to_string(), boxed_tokenizer);
.insert(tokenizer_name.to_string(), Box::new(tokenizer.into()));
}
/// Accessing a tokenizer given its name.
pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn TextAnalyzerT>> {
self.tokenizers
.read()
.expect("Acquiring the lock should never fail")
@@ -54,23 +53,25 @@ impl Default for TokenizerManager {
/// - simple
/// - en_stem
/// - ja
fn default() -> TokenizerManager {
fn default() -> Self {
let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
};
manager.register("raw", RawTokenizer);
manager.register(
"default",
TextAnalyzer::from(SimpleTokenizer)
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser),
.filter(LowerCaser::new())
.build(),
);
manager.register(
"en_stem",
TextAnalyzer::from(SimpleTokenizer)
analyzer_builder(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English)),
.filter(LowerCaser::new())
.filter(Stemmer::new(Language::English))
.build(),
);
manager
}
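
`TokenizerManager` erases the statically-built analyzers behind a trait object at registration time and hands back boxed clones on lookup. A standalone sketch of that registry shape; it uses `Arc<dyn ...>` instead of the `Box<dyn TextAnalyzerT>` + `box_clone` pair above to keep the example short (illustrative names):

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

trait Analyze: Send + Sync {
    fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = String>>;
}

// Stands in for a concrete, statically-composed analyzer.
struct Whitespace;
impl Analyze for Whitespace {
    fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = String>> {
        let tokens: Vec<String> = text.split_whitespace().map(str::to_string).collect();
        Box::new(tokens.into_iter())
    }
}

#[derive(Clone, Default)]
struct Registry {
    // Concrete analyzer types are erased here, as with Box<dyn TextAnalyzerT>.
    analyzers: Arc<RwLock<HashMap<String, Arc<dyn Analyze>>>>,
}

impl Registry {
    fn register<A: Analyze + 'static>(&self, name: &str, analyzer: A) {
        self.analyzers
            .write()
            .expect("lock poisoned")
            .insert(name.to_string(), Arc::new(analyzer));
    }

    fn get(&self, name: &str) -> Option<Arc<dyn Analyze>> {
        self.analyzers.read().expect("lock poisoned").get(name).cloned()
    }
}

fn main() {
    let registry = Registry::default();
    registry.register("simple", Whitespace);
    let tokens: Vec<String> = registry
        .get("simple")
        .unwrap()
        .token_stream("hello world")
        .collect();
    assert_eq!(tokens, vec!["hello", "world"]);
}
```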