Compare commits

..

2 Commits

Author SHA1 Message Date
Paul Masurel
a854a60e2a criterion 2019-08-11 15:46:42 +09:00
Paul Masurel
92d73a6bfb Added criterion benchmark 2019-08-09 17:34:06 +09:00
36 changed files with 486 additions and 670 deletions

View File

@@ -47,7 +47,6 @@ matrix:
before_install: before_install:
- set -e - set -e
- rustup self update - rustup self update
- rustup component add rustfmt
install: install:
- sh ci/install.sh - sh ci/install.sh
@@ -61,7 +60,6 @@ before_script:
script: script:
- bash ci/script.sh - bash ci/script.sh
- cargo fmt --all -- --check
before_deploy: before_deploy:
- sh ci/before_deploy.sh - sh ci/before_deploy.sh

View File

@@ -5,12 +5,7 @@ Tantivy 0.11.0
- Various bugfixes in the query parser. - Various bugfixes in the query parser.
- Better handling of hyphens in query parser. (#609) - Better handling of hyphens in query parser. (#609)
- Better handling of whitespaces. - Better handling of whitespaces.
- Closes #498 - add support for Elastic-style unbounded range queries for alphanumeric types eg. "title:>hello", "weight:>=70.5", "height:<200" (@petr-tik)
- API change around `Box<BoxableTokenizer>`. See detail in #629
## How to update?
`Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
Tantivy 0.10.1 Tantivy 0.10.1
===================== =====================

View File

@@ -17,7 +17,7 @@ base64 = "0.10.0"
byteorder = "1.0" byteorder = "1.0"
once_cell = "0.2" once_cell = "0.2"
regex = "1.0" regex = "1.0"
tantivy-fst = {git="https://github.com/tantivy-search/fst"} tantivy-fst = "0.1"
memmap = {version = "0.7", optional=true} memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true} lz4 = {version="1.20", optional=true}
snap = {version="0.2"} snap = {version="0.2"}
@@ -62,6 +62,7 @@ rand = "0.7"
maplit = "1" maplit = "1"
matches = "0.1.8" matches = "0.1.8"
time = "0.1.42" time = "0.1.42"
criterion = "0.2"
[profile.release] [profile.release]
opt-level = 3 opt-level = 3
@@ -74,6 +75,7 @@ overflow-checks = true
[features] [features]
default = ["mmap"] default = ["mmap"]
forbench = []
mmap = ["atomicwrites", "fs2", "memmap", "notify"] mmap = ["atomicwrites", "fs2", "memmap", "notify"]
lz4-compression = ["lz4"] lz4-compression = ["lz4"]
failpoints = ["fail/failpoints"] failpoints = ["fail/failpoints"]
@@ -97,3 +99,15 @@ features = ["failpoints"]
name = "failpoints" name = "failpoints"
path = "tests/failpoints/mod.rs" path = "tests/failpoints/mod.rs"
required-features = ["fail/failpoints"] required-features = ["fail/failpoints"]
[profile.bench]
lto = true
[[bench]]
name = "vint"
harness = false
[[bench]]
name = "fastfield"
harness = false

73
benches/bitset.rs Normal file
View File

@@ -0,0 +1,73 @@
use criterion::{criterion_group, criterion_main, Criterion};
use rand::distributions::{Bernoulli, Uniform};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::forbench::bitset::{BitSet, TinySet};
use tantivy::query::BitSetDocSet;
use tantivy::DocSet;
/// Draws a deterministic ~`ratio`-density sample of the integers `0..n`.
/// The fixed seed makes benchmark inputs reproducible across runs.
fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
    let rng = StdRng::from_seed([seed_val; 32]);
    let coin = Bernoulli::new(ratio).unwrap();
    rng.sample_iter(&coin)
        .take(n as usize)
        .enumerate()
        .filter(|&(_, keep)| keep)
        .map(|(value, _)| value as u32)
        .collect()
}
/// Generates `n_elems` uniformly-distributed values in `0..max_value`
/// (duplicates allowed, unsorted), from a fixed seed for reproducibility.
fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
    let rng = StdRng::from_seed([1u8; 32]);
    let distribution = Uniform::new(0u32, max_value);
    rng.sample_iter(&distribution).take(n_elems).collect()
}
/// Benchmarks repeatedly popping the lowest bit out of a one-element
/// `TinySet` (six pops: the first succeeds, the rest hit the empty set).
fn bench_tinyset_pop(criterion: &mut Criterion) {
    criterion.bench_function("pop_lowest", |b| {
        b.iter(|| {
            // black_box keeps the constant from being folded away.
            let mut tinyset = TinySet::singleton(criterion::black_box(31u32));
            for _ in 0..6 {
                tinyset.pop_lowest();
            }
        })
    });
}
/// Benchmarks inserting `n_elems` random values into a fresh `BitSet`
/// sized for `max_value`, for each `(max_value, n_elems)` input pair.
///
/// Fix: the bitset was previously built with a hard-coded
/// `with_max_value(1_000_000)`, silently ignoring the `max_value`
/// benchmark parameter; it now uses the parameter (identical behavior
/// for the current single input `(1_000_000, 10_000)`).
fn bench_bitset_insert(criterion: &mut Criterion) {
    criterion.bench_function_over_inputs(
        "bitset_insert",
        |bench, (max_value, n_elems)| {
            // Input data is generated once, outside the timed loop.
            let els = generate_nonunique_unsorted(*max_value, *n_elems);
            let max_value = *max_value;
            bench.iter(move || {
                let mut bitset = BitSet::with_max_value(max_value);
                for el in els.iter().cloned() {
                    bitset.insert(el);
                }
            });
        },
        vec![(1_000_000u32, 10_000)],
    );
}
fn bench_bitsetdocset_iterate(b: &mut test::Bencher) {
let mut bitset = BitSet::with_max_value(1_000_000);
for el in sample_with_seed(1_000_000u32, 0.01, 0u8) {
bitset.insert(el);
}
b.iter(|| {
let mut docset = BitSetDocSet::from(bitset.clone());
while docset.advance() {}
});
}
// Criterion entry point for the bitset benchmarks; this bench target must
// be built with `harness = false` so criterion supplies its own main.
criterion_group!(
    benches,
    bench_tinyset_pop,
    bench_bitset_insert,
    bench_bitsetdocset_iterate
);
criterion_main!(benches);

107
benches/fastfield.rs Normal file
View File

@@ -0,0 +1,107 @@
use criterion::criterion_group;
use criterion::criterion_main;
use criterion::Criterion;
use criterion::ParameterizedBenchmark;
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::SeedableRng;
use tantivy::schema::{Schema, FAST};
use tantivy::{doc, DocId, Index};
const NUM_LOOKUPS: usize = 1_000;
/// Builds a deterministic shuffled permutation of `0..NUM_LOOKUPS * stride`,
/// then appends `1 << bit_width` to force the fast-field encoder to use at
/// least `bit_width + 1` bits per value.
fn generate_permutation(stride: usize, bit_width: u8) -> Vec<u64> {
    let len = (NUM_LOOKUPS * stride) as u64;
    let mut values: Vec<u64> = (0u64..len).collect();
    let mut rng = StdRng::from_seed([1u8; 32]);
    values.shuffle(&mut rng);
    values.push(1u64 << u64::from(bit_width)); //< just to force the bit_width
    values
}
/// Benchmarks sequential strided lookups: a plain `Vec` baseline vs the
/// tantivy fast-field reader, over several forced bit widths.
///
/// Fix: both closures hard-coded `v * 7` for the step, which only matched
/// the default stride of 7 and defeated the `stride` parameterization;
/// they now step by `stride` (identical behavior for the current inputs).
fn bench_linear_lookup(c: &mut Criterion) {
    c.bench(
        "lookup_stride",
        ParameterizedBenchmark::new(
            "baseline_vec",
            |bench, (stride, num_bits)| {
                let arr = generate_permutation(*stride, *num_bits);
                let stride = *stride;
                bench.iter(move || {
                    // XOR-fold every `stride`-th slot so the reads can't
                    // be optimized away.
                    let mut a = 0u64;
                    for i in (0..NUM_LOOKUPS / stride).map(|v| v * stride) {
                        a ^= arr[i];
                    }
                    a
                })
            },
            vec![(7, 1), (7, 5), (7, 20)],
        )
        .with_function("fastfield", |bench, (stride, num_bits)| {
            // Setup (untimed): index the same permutation into a single
            // FAST u64 field of an in-RAM index.
            let mut schema_builder = Schema::builder();
            let val_field = schema_builder.add_u64_field("val", FAST);
            let schema = schema_builder.build();
            let index = Index::create_in_ram(schema);
            let mut index_writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
            for el in generate_permutation(*stride, *num_bits) {
                index_writer.add_document(doc!(val_field=>el));
            }
            index_writer.commit().unwrap();
            let reader = index.reader().unwrap();
            let searcher = reader.searcher();
            let segment_reader = searcher.segment_reader(0u32);
            let fast_field_reader = segment_reader.fast_fields().u64(val_field).unwrap();
            let stride = *stride;
            bench.iter(move || {
                let mut a = 0u64;
                for i in (0..NUM_LOOKUPS / stride).map(|v| v * stride) {
                    a ^= fast_field_reader.get(i as DocId);
                }
                a
            })
        }),
    );
}
/// Benchmarks data-dependent ("jumpy") lookups, where each lookup index is
/// the previous lookup's value: `Vec` baseline vs the fast-field reader.
fn bench_jumpy_lookup(c: &mut Criterion) {
    c.bench(
        "lookup_jumpy",
        ParameterizedBenchmark::new(
            "baseline_vec",
            |bench, (stride, num_bits)| {
                let values = generate_permutation(*stride, *num_bits);
                bench.iter(move || {
                    // Chase the permutation: each value is the next index.
                    let mut cursor = 0u64;
                    for _ in 0..NUM_LOOKUPS {
                        cursor = values[cursor as usize];
                    }
                    cursor
                })
            },
            vec![(7, 1), (7, 5), (7, 20)],
        )
        .with_function("fastfield", |bench, (stride, num_bits)| {
            // Setup (untimed): index the same permutation into a single
            // FAST u64 field of an in-RAM index.
            let mut schema_builder = Schema::builder();
            let val_field = schema_builder.add_u64_field("val", FAST);
            let index = Index::create_in_ram(schema_builder.build());
            let mut writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
            for el in generate_permutation(*stride, *num_bits) {
                writer.add_document(doc!(val_field=>el));
            }
            writer.commit().unwrap();
            let reader = index.reader().unwrap();
            let searcher = reader.searcher();
            let segment_reader = searcher.segment_reader(0u32);
            let ff_reader = segment_reader.fast_fields().u64(val_field).unwrap();
            bench.iter(move || {
                let mut cursor = 0u64;
                for _ in 0..NUM_LOOKUPS {
                    cursor = ff_reader.get(cursor as DocId);
                }
                cursor
            })
        }),
    );
}
// Criterion entry point for the fast-field lookup benchmarks
// (Cargo.toml declares this bench target with `harness = false`).
criterion_group!(benches, bench_linear_lookup, bench_jumpy_lookup);
criterion_main!(benches);

50
benches/union.rs Normal file
View File

@@ -0,0 +1,50 @@
use criterion::{criterion_group, criterion_main, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STRING};
use tantivy::{Document, Index};
/// Benchmarks fully advancing a two-term union scorer ("left OR right")
/// over a 100k-document in-RAM index, for two term-density pairs.
fn bench_union(criterion: &mut Criterion) {
    criterion.bench_function_over_inputs(
        "union_docset_fulladvance",
        |bench, (ratio_left, ratio_right)| {
            // Setup (untimed): each document independently receives the
            // tokens "left" and/or "right" with the given probabilities,
            // drawn from a fixed seed for reproducibility.
            let mut schema_builder = Schema::builder();
            let field = schema_builder.add_text_field("val", STRING);
            let schema = schema_builder.build();
            let index = Index::create_in_ram(schema);
            let mut writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
            let mut rng = StdRng::from_seed([0u8; 32]);
            for _ in 0u32..100_000u32 {
                let mut document = Document::default();
                if rng.gen_bool(*ratio_left) {
                    document.add_text(field, "left");
                }
                if rng.gen_bool(*ratio_right) {
                    document.add_text(field, "right");
                }
                writer.add_document(document);
            }
            writer.commit().unwrap();
            let reader = index.reader().unwrap();
            let searcher = reader.searcher();
            let query = QueryParser::for_index(&index, vec![field])
                .parse_query("left right")
                .unwrap();
            bench.iter(move || {
                // Rebuild the scorer each iteration and walk every match.
                let weight = query.weight(&searcher, false).unwrap();
                let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
                let mut sum_docs = 0u64;
                scorer.for_each(&mut |doc_id, _score| {
                    sum_docs += doc_id as u64;
                });
            });
        },
        vec![(0.2, 0.1), (0.2, 0.02)],
    );
}
// Criterion entry point for the union-docset benchmark.
criterion_group!(benches, bench_union);
criterion_main!(benches);

72
benches/vint.rs Normal file
View File

@@ -0,0 +1,72 @@
use criterion::{criterion_group, criterion_main, Criterion, ParameterizedBenchmark};
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
use tantivy::forbench::compression::{compressed_block_size, BlockDecoder};
use tantivy::forbench::compression::{BlockEncoder, VIntEncoder};
use tantivy::forbench::compression::{VIntDecoder, COMPRESSION_BLOCK_SIZE};
/// Produces `n` strictly increasing u32 values whose density over the
/// integers is roughly `ratio`, deterministically from `seed_val`.
fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
    let mut rng = StdRng::from_seed([seed_val; 32]);
    let mut result = Vec::with_capacity(n);
    let mut candidate = 0u32;
    // Flip a biased coin for each successive integer, keeping winners
    // until `n` values have been collected.
    while result.len() < n {
        if rng.gen_bool(ratio) {
            result.push(candidate);
        }
        candidate += 1;
    }
    result
}
/// Generates a deterministic sorted array of `n` values with density
/// `ratio`, using the fixed seed 4.
pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
    generate_array_with_seed(n, ratio, 4)
}
/// Benchmarks compressing one sorted block: bitpacking vs variable-int.
fn bench_compress(criterion: &mut Criterion) {
    criterion.bench(
        "compress_sorted",
        ParameterizedBenchmark::new(
            "bitpack",
            |bench, ratio| {
                // Input data is generated once, outside the timed loop.
                let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
                let mut encoder = BlockEncoder::new();
                bench.iter(|| {
                    encoder.compress_block_sorted(&data, 0u32);
                });
            },
            vec![0.1],
        )
        .with_function("vint", |bench, ratio| {
            let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
            let mut encoder = BlockEncoder::new();
            bench.iter(|| {
                encoder.compress_vint_sorted(&data, 0u32);
            });
        }),
    );
}
/// Benchmarks decompressing one sorted block: bitpacking vs variable-int.
/// Compression happens once in setup; only decoding is timed.
fn bench_uncompress(criterion: &mut Criterion) {
    criterion.bench(
        "uncompress_sorted",
        ParameterizedBenchmark::new(
            "bitpack",
            |bench, ratio| {
                // `compressed` borrows the encoder's internal buffer, so the
                // encoder must outlive the timed loop.
                let mut encoder = BlockEncoder::new();
                let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
                let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
                let mut decoder = BlockDecoder::new();
                bench.iter(|| {
                    decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
                });
            },
            vec![0.1],
        )
        .with_function("vint", |bench, ratio| {
            let mut encoder = BlockEncoder::new();
            let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
            let compressed = encoder.compress_vint_sorted(&data, 0u32);
            let mut decoder = BlockDecoder::new();
            bench.iter(move || {
                decoder.uncompress_vint_sorted(compressed, 0u32, COMPRESSION_BLOCK_SIZE);
            });
        }),
    );
}
// Criterion entry point for the vint/bitpack compression benchmarks
// (Cargo.toml declares this bench target with `harness = false`).
criterion_group!(benches, bench_compress, bench_uncompress);
criterion_main!(benches);

2
run-bench.rs Executable file
View File

@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Runs the criterion benchmarks. The `forbench` cargo feature exposes the
# crate-internal modules (tantivy::forbench::*) that the benches import.
cargo bench --features forbench

View File

@@ -86,6 +86,7 @@ where
} }
} }
#[inline(always)]
pub fn get(&self, idx: u64) -> u64 { pub fn get(&self, idx: u64) -> u64 {
if self.num_bits == 0 { if self.num_bits == 0 {
return 0u64; return 0u64;

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::u64; use std::u64;
#[derive(Clone, Copy, Eq, PartialEq)] #[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct TinySet(u64); pub struct TinySet(u64);
impl fmt::Debug for TinySet { impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -353,43 +353,3 @@ mod tests {
} }
} }
} }
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::BitSet;
use super::TinySet;
use test;
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}

View File

@@ -6,7 +6,7 @@ mod serialize;
mod vint; mod vint;
pub use self::bitset::BitSet; pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet; pub use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter; pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize}; pub use self::serialize::{BinarySerializable, FixedSize};

View File

@@ -173,11 +173,11 @@ impl Index {
} }
/// Helper to access the tokenizer associated to a specific field. /// Helper to access the tokenizer associated to a specific field.
pub fn tokenizer_for_field(&self, field: Field) -> Result<BoxedTokenizer> { pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<dyn BoxedTokenizer>> {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type(); let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers(); let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<BoxedTokenizer> = match field_type { let tokenizer_name_opt: Option<Box<dyn BoxedTokenizer>> = match field_type {
FieldType::Str(text_options) => text_options FieldType::Str(text_options) => text_options
.get_indexing_options() .get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string()) .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())

View File

@@ -431,111 +431,3 @@ mod tests {
} }
} }
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::tests::FIELD;
use super::tests::{generate_permutation, SCHEMA};
use super::*;
use common::CompositeFile;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::FastFieldReader;
use std::collections::HashMap;
use std::path::Path;
use test::{self, Bencher};
#[bench]
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|v| v * 7) {
a ^= permutation[i as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|val| val * 7) {
a ^= fast_field_reader.get(i);
}
a
});
}
}
#[bench]
fn bench_intfastfield_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a) as u32;
}
a
});
}
}
}

View File

@@ -67,10 +67,12 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// ///
/// May panic if `doc` is greater than the segment /// May panic if `doc` is greater than the segment
// `maxdoc`. // `maxdoc`.
#[inline(always)]
pub fn get(&self, doc: DocId) -> Item { pub fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc)) self.get_u64(u64::from(doc))
} }
#[inline(always)]
pub(crate) fn get_u64(&self, doc: u64) -> Item { pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc)) Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
} }

View File

@@ -49,7 +49,7 @@ pub struct SegmentWriter {
fast_field_writers: FastFieldsWriter, fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter, fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>, doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<BoxedTokenizer>>, tokenizers: Vec<Option<Box<dyn BoxedTokenizer>>>,
} }
impl SegmentWriter { impl SegmentWriter {

View File

@@ -249,7 +249,6 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::docset::DocSet; use crate::docset::DocSet;
@@ -895,3 +894,15 @@ mod tests {
} }
} }
} }
#[cfg(feature = "forbench")]
pub mod forbench {
pub mod compression {
pub use crate::postings::compression::*;
}
pub mod bitset {
pub use crate::common::BitSet;
pub use crate::common::TinySet;
}
}

View File

@@ -160,9 +160,9 @@ impl VIntEncoder for BlockEncoder {
} }
impl VIntDecoder for BlockDecoder { impl VIntDecoder for BlockDecoder {
fn uncompress_vint_sorted<'a>( fn uncompress_vint_sorted(
&mut self, &mut self,
compressed_data: &'a [u8], compressed_data: &[u8],
offset: u32, offset: u32,
num_els: usize, num_els: usize,
) -> usize { ) -> usize {
@@ -170,7 +170,7 @@ impl VIntDecoder for BlockDecoder {
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset) vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
} }
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize { fn uncompress_vint_unsorted(&mut self, compressed_data: &[u8], num_els: usize) -> usize {
self.output_len = num_els; self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els]) vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
} }
@@ -268,78 +268,17 @@ pub mod tests {
} }
} }
} }
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::*;
use rand::SeedableRng;
use rand::{Rng, XorShiftRng};
use test::Bencher;
fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
}
pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
b.iter(|| {
encoder.compress_block_sorted(&data, 0u32);
});
}
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
});
}
#[test] #[test]
fn test_all_docs_compression_numbits() { fn test_all_docs_compression_numbits() {
for expected_num_bits in 0u8.. { for expected_num_bits in 0u8..33u8 {
let mut data = [0u32; 128]; let mut data = [0u32; 128];
if expected_num_bits > 0 { if expected_num_bits > 0 {
data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32; data[0] = (1u64 << (expected_num_bits as u64) - 1u64) as u32;
} }
let mut encoder = BlockEncoder::new(); let mut encoder = BlockEncoder::new();
let (num_bits, compressed) = encoder.compress_block_unsorted(&data); let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
assert_eq!(compressed.len(), compressed_block_size(num_bits)); assert_eq!(compressed.len(), compressed_block_size(num_bits));
} }
} }
const NUM_INTS_BENCH_VINT: usize = 10;
#[bench]
fn bench_compress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
b.iter(|| {
encoder.compress_vint_sorted(&data, 0u32);
});
}
#[bench]
fn bench_uncompress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});
}
} }

View File

@@ -3,6 +3,7 @@ Postings module (also called inverted index)
*/ */
mod block_search; mod block_search;
pub(crate) mod compression; pub(crate) mod compression;
/// Postings module /// Postings module
/// ///

View File

@@ -14,7 +14,6 @@ use tantivy_fst::Automaton;
pub struct AutomatonWeight<A> pub struct AutomatonWeight<A>
where where
A: Automaton + Send + Sync + 'static, A: Automaton + Send + Sync + 'static,
A::State: Clone + Default + Sized,
{ {
field: Field, field: Field,
automaton: A, automaton: A,
@@ -23,7 +22,6 @@ where
impl<A> AutomatonWeight<A> impl<A> AutomatonWeight<A>
where where
A: Automaton + Send + Sync + 'static, A: Automaton + Send + Sync + 'static,
A::State: Clone + Default + Sized,
{ {
/// Create a new AutomationWeight /// Create a new AutomationWeight
pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> { pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
@@ -39,7 +37,6 @@ where
impl<A> Weight for AutomatonWeight<A> impl<A> Weight for AutomatonWeight<A>
where where
A: Automaton + Send + Sync + 'static, A: Automaton + Send + Sync + 'static,
A::State: Clone + Default + Sized,
{ {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader) -> Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc(); let max_doc = reader.max_doc();

View File

@@ -218,49 +218,3 @@ mod tests {
} }
} }
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::BitSet;
use super::BitSetDocSet;
use test;
use tests;
use DocSet;
#[bench]
fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
use tests;
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
b.iter(|| {
let mut bitset = BitSet::with_max_value(1_000_000);
for el in els.iter().cloned() {
bitset.insert(el);
}
});
}
#[bench]
fn bench_bitset_1pct_clone(b: &mut test::Bencher) {
use tests;
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
let mut bitset = BitSet::with_max_value(1_000_000);
for el in els {
bitset.insert(el);
}
b.iter(|| bitset.clone());
}
#[bench]
fn bench_bitset_1pct_clone_iterate(b: &mut test::Bencher) {
let els = tests::sample(1_000_000u32, 0.01);
let mut bitset = BitSet::with_max_value(1_000_000);
for el in els {
bitset.insert(el);
}
b.iter(|| {
let mut docset = BitSetDocSet::from(bitset.clone());
while docset.advance() {}
});
}
}

View File

@@ -83,67 +83,28 @@ parser! {
} }
parser! { parser! {
/// Function that parses a range out of a Stream
/// Supports ranges like:
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
/// [a TO *], [a TO c], [abc TO bcd}
fn range[I]()(I) -> UserInputLeaf fn range[I]()(I) -> UserInputLeaf
where [I: Stream<Item = char>] { where [I: Stream<Item = char>] {
let range_term_val = || { let range_term_val = || {
word().or(negative_number()).or(char('*').with(value("*".to_string()))) word().or(negative_number()).or(char('*').with(value("*".to_string())))
}; };
// check for unbounded range in the form of <5, <=10, >5, >=5
let elastic_unbounded_range = (choice([attempt(string(">=")),
attempt(string("<=")),
attempt(string("<")),
attempt(string(">"))])
.skip(spaces()),
range_term_val()).
map(|(comparison_sign, bound): (&str, String)|
match comparison_sign {
">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
"<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)),
"<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)),
">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded),
// default case
_ => (UserInputBound::Unbounded, UserInputBound::Unbounded)
});
let lower_bound = (one_of("{[".chars()), range_term_val()) let lower_bound = (one_of("{[".chars()), range_term_val())
.map(|(boundary_char, lower_bound): (char, String)| .map(|(boundary_char, lower_bound): (char, String)|
if lower_bound == "*" { if boundary_char == '{' { UserInputBound::Exclusive(lower_bound) }
UserInputBound::Unbounded else { UserInputBound::Inclusive(lower_bound) });
} else if boundary_char == '{' {
UserInputBound::Exclusive(lower_bound)
} else {
UserInputBound::Inclusive(lower_bound)
});
let upper_bound = (range_term_val(), one_of("}]".chars())) let upper_bound = (range_term_val(), one_of("}]".chars()))
.map(|(higher_bound, boundary_char): (String, char)| .map(|(higher_bound, boundary_char): (String, char)|
if higher_bound == "*" { if boundary_char == '}' { UserInputBound::Exclusive(higher_bound) }
UserInputBound::Unbounded else { UserInputBound::Inclusive(higher_bound) });
} else if boundary_char == '}' { (
UserInputBound::Exclusive(higher_bound) optional(field()),
} else { lower_bound
UserInputBound::Inclusive(higher_bound) .skip((spaces(), string("TO"), spaces())),
}); upper_bound,
// return only lower and upper ).map(|(field, lower, upper)| UserInputLeaf::Range {
let lower_to_upper = (lower_bound. field,
skip((spaces(), lower,
string("TO"), upper
spaces())),
upper_bound);
(optional(field()).skip(spaces()),
// try elastic first, if it matches, the range is unbounded
attempt(elastic_unbounded_range).or(lower_to_upper))
.map(|(field, (lower, upper))|
// Construct the leaf from extracted field (optional)
// and bounds
UserInputLeaf::Range {
field,
lower,
upper
}) })
} }
} }
@@ -297,49 +258,6 @@ mod test {
); );
} }
#[test]
fn test_parse_elastic_query_ranges() {
test_parse_query_to_ast_helper("title: >a", "title:{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title:>=a", "title:[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title: <a", "title:{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("title:<=a", "title:{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("title:<=bsd", "title:{\"*\" TO \"bsd\"]");
test_parse_query_to_ast_helper("weight: >70", "weight:{\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight:>=70", "weight:[\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <70", "weight:{\"*\" TO \"70\"}");
test_parse_query_to_ast_helper("weight:<=70", "weight:{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: >60.7", "weight:{\"60.7\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <= 70", "weight:{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
}
#[test]
fn test_range_parser() {
// testing the range() parser separately
let res = range().parse("title: <hello").unwrap().0;
let expected = UserInputLeaf::Range {
field: Some("title".to_string()),
lower: UserInputBound::Unbounded,
upper: UserInputBound::Exclusive("hello".to_string()),
};
let res2 = range().parse("title:{* TO hello}").unwrap().0;
assert_eq!(res, expected);
assert_eq!(res2, expected);
let expected_weight = UserInputLeaf::Range {
field: Some("weight".to_string()),
lower: UserInputBound::Inclusive("71.2".to_string()),
upper: UserInputBound::Unbounded,
};
let res3 = range().parse("weight: >=71.2").unwrap().0;
let res4 = range().parse("weight:[71.2 TO *}").unwrap().0;
assert_eq!(res3, expected_weight);
assert_eq!(res4, expected_weight);
}
#[test] #[test]
fn test_parse_query_to_triming_spaces() { fn test_parse_query_to_triming_spaces() {
test_parse_query_to_ast_helper(" abc", "\"abc\""); test_parse_query_to_ast_helper(" abc", "\"abc\"");
@@ -373,7 +291,7 @@ mod test {
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]"); test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}"); test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}"); test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:{\"*\" TO \"toto\"}"); test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:[\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}"); test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}"); test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
test_is_parse_err("abc + "); test_is_parse_err("abc + ");

View File

@@ -369,7 +369,6 @@ impl QueryParser {
match *bound { match *bound {
UserInputBound::Inclusive(_) => Ok(Bound::Included(term)), UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)), UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
UserInputBound::Unbounded => Ok(Bound::Unbounded),
} }
} }
@@ -629,7 +628,7 @@ mod test {
pub fn test_parse_query_untokenized() { pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"", "nottokenized:\"wordone wordtwo\"",
"Term(field=7,bytes=[119, 111, 114, 100, 111, 110, \ "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
101, 32, 119, 111, 114, 100, 116, 119, 111])", 101, 32, 119, 111, 114, 100, 116, 119, 111])",
false, false,
); );
@@ -673,7 +672,7 @@ mod test {
.is_ok()); .is_ok());
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"unsigned:2324", "unsigned:2324",
"Term(field=3,bytes=[0, 0, 0, 0, 0, 0, 9, 20])", "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false, false,
); );
@@ -694,19 +693,19 @@ mod test {
pub fn test_parse_query_to_ast_single_term() { pub fn test_parse_query_to_ast_single_term() {
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:toto", "title:toto",
"Term(field=0,bytes=[116, 111, 116, 111])", "Term([0, 0, 0, 0, 116, 111, 116, 111])",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"+title:toto", "+title:toto",
"Term(field=0,bytes=[116, 111, 116, 111])", "Term([0, 0, 0, 0, 116, 111, 116, 111])",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"+title:toto -titi", "+title:toto -titi",
"(+Term(field=0,bytes=[116, 111, 116, 111]) \ "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term(field=0,bytes=[116, 105, 116, 105]) \ -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term(field=1,bytes=[116, 105, 116, 105])))", Term([0, 0, 0, 1, 116, 105, 116, 105])))",
false, false,
); );
assert_eq!( assert_eq!(
@@ -721,13 +720,14 @@ mod test {
pub fn test_parse_query_to_ast_two_terms() { pub fn test_parse_query_to_ast_two_terms() {
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:a b", "title:a b",
"(Term(field=0,bytes=[97]) (Term(field=0,bytes=[98]) Term(field=1,bytes=[98])))", "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:\"a b\"", "title:\"a b\"",
"\"[(0, Term(field=0,bytes=[97])), \ "\"[(0, Term([0, 0, 0, 0, 97])), \
(1, Term(field=0,bytes=[98]))]\"", (1, Term([0, 0, 0, 0, 98]))]\"",
false, false,
); );
} }
@@ -736,43 +736,45 @@ mod test {
pub fn test_parse_query_to_ast_ranges() { pub fn test_parse_query_to_ast_ranges() {
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:[a TO b]", "title:[a TO b]",
"(Included(Term(field=0,bytes=[97])) TO Included(Term(field=0,bytes=[98])))", "(Included(Term([0, 0, 0, 0, 97])) TO \
Included(Term([0, 0, 0, 0, 98])))",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"[a TO b]", "[a TO b]",
"((Included(Term(field=0,bytes=[97])) TO \ "((Included(Term([0, 0, 0, 0, 97])) TO \
Included(Term(field=0,bytes=[98]))) \ Included(Term([0, 0, 0, 0, 98]))) \
(Included(Term(field=1,bytes=[97])) TO \ (Included(Term([0, 0, 0, 1, 97])) TO \
Included(Term(field=1,bytes=[98]))))", Included(Term([0, 0, 0, 1, 98]))))",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:{titi TO toto}", "title:{titi TO toto}",
"(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO \ "(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO \
Excluded(Term(field=0,bytes=[116, 111, 116, 111])))", Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:{* TO toto}", "title:{* TO toto}",
"(Unbounded TO Excluded(Term(field=0,bytes=[116, 111, 116, 111])))", "(Unbounded TO \
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:{titi TO *}", "title:{titi TO *}",
"(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO Unbounded)", "(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"signed:{-5 TO 3}", "signed:{-5 TO 3}",
"(Excluded(Term(field=2,bytes=[127, 255, 255, 255, 255, 255, 255, 251])) TO \ "(Excluded(Term([0, 0, 0, 2, 127, 255, 255, 255, 255, 255, 255, 251])) TO \
Excluded(Term(field=2,bytes=[128, 0, 0, 0, 0, 0, 0, 3])))", Excluded(Term([0, 0, 0, 2, 128, 0, 0, 0, 0, 0, 0, 3])))",
false, false,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"float:{-1.5 TO 1.5}", "float:{-1.5 TO 1.5}",
"(Excluded(Term(field=10,bytes=[64, 7, 255, 255, 255, 255, 255, 255])) TO \ "(Excluded(Term([0, 0, 0, 10, 64, 7, 255, 255, 255, 255, 255, 255])) TO \
Excluded(Term(field=10,bytes=[191, 248, 0, 0, 0, 0, 0, 0])))", Excluded(Term([0, 0, 0, 10, 191, 248, 0, 0, 0, 0, 0, 0])))",
false, false,
); );
@@ -877,19 +879,19 @@ mod test {
pub fn test_parse_query_to_ast_conjunction() { pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:toto", "title:toto",
"Term(field=0,bytes=[116, 111, 116, 111])", "Term([0, 0, 0, 0, 116, 111, 116, 111])",
true, true,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"+title:toto", "+title:toto",
"Term(field=0,bytes=[116, 111, 116, 111])", "Term([0, 0, 0, 0, 116, 111, 116, 111])",
true, true,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"+title:toto -titi", "+title:toto -titi",
"(+Term(field=0,bytes=[116, 111, 116, 111]) \ "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term(field=0,bytes=[116, 105, 116, 105]) \ -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term(field=1,bytes=[116, 105, 116, 105])))", Term([0, 0, 0, 1, 116, 105, 116, 105])))",
true, true,
); );
assert_eq!( assert_eq!(
@@ -900,15 +902,15 @@ mod test {
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:a b", "title:a b",
"(+Term(field=0,bytes=[97]) \ "(+Term([0, 0, 0, 0, 97]) \
+(Term(field=0,bytes=[98]) \ +(Term([0, 0, 0, 0, 98]) \
Term(field=1,bytes=[98])))", Term([0, 0, 0, 1, 98])))",
true, true,
); );
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:\"a b\"", "title:\"a b\"",
"\"[(0, Term(field=0,bytes=[97])), \ "\"[(0, Term([0, 0, 0, 0, 97])), \
(1, Term(field=0,bytes=[98]))]\"", (1, Term([0, 0, 0, 0, 98]))]\"",
true, true,
); );
} }
@@ -917,8 +919,10 @@ mod test {
pub fn test_query_parser_hyphen() { pub fn test_query_parser_hyphen() {
test_parse_query_to_logical_ast_helper( test_parse_query_to_logical_ast_helper(
"title:www-form-encoded", "title:www-form-encoded",
"\"[(0, Term(field=0,bytes=[119, 119, 119])), (1, Term(field=0,bytes=[102, 111, 114, 109])), (2, Term(field=0,bytes=[101, 110, 99, 111, 100, 101, 100]))]\"", "\"[(0, Term([0, 0, 0, 0, 119, 119, 119])), \
false (1, Term([0, 0, 0, 0, 102, 111, 114, 109])), \
(2, Term([0, 0, 0, 0, 101, 110, 99, 111, 100, 101, 100]))]\"",
false,
); );
} }
} }

View File

@@ -3,7 +3,6 @@ use std::fmt::{Debug, Formatter};
use crate::query::Occur; use crate::query::Occur;
#[derive(PartialEq)]
pub enum UserInputLeaf { pub enum UserInputLeaf {
Literal(UserInputLiteral), Literal(UserInputLiteral),
All, All,
@@ -36,7 +35,6 @@ impl Debug for UserInputLeaf {
} }
} }
#[derive(PartialEq)]
pub struct UserInputLiteral { pub struct UserInputLiteral {
pub field_name: Option<String>, pub field_name: Option<String>,
pub phrase: String, pub phrase: String,
@@ -51,11 +49,9 @@ impl fmt::Debug for UserInputLiteral {
} }
} }
#[derive(PartialEq)]
pub enum UserInputBound { pub enum UserInputBound {
Inclusive(String), Inclusive(String),
Exclusive(String), Exclusive(String),
Unbounded,
} }
impl UserInputBound { impl UserInputBound {
@@ -63,7 +59,6 @@ impl UserInputBound {
match *self { match *self {
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word), UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word), UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
UserInputBound::Unbounded => write!(formatter, "{{\"*\""),
} }
} }
@@ -71,7 +66,6 @@ impl UserInputBound {
match *self { match *self {
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word), UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word), UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
UserInputBound::Unbounded => write!(formatter, "\"*\"}}"),
} }
} }
@@ -79,7 +73,6 @@ impl UserInputBound {
match *self { match *self {
UserInputBound::Inclusive(ref contents) => contents, UserInputBound::Inclusive(ref contents) => contents,
UserInputBound::Exclusive(ref contents) => contents, UserInputBound::Exclusive(ref contents) => contents,
UserInputBound::Unbounded => &"*",
} }
} }
} }

View File

@@ -338,33 +338,39 @@ mod tests {
use crate::collector::Count; use crate::collector::Count;
use crate::schema::{Document, Field, Schema, INDEXED}; use crate::schema::{Document, Field, Schema, INDEXED};
use crate::Index; use crate::Index;
use crate::Result;
use std::collections::Bound; use std::collections::Bound;
#[test] #[test]
fn test_range_query_simple() { fn test_range_query_simple() {
let mut schema_builder = Schema::builder(); fn run() -> Result<()> {
let year_field = schema_builder.add_u64_field("year", INDEXED); let mut schema_builder = Schema::builder();
let schema = schema_builder.build(); let year_field = schema_builder.add_u64_field("year", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
for year in 1950u64..2017u64 { for year in 1950u64..2017u64 {
let num_docs_within_year = 10 + (year - 1950) * (year - 1950); let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
for _ in 0..num_docs_within_year { for _ in 0..num_docs_within_year {
index_writer.add_document(doc!(year_field => year)); index_writer.add_document(doc!(year_field => year));
}
} }
index_writer.commit().unwrap();
} }
index_writer.commit().unwrap(); let reader = index.reader().unwrap();
let searcher = reader.searcher();
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);
// ... or `1960..=1969` if inclusive range is enabled.
let count = searcher.search(&docs_in_the_sixties, &Count)?;
assert_eq!(count, 2285);
Ok(())
} }
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64); run().unwrap();
// ... or `1960..=1969` if inclusive range is enabled.
let count = searcher.search(&docs_in_the_sixties, &Count).unwrap();
assert_eq!(count, 2285);
} }
#[test] #[test]

View File

@@ -12,7 +12,7 @@ mod tests {
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::docset::DocSet; use crate::docset::DocSet;
use crate::query::{Query, QueryParser, Scorer, TermQuery}; use crate::query::{Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT}; use crate::schema::{IndexRecordOption, Schema, STRING, TEXT};
use crate::tests::assert_nearly_equals; use crate::tests::assert_nearly_equals;
use crate::Index; use crate::Index;
use crate::Term; use crate::Term;
@@ -114,16 +114,4 @@ mod tests {
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1); assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
} }
#[test]
fn test_term_query_debug() {
let term_query = TermQuery::new(
Term::from_field_text(Field(1), "hello"),
IndexRecordOption::WithFreqs,
);
assert_eq!(
format!("{:?}", term_query),
"TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111]))"
);
}
} }

View File

@@ -7,7 +7,6 @@ use crate::Result;
use crate::Searcher; use crate::Searcher;
use crate::Term; use crate::Term;
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::fmt;
/// A Term query matches all of the documents /// A Term query matches all of the documents
/// containing a specific term. /// containing a specific term.
@@ -62,18 +61,12 @@ use std::fmt;
/// Ok(()) /// Ok(())
/// } /// }
/// ``` /// ```
#[derive(Clone)] #[derive(Clone, Debug)]
pub struct TermQuery { pub struct TermQuery {
term: Term, term: Term,
index_record_option: IndexRecordOption, index_record_option: IndexRecordOption,
} }
impl fmt::Debug for TermQuery {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TermQuery({:?})", self.term)
}
}
impl TermQuery { impl TermQuery {
/// Creates a new term query. /// Creates a new term query.
pub fn new(term: Term, segment_postings_options: IndexRecordOption) -> TermQuery { pub fn new(term: Term, segment_postings_options: IndexRecordOption) -> TermQuery {

View File

@@ -411,52 +411,3 @@ mod tests {
} }
} }
#[cfg(all(test, feature = "unstable"))]
mod bench {
use query::score_combiner::DoNothingCombiner;
use query::ConstScorer;
use query::Union;
use query::VecDocSet;
use test::Bencher;
use tests;
use DocId;
use DocSet;
#[bench]
fn bench_union_3_high(bench: &mut Bencher) {
let union_docset: Vec<Vec<DocId>> = vec![
tests::sample_with_seed(100_000, 0.1, 0),
tests::sample_with_seed(100_000, 0.2, 1),
];
bench.iter(|| {
let mut v = Union::<_, DoNothingCombiner>::from(
union_docset
.iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new)
.collect::<Vec<_>>(),
);
while v.advance() {}
});
}
#[bench]
fn bench_union_3_low(bench: &mut Bencher) {
let union_docset: Vec<Vec<DocId>> = vec![
tests::sample_with_seed(100_000, 0.01, 0),
tests::sample_with_seed(100_000, 0.05, 1),
tests::sample_with_seed(100_000, 0.001, 2),
];
bench.iter(|| {
let mut v = Union::<_, DoNothingCombiner>::from(
union_docset
.iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new)
.collect::<Vec<_>>(),
);
while v.advance() {}
});
}
}

View File

@@ -10,7 +10,7 @@ use serde_json::Value as JsonValue;
/// Possible error that may occur while parsing a field value /// Possible error that may occur while parsing a field value
/// At this point the JSON is known to be valid. /// At this point the JSON is known to be valid.
#[derive(Debug, PartialEq)] #[derive(Debug)]
pub enum ValueParsingError { pub enum ValueParsingError {
/// Encountered a numerical value that overflows or underflow its integer type. /// Encountered a numerical value that overflows or underflow its integer type.
OverflowError(String), OverflowError(String),

View File

@@ -246,25 +246,6 @@ impl Schema {
self.0.fields_map.get(field_name).cloned() self.0.fields_map.get(field_name).cloned()
} }
/// Create a named document off the doc.
pub fn convert_named_doc(
&self,
named_doc: NamedFieldDocument,
) -> Result<Document, DocParsingError> {
let mut document = Document::new();
for (field_name, values) in named_doc.0 {
if let Some(field) = self.get_field(&field_name) {
for value in values {
let field_value = FieldValue::new(field, value);
document.add(field_value);
}
} else {
return Err(DocParsingError::NoSuchFieldInSchema(field_name));
}
}
Ok(document)
}
/// Create a named document off the doc. /// Create a named document off the doc.
pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument { pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
let mut field_map = BTreeMap::new(); let mut field_map = BTreeMap::new();
@@ -379,7 +360,7 @@ impl<'de> Deserialize<'de> for Schema {
/// Error that may happen when deserializing /// Error that may happen when deserializing
/// a document from JSON. /// a document from JSON.
#[derive(Debug, Fail, PartialEq)] #[derive(Debug, Fail)]
pub enum DocParsingError { pub enum DocParsingError {
/// The payload given is not valid JSON. /// The payload given is not valid JSON.
#[fail(display = "The provided string is not valid JSON")] #[fail(display = "The provided string is not valid JSON")]
@@ -388,10 +369,7 @@ pub enum DocParsingError {
#[fail(display = "The field '{:?}' could not be parsed: {:?}", _0, _1)] #[fail(display = "The field '{:?}' could not be parsed: {:?}", _0, _1)]
ValueError(String, ValueParsingError), ValueError(String, ValueParsingError),
/// The json-document contains a field that is not declared in the schema. /// The json-document contains a field that is not declared in the schema.
#[fail( #[fail(display = "The json-document contains an unknown field: {:?}", _0)]
display = "The document contains a field that is not declared in the schema: {:?}",
_0
)]
NoSuchFieldInSchema(String), NoSuchFieldInSchema(String),
} }
@@ -403,7 +381,6 @@ mod tests {
use crate::schema::*; use crate::schema::*;
use matches::{assert_matches, matches}; use matches::{assert_matches, matches};
use serde_json; use serde_json;
use std::collections::BTreeMap;
#[test] #[test]
pub fn is_indexed_test() { pub fn is_indexed_test() {
@@ -518,54 +495,6 @@ mod tests {
assert_eq!(doc, doc_serdeser); assert_eq!(doc, doc_serdeser);
} }
#[test]
pub fn test_document_from_nameddoc() {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let val = schema_builder.add_i64_field("val", INDEXED);
let schema = schema_builder.build();
let mut named_doc_map = BTreeMap::default();
named_doc_map.insert(
"title".to_string(),
vec![Value::from("title1"), Value::from("title2")],
);
named_doc_map.insert(
"val".to_string(),
vec![Value::from(14u64), Value::from(-1i64)],
);
let doc = schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap();
assert_eq!(
doc.get_all(title),
vec![
&Value::from("title1".to_string()),
&Value::from("title2".to_string())
]
);
assert_eq!(
doc.get_all(val),
vec![&Value::from(14u64), &Value::from(-1i64)]
);
}
#[test]
pub fn test_document_from_nameddoc_error() {
let schema = Schema::builder().build();
let mut named_doc_map = BTreeMap::default();
named_doc_map.insert(
"title".to_string(),
vec![Value::from("title1"), Value::from("title2")],
);
let err = schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap_err();
assert_eq!(
err,
DocParsingError::NoSuchFieldInSchema("title".to_string())
);
}
#[test] #[test]
pub fn test_parse_document() { pub fn test_parse_document() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();

View File

@@ -224,12 +224,7 @@ where
impl fmt::Debug for Term { impl fmt::Debug for Term {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!( write!(f, "Term({:?})", &self.0[..])
f,
"Term(field={},bytes={:?})",
self.field().0,
self.value_bytes()
)
} }
} }

View File

@@ -63,7 +63,7 @@ impl FragmentCandidate {
fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) { fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
self.stop_offset = token.offset_to; self.stop_offset = token.offset_to;
if let Some(&score) = terms.get(&token.text.to_lowercase()) { if let Some(score) = terms.get(&token.text.to_lowercase()) {
self.score += score; self.score += score;
self.highlighted self.highlighted
.push(HighlightSection::new(token.offset_from, token.offset_to)); .push(HighlightSection::new(token.offset_from, token.offset_to));
@@ -142,7 +142,7 @@ impl Snippet {
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
/// has to be a valid string. /// has to be a valid string.
fn search_fragments<'a>( fn search_fragments<'a>(
tokenizer: &BoxedTokenizer, tokenizer: &dyn BoxedTokenizer,
text: &'a str, text: &'a str,
terms: &BTreeMap<String, f32>, terms: &BTreeMap<String, f32>,
max_num_chars: usize, max_num_chars: usize,
@@ -150,6 +150,7 @@ fn search_fragments<'a>(
let mut token_stream = tokenizer.token_stream(text); let mut token_stream = tokenizer.token_stream(text);
let mut fragment = FragmentCandidate::new(0); let mut fragment = FragmentCandidate::new(0);
let mut fragments: Vec<FragmentCandidate> = vec![]; let mut fragments: Vec<FragmentCandidate> = vec![];
while let Some(next) = token_stream.next() { while let Some(next) = token_stream.next() {
if (next.offset_to - fragment.start_offset) > max_num_chars { if (next.offset_to - fragment.start_offset) > max_num_chars {
if fragment.score > 0.0 { if fragment.score > 0.0 {
@@ -253,7 +254,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// ``` /// ```
pub struct SnippetGenerator { pub struct SnippetGenerator {
terms_text: BTreeMap<String, f32>, terms_text: BTreeMap<String, f32>,
tokenizer: BoxedTokenizer, tokenizer: Box<dyn BoxedTokenizer>,
field: Field, field: Field,
max_num_chars: usize, max_num_chars: usize,
} }
@@ -315,8 +316,12 @@ impl SnippetGenerator {
/// Generates a snippet for the given text. /// Generates a snippet for the given text.
pub fn snippet(&self, text: &str) -> Snippet { pub fn snippet(&self, text: &str) -> Snippet {
let fragment_candidates = let fragment_candidates = search_fragments(
search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars); &*self.tokenizer,
&text,
&self.terms_text,
self.max_num_chars,
);
select_best_fragment_combination(&fragment_candidates[..], &text) select_best_fragment_combination(&fragment_candidates[..], &text)
} }
} }
@@ -326,7 +331,7 @@ mod tests {
use super::{search_fragments, select_best_fragment_combination}; use super::{search_fragments, select_best_fragment_combination};
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::tokenizer::SimpleTokenizer; use crate::tokenizer::{box_tokenizer, SimpleTokenizer};
use crate::Index; use crate::Index;
use crate::SnippetGenerator; use crate::SnippetGenerator;
use maplit::btreemap; use maplit::btreemap;
@@ -350,12 +355,12 @@ Survey in 2016, 2017, and 2018."#;
#[test] #[test]
fn test_snippet() { fn test_snippet() {
let boxed_tokenizer = SimpleTokenizer.into(); let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let terms = btreemap! { let terms = btreemap! {
String::from("rust") => 1.0, String::from("rust") => 1.0,
String::from("language") => 0.9 String::from("language") => 0.9
}; };
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 100); let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100);
assert_eq!(fragments.len(), 7); assert_eq!(fragments.len(), 7);
{ {
let first = &fragments[0]; let first = &fragments[0];
@@ -377,13 +382,13 @@ Survey in 2016, 2017, and 2018."#;
#[test] #[test]
fn test_snippet_scored_fragment() { fn test_snippet_scored_fragment() {
let boxed_tokenizer = SimpleTokenizer.into(); let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
{ {
let terms = btreemap! { let terms = btreemap! {
String::from("rust") =>1.0f32, String::from("rust") =>1.0f32,
String::from("language") => 0.9f32 String::from("language") => 0.9f32
}; };
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20); let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
{ {
let first = &fragments[0]; let first = &fragments[0];
assert_eq!(first.score, 1.0); assert_eq!(first.score, 1.0);
@@ -392,13 +397,13 @@ Survey in 2016, 2017, and 2018."#;
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems") assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
} }
let boxed_tokenizer = SimpleTokenizer.into(); let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
{ {
let terms = btreemap! { let terms = btreemap! {
String::from("rust") =>0.9f32, String::from("rust") =>0.9f32,
String::from("language") => 1.0f32 String::from("language") => 1.0f32
}; };
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20); let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
//assert_eq!(fragments.len(), 7); //assert_eq!(fragments.len(), 7);
{ {
let first = &fragments[0]; let first = &fragments[0];
@@ -412,14 +417,14 @@ Survey in 2016, 2017, and 2018."#;
#[test] #[test]
fn test_snippet_in_second_fragment() { fn test_snippet_in_second_fragment() {
let boxed_tokenizer = SimpleTokenizer.into(); let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let text = "a b c d e f g"; let text = "a b c d e f g";
let mut terms = BTreeMap::new(); let mut terms = BTreeMap::new();
terms.insert(String::from("c"), 1.0); terms.insert(String::from("c"), 1.0);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3); let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 1); assert_eq!(fragments.len(), 1);
{ {
@@ -436,14 +441,14 @@ Survey in 2016, 2017, and 2018."#;
#[test] #[test]
fn test_snippet_with_term_at_the_end_of_fragment() { fn test_snippet_with_term_at_the_end_of_fragment() {
let boxed_tokenizer = SimpleTokenizer.into(); let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let text = "a b c d e f f g"; let text = "a b c d e f f g";
let mut terms = BTreeMap::new(); let mut terms = BTreeMap::new();
terms.insert(String::from("f"), 1.0); terms.insert(String::from("f"), 1.0);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3); let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 2); assert_eq!(fragments.len(), 2);
{ {
@@ -460,7 +465,7 @@ Survey in 2016, 2017, and 2018."#;
#[test] #[test]
fn test_snippet_with_second_fragment_has_the_highest_score() { fn test_snippet_with_second_fragment_has_the_highest_score() {
let boxed_tokenizer = SimpleTokenizer.into(); let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let text = "a b c d e f g"; let text = "a b c d e f g";
@@ -468,7 +473,7 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("f"), 1.0); terms.insert(String::from("f"), 1.0);
terms.insert(String::from("a"), 0.9); terms.insert(String::from("a"), 0.9);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 7); let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7);
assert_eq!(fragments.len(), 2); assert_eq!(fragments.len(), 2);
{ {
@@ -485,14 +490,14 @@ Survey in 2016, 2017, and 2018."#;
#[test] #[test]
fn test_snippet_with_term_not_in_text() { fn test_snippet_with_term_not_in_text() {
let boxed_tokenizer = SimpleTokenizer.into(); let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let text = "a b c d"; let text = "a b c d";
let mut terms = BTreeMap::new(); let mut terms = BTreeMap::new();
terms.insert(String::from("z"), 1.0); terms.insert(String::from("z"), 1.0);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3); let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 0); assert_eq!(fragments.len(), 0);
@@ -503,12 +508,12 @@ Survey in 2016, 2017, and 2018."#;
#[test] #[test]
fn test_snippet_with_no_terms() { fn test_snippet_with_no_terms() {
let boxed_tokenizer = SimpleTokenizer.into(); let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let text = "a b c d"; let text = "a b c d";
let terms = BTreeMap::new(); let terms = BTreeMap::new();
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3); let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 0); assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], &text); let snippet = select_best_fragment_combination(&fragments[..], &text);

View File

@@ -2,7 +2,7 @@ use super::TermDictionary;
use crate::postings::TermInfo; use crate::postings::TermInfo;
use crate::termdict::TermOrdinal; use crate::termdict::TermOrdinal;
use tantivy_fst::automaton::AlwaysMatch; use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::map::{Stream, StreamBuilder, StreamWithState}; use tantivy_fst::map::{Stream, StreamBuilder};
use tantivy_fst::Automaton; use tantivy_fst::Automaton;
use tantivy_fst::{IntoStreamer, Streamer}; use tantivy_fst::{IntoStreamer, Streamer};
@@ -11,7 +11,6 @@ use tantivy_fst::{IntoStreamer, Streamer};
pub struct TermStreamerBuilder<'a, A = AlwaysMatch> pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
where where
A: Automaton, A: Automaton,
A::State: Clone,
{ {
fst_map: &'a TermDictionary, fst_map: &'a TermDictionary,
stream_builder: StreamBuilder<'a, A>, stream_builder: StreamBuilder<'a, A>,
@@ -20,7 +19,6 @@ where
impl<'a, A> TermStreamerBuilder<'a, A> impl<'a, A> TermStreamerBuilder<'a, A>
where where
A: Automaton, A: Automaton,
A::State: Clone + Default + Sized,
{ {
pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self { pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self {
TermStreamerBuilder { TermStreamerBuilder {
@@ -58,11 +56,10 @@ where
pub fn into_stream(self) -> TermStreamer<'a, A> { pub fn into_stream(self) -> TermStreamer<'a, A> {
TermStreamer { TermStreamer {
fst_map: self.fst_map, fst_map: self.fst_map,
stream: self.stream_builder.with_state().into_stream(), stream: self.stream_builder.into_stream(),
term_ord: 0u64, term_ord: 0u64,
current_key: Vec::with_capacity(100), current_key: Vec::with_capacity(100),
current_value: TermInfo::default(), current_value: TermInfo::default(),
state: Default::default(),
} }
} }
} }
@@ -72,31 +69,27 @@ where
pub struct TermStreamer<'a, A = AlwaysMatch> pub struct TermStreamer<'a, A = AlwaysMatch>
where where
A: Automaton, A: Automaton,
A::State: Clone + Default + Sized,
{ {
fst_map: &'a TermDictionary, fst_map: &'a TermDictionary,
stream: StreamWithState<'a, A>, stream: Stream<'a, A>,
term_ord: TermOrdinal, term_ord: TermOrdinal,
current_key: Vec<u8>, current_key: Vec<u8>,
current_value: TermInfo, current_value: TermInfo,
state: A::State,
} }
impl<'a, A> TermStreamer<'a, A> impl<'a, A> TermStreamer<'a, A>
where where
A: Automaton, A: Automaton,
A::State: Clone + Default + Sized,
{ {
/// Advance position the stream on the next item. /// Advance position the stream on the next item.
/// Before the first call to `.advance()`, the stream /// Before the first call to `.advance()`, the stream
/// is an unitialized state. /// is an unitialized state.
pub fn advance(&mut self) -> bool { pub fn advance(&mut self) -> bool {
if let Some((term, term_ord, state)) = self.stream.next() { if let Some((term, term_ord)) = self.stream.next() {
self.current_key.clear(); self.current_key.clear();
self.current_key.extend_from_slice(term); self.current_key.extend_from_slice(term);
self.term_ord = term_ord; self.term_ord = term_ord;
self.current_value = self.fst_map.term_info_from_ord(term_ord); self.current_value = self.fst_map.term_info_from_ord(term_ord);
self.state = state;
true true
} else { } else {
false false
@@ -125,10 +118,6 @@ where
&self.current_key &self.current_key
} }
pub fn state(&self) -> &A::State {
&self.state
}
/// Accesses the current value. /// Accesses the current value.
/// ///
/// Calling `.value()` after the end of the stream will return the /// Calling `.value()` after the end of the stream will return the

View File

@@ -197,11 +197,7 @@ impl TermDictionary {
/// Returns a search builder, to stream all of the terms /// Returns a search builder, to stream all of the terms
/// within the Automaton /// within the Automaton
pub fn search<'a, A>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
where
A: Automaton + 'a,
A::State: Clone + Default + Sized,
{
let stream_builder = self.fst_index.search(automaton); let stream_builder = self.fst_index.search(automaton);
TermStreamerBuilder::<A>::new(self, stream_builder) TermStreamerBuilder::<A>::new(self, stream_builder)
} }

View File

@@ -155,6 +155,7 @@ pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::{Language, Stemmer}; pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter; pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain; pub(crate) use self::token_stream_chain::TokenStreamChain;
pub(crate) use self::tokenizer::box_tokenizer;
pub use self::tokenizer::BoxedTokenizer; pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

View File

@@ -56,6 +56,8 @@ pub trait Tokenizer<'a>: Sized + Clone {
/// # Example /// # Example
/// ///
/// ```rust /// ```rust
/// # extern crate tantivy;
///
/// use tantivy::tokenizer::*; /// use tantivy::tokenizer::*;
/// ///
/// # fn main() { /// # fn main() {
@@ -78,7 +80,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
} }
/// A boxed tokenizer /// A boxed tokenizer
trait BoxedTokenizerTrait: Send + Sync { pub trait BoxedTokenizer: Send + Sync {
/// Tokenize a `&str` /// Tokenize a `&str`
fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>; fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
@@ -90,41 +92,7 @@ trait BoxedTokenizerTrait: Send + Sync {
fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b>; fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b>;
/// Return a boxed clone of the tokenizer /// Return a boxed clone of the tokenizer
fn boxed_clone(&self) -> BoxedTokenizer; fn boxed_clone(&self) -> Box<dyn BoxedTokenizer>;
}
/// A boxed tokenizer
pub struct BoxedTokenizer(Box<dyn BoxedTokenizerTrait>);
impl<T> From<T> for BoxedTokenizer
where
T: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
fn from(tokenizer: T) -> BoxedTokenizer {
BoxedTokenizer(Box::new(BoxableTokenizer(tokenizer)))
}
}
impl BoxedTokenizer {
/// Tokenize a `&str`
pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
self.0.token_stream(text)
}
/// Tokenize an array`&str`
///
/// The resulting `TokenStream` is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent accidental `PhraseQuery` to match accross two terms.
pub fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b> {
self.0.token_stream_texts(texts)
}
}
impl Clone for BoxedTokenizer {
fn clone(&self) -> BoxedTokenizer {
self.0.boxed_clone()
}
} }
#[derive(Clone)] #[derive(Clone)]
@@ -132,7 +100,7 @@ struct BoxableTokenizer<A>(A)
where where
A: for<'a> Tokenizer<'a> + Send + Sync; A: for<'a> Tokenizer<'a> + Send + Sync;
impl<A> BoxedTokenizerTrait for BoxableTokenizer<A> impl<A> BoxedTokenizer for BoxableTokenizer<A>
where where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>, A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{ {
@@ -157,11 +125,18 @@ where
} }
} }
fn boxed_clone(&self) -> BoxedTokenizer { fn boxed_clone(&self) -> Box<dyn BoxedTokenizer> {
self.0.clone().into() Box::new(self.clone())
} }
} }
pub(crate) fn box_tokenizer<A>(a: A) -> Box<dyn BoxedTokenizer>
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
Box::new(BoxableTokenizer(a))
}
impl<'b> TokenStream for Box<dyn TokenStream + 'b> { impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
fn advance(&mut self) -> bool { fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut(); let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -186,6 +161,7 @@ impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
/// # Example /// # Example
/// ///
/// ``` /// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*; /// use tantivy::tokenizer::*;
/// ///
/// # fn main() { /// # fn main() {
@@ -227,6 +203,7 @@ pub trait TokenStream {
/// and `.token()`. /// and `.token()`.
/// ///
/// ``` /// ```
/// # extern crate tantivy;
/// # use tantivy::tokenizer::*; /// # use tantivy::tokenizer::*;
/// # /// #
/// # fn main() { /// # fn main() {

View File

@@ -1,3 +1,4 @@
use crate::tokenizer::box_tokenizer;
use crate::tokenizer::stemmer::Language; use crate::tokenizer::stemmer::Language;
use crate::tokenizer::BoxedTokenizer; use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::LowerCaser; use crate::tokenizer::LowerCaser;
@@ -7,6 +8,7 @@ use crate::tokenizer::SimpleTokenizer;
use crate::tokenizer::Stemmer; use crate::tokenizer::Stemmer;
use crate::tokenizer::Tokenizer; use crate::tokenizer::Tokenizer;
use std::collections::HashMap; use std::collections::HashMap;
use std::ops::Deref;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
/// The tokenizer manager serves as a store for /// The tokenizer manager serves as a store for
@@ -23,16 +25,16 @@ use std::sync::{Arc, RwLock};
/// search engine. /// search engine.
#[derive(Clone)] #[derive(Clone)]
pub struct TokenizerManager { pub struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>, tokenizers: Arc<RwLock<HashMap<String, Box<dyn BoxedTokenizer>>>>,
} }
impl TokenizerManager { impl TokenizerManager {
/// Registers a new tokenizer associated with a given name. /// Registers a new tokenizer associated with a given name.
pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A) pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
where where
A: Into<BoxedTokenizer>, A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{ {
let boxed_tokenizer = tokenizer.into(); let boxed_tokenizer = box_tokenizer(tokenizer);
self.tokenizers self.tokenizers
.write() .write()
.expect("Acquiring the lock should never fail") .expect("Acquiring the lock should never fail")
@@ -40,12 +42,13 @@ impl TokenizerManager {
} }
/// Accessing a tokenizer given its name. /// Accessing a tokenizer given its name.
pub fn get(&self, tokenizer_name: &str) -> Option<BoxedTokenizer> { pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn BoxedTokenizer>> {
self.tokenizers self.tokenizers
.read() .read()
.expect("Acquiring the lock should never fail") .expect("Acquiring the lock should never fail")
.get(tokenizer_name) .get(tokenizer_name)
.cloned() .map(Deref::deref)
.map(BoxedTokenizer::boxed_clone)
} }
} }