Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-30 22:12:55 +00:00

Compare commits: criterion ... streamer-w (6 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 790baa7adf | |
| | 039c0a0863 | |
| | b3b0138b82 | |
| | ea56160cdc | |
| | 028b0a749c | |
| | 941f06eb9f | |
@@ -47,6 +47,7 @@ matrix:
 before_install:
 - set -e
 - rustup self update
+- rustup component add rustfmt

 install:
 - sh ci/install.sh
@@ -60,6 +61,7 @@ before_script:

 script:
 - bash ci/script.sh
+- cargo fmt --all -- --check

 before_deploy:
 - sh ci/before_deploy.sh
@@ -5,7 +5,12 @@ Tantivy 0.11.0
 - Various bugfixes in the query parser.
 - Better handling of hyphens in query parser. (#609)
 - Better handling of whitespaces.
 - Closes #498 - add support for Elastic-style unbounded range queries for alphanumeric types eg. "title:>hello", "weight:>=70.5", "height:<200" (@petr-tik)
+- API change around `Box<BoxableTokenizer>`. See detail in #629
+
+## How to update?
+
+`Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.

 Tantivy 0.10.1
 =====================
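The changelog entries above are the user-visible changes in this compare. As a quick illustration of the new Elastic-style range syntax, here is a minimal sketch; the schema and field names are illustrative, not taken from the diff:

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let parser = QueryParser::for_index(&index, vec![title]);
    // Elastic-style unbounded ranges now parse: `title:>hello` is the
    // shorthand for the classic `title:{hello TO *}` syntax, as the
    // parser tests later in this diff spell out.
    let _gt = parser.parse_query("title:>hello").unwrap();
    let _lte = parser.parse_query("title:<=hello").unwrap();
}
```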
Cargo.toml
@@ -17,7 +17,7 @@ base64 = "0.10.0"
|
||||
byteorder = "1.0"
|
||||
once_cell = "0.2"
|
||||
regex = "1.0"
|
||||
tantivy-fst = "0.1"
|
||||
tantivy-fst = {git="https://github.com/tantivy-search/fst"}
|
||||
memmap = {version = "0.7", optional=true}
|
||||
lz4 = {version="1.20", optional=true}
|
||||
snap = {version="0.2"}
|
||||
@@ -62,7 +62,6 @@ rand = "0.7"
|
||||
maplit = "1"
|
||||
matches = "0.1.8"
|
||||
time = "0.1.42"
|
||||
criterion = "0.2"
|
||||
|
||||
[profile.release]
|
||||
opt-level = 3
|
||||
@@ -75,7 +74,6 @@ overflow-checks = true

 [features]
 default = ["mmap"]
-forbench = []
 mmap = ["atomicwrites", "fs2", "memmap", "notify"]
 lz4-compression = ["lz4"]
 failpoints = ["fail/failpoints"]
@@ -99,15 +97,3 @@ features = ["failpoints"]
|
||||
name = "failpoints"
|
||||
path = "tests/failpoints/mod.rs"
|
||||
required-features = ["fail/failpoints"]
|
||||
|
||||
[profile.bench]
|
||||
lto = true
|
||||
|
||||
[[bench]]
|
||||
name = "vint"
|
||||
harness = false
|
||||
|
||||
|
||||
[[bench]]
|
||||
name = "fastfield"
|
||||
harness = false
|
||||
@@ -1,73 +0,0 @@
-use criterion::{criterion_group, criterion_main, Criterion};
-use rand::distributions::{Bernoulli, Uniform};
-use rand::rngs::StdRng;
-use rand::{Rng, SeedableRng};
-use tantivy::forbench::bitset::{BitSet, TinySet};
-use tantivy::query::BitSetDocSet;
-use tantivy::DocSet;
-
-fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
-    StdRng::from_seed([seed_val; 32])
-        .sample_iter(&Bernoulli::new(ratio).unwrap())
-        .take(n as usize)
-        .enumerate()
-        .filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
-        .collect()
-}
-
-fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
-    let seed: [u8; 32] = [1; 32];
-    StdRng::from_seed(seed)
-        .sample_iter(&Uniform::new(0u32, max_value))
-        .take(n_elems)
-        .collect::<Vec<u32>>()
-}
-
-fn bench_tinyset_pop(criterion: &mut Criterion) {
-    criterion.bench_function("pop_lowest", |b| {
-        b.iter(|| {
-            let mut tinyset = TinySet::singleton(criterion::black_box(31u32));
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-        })
-    });
-}
-
-fn bench_bitset_insert(criterion: &mut Criterion) {
-    criterion.bench_function_over_inputs(
-        "bitset_insert",
-        |bench, (max_value, n_elems)| {
-            let els = generate_nonunique_unsorted(*max_value, *n_elems);
-            bench.iter(move || {
-                let mut bitset = BitSet::with_max_value(1_000_000);
-                for el in els.iter().cloned() {
-                    bitset.insert(el);
-                }
-            });
-        },
-        vec![(1_000_000u32, 10_000)],
-    );
-}
-
-fn bench_bitsetdocset_iterate(b: &mut test::Bencher) {
-    let mut bitset = BitSet::with_max_value(1_000_000);
-    for el in sample_with_seed(1_000_000u32, 0.01, 0u8) {
-        bitset.insert(el);
-    }
-    b.iter(|| {
-        let mut docset = BitSetDocSet::from(bitset.clone());
-        while docset.advance() {}
-    });
-}
-
-criterion_group!(
-    benches,
-    bench_tinyset_pop,
-    bench_bitset_insert,
-    bench_bitsetdocset_iterate
-);
-criterion_main!(benches);
@@ -1,107 +0,0 @@
-use criterion::criterion_group;
-use criterion::criterion_main;
-use criterion::Criterion;
-use criterion::ParameterizedBenchmark;
-use rand::rngs::StdRng;
-use rand::seq::SliceRandom;
-use rand::SeedableRng;
-use tantivy::schema::{Schema, FAST};
-use tantivy::{doc, DocId, Index};
-
-const NUM_LOOKUPS: usize = 1_000;
-
-fn generate_permutation(stride: usize, bit_width: u8) -> Vec<u64> {
-    let mut permutation: Vec<u64> = (0u64..(NUM_LOOKUPS * stride) as u64).collect();
-    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
-    permutation.push(1u64 << (bit_width as u64)); //< just to force the bit_width
-    permutation
-}
-
-fn bench_linear_lookup(c: &mut Criterion) {
-    c.bench(
-        "lookup_stride",
-        ParameterizedBenchmark::new(
-            "baseline_vec",
-            |bench, (stride, num_bits)| {
-                let arr = generate_permutation(*stride, *num_bits);
-                bench.iter(move || {
-                    let mut a = 0u64;
-                    for i in (0..NUM_LOOKUPS / stride).map(|v| v * 7) {
-                        a ^= arr[i as usize];
-                    }
-                    a
-                })
-            },
-            vec![(7, 1), (7, 5), (7, 20)],
-        )
-        .with_function("fastfield", |bench, (stride, num_bits)| {
-            let mut schema_builder = Schema::builder();
-            let val_field = schema_builder.add_u64_field("val", FAST);
-            let schema = schema_builder.build();
-
-            let index = Index::create_in_ram(schema);
-            let mut index_writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
-            for el in generate_permutation(*stride, *num_bits) {
-                index_writer.add_document(doc!(val_field=>el));
-            }
-            index_writer.commit().unwrap();
-            let reader = index.reader().unwrap();
-            let searcher = reader.searcher();
-            let segment_reader = searcher.segment_reader(0u32);
-            let fast_field_reader = segment_reader.fast_fields().u64(val_field).unwrap();
-            bench.iter(move || {
-                let mut a = 0u64;
-                for i in (0..NUM_LOOKUPS / stride).map(|v| v * 7) {
-                    a ^= fast_field_reader.get(i as DocId);
-                }
-                a
-            })
-        }),
-    );
-}
-
-fn bench_jumpy_lookup(c: &mut Criterion) {
-    c.bench(
-        "lookup_jumpy",
-        ParameterizedBenchmark::new(
-            "baseline_vec",
-            |bench, (stride, num_bits)| {
-                let arr = generate_permutation(*stride, *num_bits);
-                bench.iter(move || {
-                    let mut a = 0u64;
-                    for _ in 0..NUM_LOOKUPS {
-                        a = arr[a as usize];
-                    }
-                    a
-                })
-            },
-            vec![(7, 1), (7, 5), (7, 20)],
-        )
-        .with_function("fastfield", |bench, (stride, num_bits)| {
-            let mut schema_builder = Schema::builder();
-            let val_field = schema_builder.add_u64_field("val", FAST);
-            let schema = schema_builder.build();
-
-            let index = Index::create_in_ram(schema);
-            let mut index_writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
-            for el in generate_permutation(*stride, *num_bits) {
-                index_writer.add_document(doc!(val_field=>el));
-            }
-            index_writer.commit().unwrap();
-            let reader = index.reader().unwrap();
-            let searcher = reader.searcher();
-            let segment_reader = searcher.segment_reader(0u32);
-            let fast_field_reader = segment_reader.fast_fields().u64(val_field).unwrap();
-            bench.iter(move || {
-                let mut a = 0u64;
-                for _ in 0..NUM_LOOKUPS {
-                    a = fast_field_reader.get(a as DocId);
-                }
-                a
-            })
-        }),
-    );
-}
-
-criterion_group!(benches, bench_linear_lookup, bench_jumpy_lookup);
-criterion_main!(benches);
@@ -1,50 +0,0 @@
-use criterion::{criterion_group, criterion_main, Criterion};
-use rand::rngs::StdRng;
-use rand::{Rng, SeedableRng};
-use tantivy::query::QueryParser;
-use tantivy::schema::{Schema, STRING};
-use tantivy::{Document, Index};
-
-fn bench_union(criterion: &mut Criterion) {
-    criterion.bench_function_over_inputs(
-        "union_docset_fulladvance",
-        |bench, (ratio_left, ratio_right)| {
-            let mut schema_builder = Schema::builder();
-            let field = schema_builder.add_text_field("val", STRING);
-            let schema = schema_builder.build();
-            let index = Index::create_in_ram(schema);
-            let mut index_writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
-            let mut stdrng = StdRng::from_seed([0u8; 32]);
-            for _ in 0u32..100_000u32 {
-                let mut doc = Document::default();
-                if stdrng.gen_bool(*ratio_left) {
-                    doc.add_text(field, "left");
-                }
-                if stdrng.gen_bool(*ratio_right) {
-                    doc.add_text(field, "right");
-                }
-                index_writer.add_document(doc);
-            }
-            index_writer.commit().unwrap();
-            let reader = index.reader().unwrap();
-            let searcher = reader.searcher();
-
-            let query = QueryParser::for_index(&index, vec![field])
-                .parse_query("left right")
-                .unwrap();
-
-            bench.iter(move || {
-                let weight = query.weight(&searcher, false).unwrap();
-                let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
-                let mut sum_docs = 0u64;
-                scorer.for_each(&mut |doc_id, _score| {
-                    sum_docs += doc_id as u64;
-                });
-            });
-        },
-        vec![(0.2, 0.1), (0.2, 0.02)],
-    );
-}
-
-criterion_group!(benches, bench_union);
-criterion_main!(benches);
@@ -1,72 +0,0 @@
-use criterion::{criterion_group, criterion_main, Criterion, ParameterizedBenchmark};
-use rand::rngs::StdRng;
-use rand::Rng;
-use rand::SeedableRng;
-use tantivy::forbench::compression::{compressed_block_size, BlockDecoder};
-use tantivy::forbench::compression::{BlockEncoder, VIntEncoder};
-use tantivy::forbench::compression::{VIntDecoder, COMPRESSION_BLOCK_SIZE};
-
-fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
-    let seed: [u8; 32] = [seed_val; 32];
-    let mut rng = StdRng::from_seed(seed);
-    (0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
-}
-
-pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
-    generate_array_with_seed(n, ratio, 4)
-}
-
-fn bench_compress(criterion: &mut Criterion) {
-    criterion.bench(
-        "compress_sorted",
-        ParameterizedBenchmark::new(
-            "bitpack",
-            |bench, ratio| {
-                let mut encoder = BlockEncoder::new();
-                let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
-                bench.iter(|| {
-                    encoder.compress_block_sorted(&data, 0u32);
-                });
-            },
-            vec![0.1],
-        )
-        .with_function("vint", |bench, ratio| {
-            let mut encoder = BlockEncoder::new();
-            let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
-            bench.iter(|| {
-                encoder.compress_vint_sorted(&data, 0u32);
-            });
-        }),
-    );
-}
-
-fn bench_uncompress(criterion: &mut Criterion) {
-    criterion.bench(
-        "uncompress_sorted",
-        ParameterizedBenchmark::new(
-            "bitpack",
-            |bench, ratio| {
-                let mut encoder = BlockEncoder::new();
-                let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
-                let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
-                let mut decoder = BlockDecoder::new();
-                bench.iter(|| {
-                    decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
-                });
-            },
-            vec![0.1],
-        )
-        .with_function("vint", |bench, ratio| {
-            let mut encoder = BlockEncoder::new();
-            let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
-            let compressed = encoder.compress_vint_sorted(&data, 0u32);
-            let mut decoder = BlockDecoder::new();
-            bench.iter(move || {
-                decoder.uncompress_vint_sorted(compressed, 0u32, COMPRESSION_BLOCK_SIZE);
-            });
-        }),
-    );
-}
-
-criterion_group!(benches, bench_compress, bench_uncompress);
-criterion_main!(benches);
@@ -1,2 +0,0 @@
-#!/usr/bin/env bash
-cargo bench --features forbench
@@ -86,7 +86,6 @@ where
     }
 }

 #[inline(always)]
 pub fn get(&self, idx: u64) -> u64 {
     if self.num_bits == 0 {
         return 0u64;
@@ -2,7 +2,7 @@ use std::fmt;
 use std::u64;

 #[derive(Clone, Copy, Eq, PartialEq)]
-pub struct TinySet(u64);
+pub(crate) struct TinySet(u64);

 impl fmt::Debug for TinySet {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -353,3 +353,43 @@ mod tests {
         }
     }
 }
+
+#[cfg(all(test, feature = "unstable"))]
+mod bench {
+
+    use super::BitSet;
+    use super::TinySet;
+    use test;
+
+    #[bench]
+    fn bench_tinyset_pop(b: &mut test::Bencher) {
+        b.iter(|| {
+            let mut tinyset = TinySet::singleton(test::black_box(31u32));
+            tinyset.pop_lowest();
+            tinyset.pop_lowest();
+            tinyset.pop_lowest();
+            tinyset.pop_lowest();
+            tinyset.pop_lowest();
+            tinyset.pop_lowest();
+        });
+    }
+
+    #[bench]
+    fn bench_tinyset_sum(b: &mut test::Bencher) {
+        let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
+        b.iter(|| {
+            assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
+        });
+    }
+
+    #[bench]
+    fn bench_tinyarr_sum(b: &mut test::Bencher) {
+        let v = [10u32, 14u32, 21u32];
+        b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
+    }
+
+    #[bench]
+    fn bench_bitset_initialize(b: &mut test::Bencher) {
+        b.iter(|| BitSet::with_max_value(1_000_000));
+    }
+}
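The restored benchmark above measures `TinySet::pop_lowest`. A `TinySet` is a 64-bit bitset, and popping the lowest element amounts to a trailing-zeros count plus clearing one bit. A standalone sketch of that operation (illustrative only, since the real type becomes crate-private in this compare):

```rust
// Pop the lowest set bit of a 64-bit set, returning its index.
fn pop_lowest(set: &mut u64) -> Option<u32> {
    if *set == 0 {
        None
    } else {
        let lowest = set.trailing_zeros();
        *set &= *set - 1; // clear the lowest set bit
        Some(lowest)
    }
}

fn main() {
    let mut set = 0b1010_0100u64; // elements {2, 5, 7}
    assert_eq!(pop_lowest(&mut set), Some(2));
    assert_eq!(pop_lowest(&mut set), Some(5));
    assert_eq!(pop_lowest(&mut set), Some(7));
    assert_eq!(pop_lowest(&mut set), None);
}
```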
@@ -6,7 +6,7 @@ mod serialize;
 mod vint;

 pub use self::bitset::BitSet;
-pub use self::bitset::TinySet;
+pub(crate) use self::bitset::TinySet;
 pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
 pub use self::counting_writer::CountingWriter;
 pub use self::serialize::{BinarySerializable, FixedSize};
@@ -173,11 +173,11 @@ impl Index {
     }

     /// Helper to access the tokenizer associated to a specific field.
-    pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<dyn BoxedTokenizer>> {
+    pub fn tokenizer_for_field(&self, field: Field) -> Result<BoxedTokenizer> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
         let tokenizer_manager: &TokenizerManager = self.tokenizers();
-        let tokenizer_name_opt: Option<Box<dyn BoxedTokenizer>> = match field_type {
+        let tokenizer_name_opt: Option<BoxedTokenizer> = match field_type {
             FieldType::Str(text_options) => text_options
                 .get_indexing_options()
                 .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
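This signature change is the API break called out in the changelog: `Box<dyn BoxableTokenizer>` gives way to a plain `BoxedTokenizer` struct. A minimal migration sketch, assuming the `tantivy::tokenizer` module layout of 0.11 (the snippet tests later in this diff use the same `Into` conversion):

```rust
use tantivy::tokenizer::{BoxedTokenizer, SimpleTokenizer};

// Before (0.10): let tok: Box<dyn BoxedTokenizer> = box_tokenizer(SimpleTokenizer);
// After (0.11): the boxed-tokenizer struct is built through `Into`.
fn make_tokenizer() -> BoxedTokenizer {
    SimpleTokenizer.into()
}
```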
@@ -431,3 +431,111 @@ mod tests {
     }

 }
+
+#[cfg(all(test, feature = "unstable"))]
+mod bench {
+    use super::tests::FIELD;
+    use super::tests::{generate_permutation, SCHEMA};
+    use super::*;
+    use common::CompositeFile;
+    use directory::{Directory, RAMDirectory, WritePtr};
+    use fastfield::FastFieldReader;
+    use std::collections::HashMap;
+    use std::path::Path;
+    use test::{self, Bencher};
+
+    #[bench]
+    fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        b.iter(|| {
+            let n = test::black_box(7000u32);
+            let mut a = 0u64;
+            for i in (0u32..n / 7).map(|v| v * 7) {
+                a ^= permutation[i as usize];
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_veclookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        b.iter(|| {
+            let n = test::black_box(1000u32);
+            let mut a = 0u64;
+            for _ in 0u32..n {
+                a = permutation[a as usize];
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
+        let path = Path::new("test");
+        let permutation = generate_permutation();
+        let mut directory: RAMDirectory = RAMDirectory::create();
+        {
+            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
+            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
+            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
+            for &x in &permutation {
+                fast_field_writers.add_document(&doc!(*FIELD=>x));
+            }
+            fast_field_writers
+                .serialize(&mut serializer, &HashMap::new())
+                .unwrap();
+            serializer.close().unwrap();
+        }
+        let source = directory.open_read(&path).unwrap();
+        {
+            let fast_fields_composite = CompositeFile::open(&source).unwrap();
+            let data = fast_fields_composite.open_read(*FIELD).unwrap();
+            let fast_field_reader = FastFieldReader::<u64>::open(data);
+
+            b.iter(|| {
+                let n = test::black_box(7000u32);
+                let mut a = 0u64;
+                for i in (0u32..n / 7).map(|val| val * 7) {
+                    a ^= fast_field_reader.get(i);
+                }
+                a
+            });
+        }
+    }
+
+    #[bench]
+    fn bench_intfastfield_fflookup(b: &mut Bencher) {
+        let path = Path::new("test");
+        let permutation = generate_permutation();
+        let mut directory: RAMDirectory = RAMDirectory::create();
+        {
+            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
+            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
+            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
+            for &x in &permutation {
+                fast_field_writers.add_document(&doc!(*FIELD=>x));
+            }
+            fast_field_writers
+                .serialize(&mut serializer, &HashMap::new())
+                .unwrap();
+            serializer.close().unwrap();
+        }
+        let source = directory.open_read(&path).unwrap();
+        {
+            let fast_fields_composite = CompositeFile::open(&source).unwrap();
+            let data = fast_fields_composite.open_read(*FIELD).unwrap();
+            let fast_field_reader = FastFieldReader::<u64>::open(data);
+
+            b.iter(|| {
+                let n = test::black_box(1000u32);
+                let mut a = 0u32;
+                for _ in 0u32..n {
+                    a = fast_field_reader.get(a) as u32;
+                }
+                a
+            });
+        }
+    }
+
+}
@@ -67,12 +67,10 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
// `maxdoc`.
|
||||
#[inline(always)]
|
||||
pub fn get(&self, doc: DocId) -> Item {
|
||||
self.get_u64(u64::from(doc))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
|
||||
}
|
||||
|
||||
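The `get_u64` body above shows the fast-field decoding contract: a stored value is the segment-wide minimum plus a fixed-width, bit-packed offset. A self-contained sketch of such a lookup; names are illustrative, and it assumes 0 < num_bits < 64 plus one padding word at the end of the data (tantivy's real `BitUnpacker` handles the edge widths):

```rust
/// Read the `idx`-th `num_bits`-wide offset from `data` and add the minimum.
fn get_packed(data: &[u64], num_bits: u32, min_value: u64, idx: u64) -> u64 {
    let bit_pos = idx * num_bits as u64;
    let word = (bit_pos / 64) as usize;
    let shift = (bit_pos % 64) as u32;
    let mask = (1u64 << num_bits) - 1;
    let mut bits = data[word] >> shift;
    if shift + num_bits > 64 {
        // the value straddles a word boundary
        bits |= data[word + 1] << (64 - shift);
    }
    min_value + (bits & mask)
}

fn main() {
    // Pack the values [3, 5, 7] as 3-bit offsets from min_value = 3:
    // offsets 0, 2, 4 -> bit groups 000, 010, 100 (lowest bits first).
    let data = [0b100_010_000u64, 0u64];
    assert_eq!(get_packed(&data, 3, 3, 0), 3);
    assert_eq!(get_packed(&data, 3, 3, 1), 5);
    assert_eq!(get_packed(&data, 3, 3, 2), 7);
}
```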
@@ -49,7 +49,7 @@ pub struct SegmentWriter {
     fast_field_writers: FastFieldsWriter,
     fieldnorms_writer: FieldNormsWriter,
     doc_opstamps: Vec<Opstamp>,
-    tokenizers: Vec<Option<Box<dyn BoxedTokenizer>>>,
+    tokenizers: Vec<Option<BoxedTokenizer>>,
 }

 impl SegmentWriter {
src/lib.rs
@@ -249,6 +249,7 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);

 #[cfg(test)]
 mod tests {

     use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
     use crate::core::SegmentReader;
     use crate::docset::DocSet;
@@ -894,15 +895,3 @@ mod tests {
         }
     }
 }
-
-#[cfg(feature = "forbench")]
-pub mod forbench {
-    pub mod compression {
-        pub use crate::postings::compression::*;
-    }
-
-    pub mod bitset {
-        pub use crate::common::BitSet;
-        pub use crate::common::TinySet;
-    }
-}
@@ -160,9 +160,9 @@ impl VIntEncoder for BlockEncoder {
 }

 impl VIntDecoder for BlockDecoder {
-    fn uncompress_vint_sorted(
+    fn uncompress_vint_sorted<'a>(
         &mut self,
-        compressed_data: &[u8],
+        compressed_data: &'a [u8],
         offset: u32,
         num_els: usize,
     ) -> usize {
@@ -170,7 +170,7 @@ impl VIntDecoder for BlockDecoder {
         vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
     }

-    fn uncompress_vint_unsorted(&mut self, compressed_data: &[u8], num_els: usize) -> usize {
+    fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
         self.output_len = num_els;
         vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
     }
@@ -268,17 +268,78 @@ pub mod tests {
             }
         }
     }
 }

 #[cfg(all(test, feature = "unstable"))]
 mod bench {

     use super::*;
     use rand::SeedableRng;
     use rand::{Rng, XorShiftRng};
     use test::Bencher;

     fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
         let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
         let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
         (0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
     }

     pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
         generate_array_with_seed(n, ratio, 4)
     }

     #[bench]
     fn bench_compress(b: &mut Bencher) {
         let mut encoder = BlockEncoder::new();
         let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
         b.iter(|| {
             encoder.compress_block_sorted(&data, 0u32);
         });
     }

     #[bench]
     fn bench_uncompress(b: &mut Bencher) {
         let mut encoder = BlockEncoder::new();
         let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
         let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
         let mut decoder = BlockDecoder::new();
         b.iter(|| {
             decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
         });
     }

     #[test]
     fn test_all_docs_compression_numbits() {
-        for expected_num_bits in 0u8..33u8 {
+        for expected_num_bits in 0u8.. {
             let mut data = [0u32; 128];
             if expected_num_bits > 0 {
-                data[0] = (1u64 << (expected_num_bits as u64) - 1u64) as u32;
+                data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
             }
             let mut encoder = BlockEncoder::new();
             let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
             assert_eq!(compressed.len(), compressed_block_size(num_bits));
         }
     }

     const NUM_INTS_BENCH_VINT: usize = 10;

     #[bench]
     fn bench_compress_vint(b: &mut Bencher) {
         let mut encoder = BlockEncoder::new();
         let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
         b.iter(|| {
             encoder.compress_vint_sorted(&data, 0u32);
         });
     }

     #[bench]
     fn bench_uncompress_vint(b: &mut Bencher) {
         let mut encoder = BlockEncoder::new();
         let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
         let compressed = encoder.compress_vint_sorted(&data, 0u32);
         let mut decoder = BlockDecoder::new();
         b.iter(|| {
             decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
         });
     }
 }
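The benches above compare bit-packed block compression against variable-length integers for sorted postings. A generic sketch of the sorted-vint idea (delta-encode against the previous value, then emit LEB128-style bytes); this mirrors what `compress_vint_sorted` does conceptually, not its exact byte layout:

```rust
fn vint_encode_sorted(values: &[u32], offset: u32, out: &mut Vec<u8>) {
    // Delta-encode against the previous value (input is sorted), then emit
    // 7 payload bits per byte; the high bit flags "more bytes follow".
    let mut prev = offset;
    for &v in values {
        let mut delta = v - prev;
        prev = v;
        loop {
            let byte = (delta & 0x7f) as u8;
            delta >>= 7;
            if delta == 0 {
                out.push(byte);
                break;
            }
            out.push(byte | 0x80);
        }
    }
}

fn main() {
    let mut out = Vec::new();
    vint_encode_sorted(&[3, 7, 200], 0, &mut out);
    // deltas 3, 4, 193 -> one byte each for 3 and 4, two bytes for 193
    assert_eq!(out, vec![3, 4, 0xC1, 0x01]);
}
```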
@@ -3,7 +3,6 @@ Postings module (also called inverted index)
 */

 mod block_search;
-
 pub(crate) mod compression;
 /// Postings module
 ///
@@ -14,6 +14,7 @@ use tantivy_fst::Automaton;
 pub struct AutomatonWeight<A>
 where
     A: Automaton + Send + Sync + 'static,
+    A::State: Clone + Default + Sized,
 {
     field: Field,
     automaton: A,
@@ -22,6 +23,7 @@ where
 impl<A> AutomatonWeight<A>
 where
     A: Automaton + Send + Sync + 'static,
+    A::State: Clone + Default + Sized,
 {
     /// Create a new AutomationWeight
     pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
@@ -37,6 +39,7 @@ where
 impl<A> Weight for AutomatonWeight<A>
 where
     A: Automaton + Send + Sync + 'static,
+    A::State: Clone + Default + Sized,
 {
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<dyn Scorer>> {
         let max_doc = reader.max_doc();
@@ -218,3 +218,49 @@ mod tests {
     }

 }
+
+#[cfg(all(test, feature = "unstable"))]
+mod bench {
+
+    use super::BitSet;
+    use super::BitSetDocSet;
+    use test;
+    use tests;
+    use DocSet;
+
+    #[bench]
+    fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
+        use tests;
+        let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
+        b.iter(|| {
+            let mut bitset = BitSet::with_max_value(1_000_000);
+            for el in els.iter().cloned() {
+                bitset.insert(el);
+            }
+        });
+    }
+
+    #[bench]
+    fn bench_bitset_1pct_clone(b: &mut test::Bencher) {
+        use tests;
+        let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
+        let mut bitset = BitSet::with_max_value(1_000_000);
+        for el in els {
+            bitset.insert(el);
+        }
+        b.iter(|| bitset.clone());
+    }
+
+    #[bench]
+    fn bench_bitset_1pct_clone_iterate(b: &mut test::Bencher) {
+        let els = tests::sample(1_000_000u32, 0.01);
+        let mut bitset = BitSet::with_max_value(1_000_000);
+        for el in els {
+            bitset.insert(el);
+        }
+        b.iter(|| {
+            let mut docset = BitSetDocSet::from(bitset.clone());
+            while docset.advance() {}
+        });
+    }
+}
@@ -83,28 +83,67 @@ parser! {
 }

 parser! {
     /// Function that parses a range out of a Stream
     /// Supports ranges like:
     /// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
     /// [a TO *], [a TO c], [abc TO bcd}
     fn range[I]()(I) -> UserInputLeaf
     where [I: Stream<Item = char>] {
+        let range_term_val = || {
+            word().or(negative_number()).or(char('*').with(value("*".to_string())))
+        };
+
+        // check for unbounded range in the form of <5, <=10, >5, >=5
+        let elastic_unbounded_range = (choice([attempt(string(">=")),
+                                               attempt(string("<=")),
+                                               attempt(string("<")),
+                                               attempt(string(">"))])
+                                       .skip(spaces()),
+                                       range_term_val())
+            .map(|(comparison_sign, bound): (&str, String)|
+                match comparison_sign {
+                    ">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
+                    "<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)),
+                    "<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)),
+                    ">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded),
+                    // default case
+                    _ => (UserInputBound::Unbounded, UserInputBound::Unbounded)
+                });
         let lower_bound = (one_of("{[".chars()), range_term_val())
             .map(|(boundary_char, lower_bound): (char, String)|
-                if boundary_char == '{' { UserInputBound::Exclusive(lower_bound) }
-                else { UserInputBound::Inclusive(lower_bound) });
+                if lower_bound == "*" {
+                    UserInputBound::Unbounded
+                } else if boundary_char == '{' {
+                    UserInputBound::Exclusive(lower_bound)
+                } else {
+                    UserInputBound::Inclusive(lower_bound)
+                });
         let upper_bound = (range_term_val(), one_of("}]".chars()))
             .map(|(higher_bound, boundary_char): (String, char)|
-                if boundary_char == '}' { UserInputBound::Exclusive(higher_bound) }
-                else { UserInputBound::Inclusive(higher_bound) });
-        (
-            optional(field()),
-            lower_bound
-                .skip((spaces(), string("TO"), spaces())),
-            upper_bound,
-        ).map(|(field, lower, upper)| UserInputLeaf::Range {
-            field,
-            lower,
-            upper
-        })
+                if higher_bound == "*" {
+                    UserInputBound::Unbounded
+                } else if boundary_char == '}' {
+                    UserInputBound::Exclusive(higher_bound)
+                } else {
+                    UserInputBound::Inclusive(higher_bound)
+                });
+        // return only lower and upper
+        let lower_to_upper = (lower_bound.
+                              skip((spaces(),
+                                    string("TO"),
+                                    spaces())),
+                              upper_bound);
+
+        (optional(field()).skip(spaces()),
+         // try elastic first, if it matches, the range is unbounded
+         attempt(elastic_unbounded_range).or(lower_to_upper))
+            .map(|(field, (lower, upper))|
+                // Construct the leaf from extracted field (optional)
+                // and bounds
+                UserInputLeaf::Range {
+                    field,
+                    lower,
+                    upper
+                })
     }
 }
@@ -258,6 +297,49 @@ mod test {
     );
 }

+#[test]
+fn test_parse_elastic_query_ranges() {
+    test_parse_query_to_ast_helper("title: >a", "title:{\"a\" TO \"*\"}");
+    test_parse_query_to_ast_helper("title:>=a", "title:[\"a\" TO \"*\"}");
+    test_parse_query_to_ast_helper("title: <a", "title:{\"*\" TO \"a\"}");
+    test_parse_query_to_ast_helper("title:<=a", "title:{\"*\" TO \"a\"]");
+    test_parse_query_to_ast_helper("title:<=bsd", "title:{\"*\" TO \"bsd\"]");
+
+    test_parse_query_to_ast_helper("weight: >70", "weight:{\"70\" TO \"*\"}");
+    test_parse_query_to_ast_helper("weight:>=70", "weight:[\"70\" TO \"*\"}");
+    test_parse_query_to_ast_helper("weight: <70", "weight:{\"*\" TO \"70\"}");
+    test_parse_query_to_ast_helper("weight:<=70", "weight:{\"*\" TO \"70\"]");
+    test_parse_query_to_ast_helper("weight: >60.7", "weight:{\"60.7\" TO \"*\"}");
+
+    test_parse_query_to_ast_helper("weight: <= 70", "weight:{\"*\" TO \"70\"]");
+
+    test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
+}
+
+#[test]
+fn test_range_parser() {
+    // testing the range() parser separately
+    let res = range().parse("title: <hello").unwrap().0;
+    let expected = UserInputLeaf::Range {
+        field: Some("title".to_string()),
+        lower: UserInputBound::Unbounded,
+        upper: UserInputBound::Exclusive("hello".to_string()),
+    };
+    let res2 = range().parse("title:{* TO hello}").unwrap().0;
+    assert_eq!(res, expected);
+    assert_eq!(res2, expected);
+    let expected_weight = UserInputLeaf::Range {
+        field: Some("weight".to_string()),
+        lower: UserInputBound::Inclusive("71.2".to_string()),
+        upper: UserInputBound::Unbounded,
+    };
+
+    let res3 = range().parse("weight: >=71.2").unwrap().0;
+    let res4 = range().parse("weight:[71.2 TO *}").unwrap().0;
+    assert_eq!(res3, expected_weight);
+    assert_eq!(res4, expected_weight);
+}
+
 #[test]
 fn test_parse_query_to_triming_spaces() {
     test_parse_query_to_ast_helper(" abc", "\"abc\"");
@@ -291,7 +373,7 @@ mod test {
     test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
     test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
     test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
-    test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:[\"*\" TO \"toto\"}");
+    test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:{\"*\" TO \"toto\"}");
     test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
     test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
     test_is_parse_err("abc + ");
@@ -369,6 +369,7 @@ impl QueryParser {
     match *bound {
         UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
         UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
+        UserInputBound::Unbounded => Ok(Bound::Unbounded),
     }
 }
@@ -628,7 +629,7 @@ mod test {
 pub fn test_parse_query_untokenized() {
     test_parse_query_to_logical_ast_helper(
         "nottokenized:\"wordone wordtwo\"",
-        "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
+        "Term(field=7,bytes=[119, 111, 114, 100, 111, 110, \
         101, 32, 119, 111, 114, 100, 116, 119, 111])",
         false,
     );
@@ -672,7 +673,7 @@ mod test {
     .is_ok());
 test_parse_query_to_logical_ast_helper(
     "unsigned:2324",
-    "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
+    "Term(field=3,bytes=[0, 0, 0, 0, 0, 0, 9, 20])",
     false,
 );
@@ -693,19 +694,19 @@ mod test {
 pub fn test_parse_query_to_ast_single_term() {
     test_parse_query_to_logical_ast_helper(
         "title:toto",
-        "Term([0, 0, 0, 0, 116, 111, 116, 111])",
+        "Term(field=0,bytes=[116, 111, 116, 111])",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "+title:toto",
-        "Term([0, 0, 0, 0, 116, 111, 116, 111])",
+        "Term(field=0,bytes=[116, 111, 116, 111])",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "+title:toto -titi",
-        "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-        -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
-        Term([0, 0, 0, 1, 116, 105, 116, 105])))",
+        "(+Term(field=0,bytes=[116, 111, 116, 111]) \
+        -(Term(field=0,bytes=[116, 105, 116, 105]) \
+        Term(field=1,bytes=[116, 105, 116, 105])))",
         false,
     );
     assert_eq!(
@@ -720,14 +721,13 @@ mod test {
 pub fn test_parse_query_to_ast_two_terms() {
     test_parse_query_to_logical_ast_helper(
         "title:a b",
-        "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
-        Term([0, 0, 0, 1, 98])))",
+        "(Term(field=0,bytes=[97]) (Term(field=0,bytes=[98]) Term(field=1,bytes=[98])))",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "title:\"a b\"",
-        "\"[(0, Term([0, 0, 0, 0, 97])), \
-        (1, Term([0, 0, 0, 0, 98]))]\"",
+        "\"[(0, Term(field=0,bytes=[97])), \
+        (1, Term(field=0,bytes=[98]))]\"",
         false,
     );
 }
@@ -736,45 +736,43 @@ mod test {
 pub fn test_parse_query_to_ast_ranges() {
     test_parse_query_to_logical_ast_helper(
         "title:[a TO b]",
-        "(Included(Term([0, 0, 0, 0, 97])) TO \
-        Included(Term([0, 0, 0, 0, 98])))",
+        "(Included(Term(field=0,bytes=[97])) TO Included(Term(field=0,bytes=[98])))",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "[a TO b]",
-        "((Included(Term([0, 0, 0, 0, 97])) TO \
-        Included(Term([0, 0, 0, 0, 98]))) \
-        (Included(Term([0, 0, 0, 1, 97])) TO \
-        Included(Term([0, 0, 0, 1, 98]))))",
+        "((Included(Term(field=0,bytes=[97])) TO \
+        Included(Term(field=0,bytes=[98]))) \
+        (Included(Term(field=1,bytes=[97])) TO \
+        Included(Term(field=1,bytes=[98]))))",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "title:{titi TO toto}",
-        "(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO \
-        Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
+        "(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO \
+        Excluded(Term(field=0,bytes=[116, 111, 116, 111])))",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "title:{* TO toto}",
-        "(Unbounded TO \
-        Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
+        "(Unbounded TO Excluded(Term(field=0,bytes=[116, 111, 116, 111])))",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "title:{titi TO *}",
-        "(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
+        "(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO Unbounded)",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "signed:{-5 TO 3}",
-        "(Excluded(Term([0, 0, 0, 2, 127, 255, 255, 255, 255, 255, 255, 251])) TO \
-        Excluded(Term([0, 0, 0, 2, 128, 0, 0, 0, 0, 0, 0, 3])))",
+        "(Excluded(Term(field=2,bytes=[127, 255, 255, 255, 255, 255, 255, 251])) TO \
+        Excluded(Term(field=2,bytes=[128, 0, 0, 0, 0, 0, 0, 3])))",
         false,
     );
     test_parse_query_to_logical_ast_helper(
         "float:{-1.5 TO 1.5}",
-        "(Excluded(Term([0, 0, 0, 10, 64, 7, 255, 255, 255, 255, 255, 255])) TO \
-        Excluded(Term([0, 0, 0, 10, 191, 248, 0, 0, 0, 0, 0, 0])))",
+        "(Excluded(Term(field=10,bytes=[64, 7, 255, 255, 255, 255, 255, 255])) TO \
+        Excluded(Term(field=10,bytes=[191, 248, 0, 0, 0, 0, 0, 0])))",
         false,
     );
@@ -879,19 +877,19 @@ mod test {
 pub fn test_parse_query_to_ast_conjunction() {
     test_parse_query_to_logical_ast_helper(
         "title:toto",
-        "Term([0, 0, 0, 0, 116, 111, 116, 111])",
+        "Term(field=0,bytes=[116, 111, 116, 111])",
         true,
     );
     test_parse_query_to_logical_ast_helper(
         "+title:toto",
-        "Term([0, 0, 0, 0, 116, 111, 116, 111])",
+        "Term(field=0,bytes=[116, 111, 116, 111])",
         true,
     );
     test_parse_query_to_logical_ast_helper(
         "+title:toto -titi",
-        "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-        -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
-        Term([0, 0, 0, 1, 116, 105, 116, 105])))",
+        "(+Term(field=0,bytes=[116, 111, 116, 111]) \
+        -(Term(field=0,bytes=[116, 105, 116, 105]) \
+        Term(field=1,bytes=[116, 105, 116, 105])))",
         true,
     );
     assert_eq!(
@@ -902,15 +900,15 @@ mod test {
     );
     test_parse_query_to_logical_ast_helper(
         "title:a b",
-        "(+Term([0, 0, 0, 0, 97]) \
-        +(Term([0, 0, 0, 0, 98]) \
-        Term([0, 0, 0, 1, 98])))",
+        "(+Term(field=0,bytes=[97]) \
+        +(Term(field=0,bytes=[98]) \
+        Term(field=1,bytes=[98])))",
         true,
     );
     test_parse_query_to_logical_ast_helper(
         "title:\"a b\"",
-        "\"[(0, Term([0, 0, 0, 0, 97])), \
-        (1, Term([0, 0, 0, 0, 98]))]\"",
+        "\"[(0, Term(field=0,bytes=[97])), \
+        (1, Term(field=0,bytes=[98]))]\"",
         true,
     );
 }
@@ -919,10 +917,8 @@ mod test {
 pub fn test_query_parser_hyphen() {
     test_parse_query_to_logical_ast_helper(
         "title:www-form-encoded",
-        "\"[(0, Term([0, 0, 0, 0, 119, 119, 119])), \
-        (1, Term([0, 0, 0, 0, 102, 111, 114, 109])), \
-        (2, Term([0, 0, 0, 0, 101, 110, 99, 111, 100, 101, 100]))]\"",
-        false,
+        "\"[(0, Term(field=0,bytes=[119, 119, 119])), (1, Term(field=0,bytes=[102, 111, 114, 109])), (2, Term(field=0,bytes=[101, 110, 99, 111, 100, 101, 100]))]\"",
+        false
     );
 }
 }
@@ -3,6 +3,7 @@ use std::fmt::{Debug, Formatter};

 use crate::query::Occur;

+#[derive(PartialEq)]
 pub enum UserInputLeaf {
     Literal(UserInputLiteral),
     All,
@@ -35,6 +36,7 @@ impl Debug for UserInputLeaf {
     }
 }

+#[derive(PartialEq)]
 pub struct UserInputLiteral {
     pub field_name: Option<String>,
     pub phrase: String,
@@ -49,9 +51,11 @@ impl fmt::Debug for UserInputLiteral {
     }
 }

+#[derive(PartialEq)]
 pub enum UserInputBound {
     Inclusive(String),
     Exclusive(String),
+    Unbounded,
 }

 impl UserInputBound {
@@ -59,6 +63,7 @@ impl UserInputBound {
     match *self {
         UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
         UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
+        UserInputBound::Unbounded => write!(formatter, "{{\"*\""),
     }
 }
@@ -66,6 +71,7 @@ impl UserInputBound {
     match *self {
         UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
         UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
+        UserInputBound::Unbounded => write!(formatter, "\"*\"}}"),
     }
 }
@@ -73,6 +79,7 @@ impl UserInputBound {
     match *self {
         UserInputBound::Inclusive(ref contents) => contents,
         UserInputBound::Exclusive(ref contents) => contents,
+        UserInputBound::Unbounded => &"*",
     }
 }
 }
@@ -338,39 +338,33 @@ mod tests {
     use crate::collector::Count;
     use crate::schema::{Document, Field, Schema, INDEXED};
     use crate::Index;
-    use crate::Result;
     use std::collections::Bound;

     #[test]
     fn test_range_query_simple() {
-        fn run() -> Result<()> {
-            let mut schema_builder = Schema::builder();
-            let year_field = schema_builder.add_u64_field("year", INDEXED);
-            let schema = schema_builder.build();
+        let mut schema_builder = Schema::builder();
+        let year_field = schema_builder.add_u64_field("year", INDEXED);
+        let schema = schema_builder.build();

-            let index = Index::create_in_ram(schema);
-            {
-                let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
-                for year in 1950u64..2017u64 {
-                    let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
-                    for _ in 0..num_docs_within_year {
-                        index_writer.add_document(doc!(year_field => year));
-                    }
-                }
-                index_writer.commit().unwrap();
-            }
-            let reader = index.reader().unwrap();
-            let searcher = reader.searcher();
-
-            let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);
-
-            // ... or `1960..=1969` if inclusive range is enabled.
-            let count = searcher.search(&docs_in_the_sixties, &Count)?;
-            assert_eq!(count, 2285);
-            Ok(())
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
+            for year in 1950u64..2017u64 {
+                let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
+                for _ in 0..num_docs_within_year {
+                    index_writer.add_document(doc!(year_field => year));
+                }
+            }
+            index_writer.commit().unwrap();
         }
-        run().unwrap();
+        let reader = index.reader().unwrap();
+        let searcher = reader.searcher();
+
+        let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);
+
+        // ... or `1960..=1969` if inclusive range is enabled.
+        let count = searcher.search(&docs_in_the_sixties, &Count).unwrap();
+        assert_eq!(count, 2285);
     }

     #[test]
@@ -12,7 +12,7 @@ mod tests {
 use crate::collector::TopDocs;
 use crate::docset::DocSet;
 use crate::query::{Query, QueryParser, Scorer, TermQuery};
-use crate::schema::{IndexRecordOption, Schema, STRING, TEXT};
+use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
 use crate::tests::assert_nearly_equals;
 use crate::Index;
 use crate::Term;
@@ -114,4 +114,16 @@ mod tests {
     let reader = index.reader().unwrap();
     assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
 }
+
+#[test]
+fn test_term_query_debug() {
+    let term_query = TermQuery::new(
+        Term::from_field_text(Field(1), "hello"),
+        IndexRecordOption::WithFreqs,
+    );
+    assert_eq!(
+        format!("{:?}", term_query),
+        "TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111]))"
+    );
+}
 }
@@ -7,6 +7,7 @@ use crate::Result;
 use crate::Searcher;
 use crate::Term;
 use std::collections::BTreeSet;
+use std::fmt;

 /// A Term query matches all of the documents
 /// containing a specific term.
@@ -61,12 +62,18 @@ use std::collections::BTreeSet;
 /// Ok(())
 /// }
 /// ```
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 pub struct TermQuery {
     term: Term,
     index_record_option: IndexRecordOption,
 }

+impl fmt::Debug for TermQuery {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "TermQuery({:?})", self.term)
+    }
+}
+
 impl TermQuery {
     /// Creates a new term query.
     pub fn new(term: Term, segment_postings_options: IndexRecordOption) -> TermQuery {
@@ -411,3 +411,52 @@ mod tests {
     }

 }
+
+#[cfg(all(test, feature = "unstable"))]
+mod bench {
+
+    use query::score_combiner::DoNothingCombiner;
+    use query::ConstScorer;
+    use query::Union;
+    use query::VecDocSet;
+    use test::Bencher;
+    use tests;
+    use DocId;
+    use DocSet;
+
+    #[bench]
+    fn bench_union_3_high(bench: &mut Bencher) {
+        let union_docset: Vec<Vec<DocId>> = vec![
+            tests::sample_with_seed(100_000, 0.1, 0),
+            tests::sample_with_seed(100_000, 0.2, 1),
+        ];
+        bench.iter(|| {
+            let mut v = Union::<_, DoNothingCombiner>::from(
+                union_docset
+                    .iter()
+                    .map(|doc_ids| VecDocSet::from(doc_ids.clone()))
+                    .map(ConstScorer::new)
+                    .collect::<Vec<_>>(),
+            );
+            while v.advance() {}
+        });
+    }
+    #[bench]
+    fn bench_union_3_low(bench: &mut Bencher) {
+        let union_docset: Vec<Vec<DocId>> = vec![
+            tests::sample_with_seed(100_000, 0.01, 0),
+            tests::sample_with_seed(100_000, 0.05, 1),
+            tests::sample_with_seed(100_000, 0.001, 2),
+        ];
+        bench.iter(|| {
+            let mut v = Union::<_, DoNothingCombiner>::from(
+                union_docset
+                    .iter()
+                    .map(|doc_ids| VecDocSet::from(doc_ids.clone()))
+                    .map(ConstScorer::new)
+                    .collect::<Vec<_>>(),
+            );
+            while v.advance() {}
+        });
+    }
+}
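The benches above drive tantivy's `Union` scorer over doc-id samples of varying density. For reference, here is a minimal heap-based sketch of the same semantics (visit each doc id in the union of sorted lists exactly once); tantivy's actual scorer is organized differently for speed:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Count distinct doc ids across several sorted doc-id lists by repeatedly
// popping the smallest head from a min-heap of (doc, list, position).
fn union_iterate(docsets: &[Vec<u32>]) -> u64 {
    let mut heap: BinaryHeap<Reverse<(u32, usize, usize)>> = BinaryHeap::new();
    for (i, ds) in docsets.iter().enumerate() {
        if let Some(&doc) = ds.first() {
            heap.push(Reverse((doc, i, 0)));
        }
    }
    let mut count = 0u64;
    let mut last = None;
    while let Some(Reverse((doc, set, pos))) = heap.pop() {
        if last != Some(doc) {
            count += 1; // a new doc observed in the union
            last = Some(doc);
        }
        if let Some(&next) = docsets[set].get(pos + 1) {
            heap.push(Reverse((next, set, pos + 1)));
        }
    }
    count
}

fn main() {
    let sets = vec![vec![1, 3, 5], vec![2, 3, 8]];
    assert_eq!(union_iterate(&sets), 5); // union is {1, 2, 3, 5, 8}
}
```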
@@ -10,7 +10,7 @@ use serde_json::Value as JsonValue;

 /// Possible error that may occur while parsing a field value
 /// At this point the JSON is known to be valid.
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum ValueParsingError {
     /// Encountered a numerical value that overflows or underflow its integer type.
     OverflowError(String),
@@ -246,6 +246,25 @@ impl Schema {
     self.0.fields_map.get(field_name).cloned()
 }

+/// Create a named document off the doc.
+pub fn convert_named_doc(
+    &self,
+    named_doc: NamedFieldDocument,
+) -> Result<Document, DocParsingError> {
+    let mut document = Document::new();
+    for (field_name, values) in named_doc.0 {
+        if let Some(field) = self.get_field(&field_name) {
+            for value in values {
+                let field_value = FieldValue::new(field, value);
+                document.add(field_value);
+            }
+        } else {
+            return Err(DocParsingError::NoSuchFieldInSchema(field_name));
+        }
+    }
+    Ok(document)
+}
+
 /// Create a named document off the doc.
 pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
     let mut field_map = BTreeMap::new();
@@ -360,7 +379,7 @@ impl<'de> Deserialize<'de> for Schema {
|
||||
|
||||
/// Error that may happen when deserializing
|
||||
/// a document from JSON.
|
||||
#[derive(Debug, Fail)]
|
||||
#[derive(Debug, Fail, PartialEq)]
|
||||
pub enum DocParsingError {
|
||||
/// The payload given is not valid JSON.
|
||||
#[fail(display = "The provided string is not valid JSON")]
|
||||
@@ -369,7 +388,10 @@ pub enum DocParsingError {
     #[fail(display = "The field '{:?}' could not be parsed: {:?}", _0, _1)]
     ValueError(String, ValueParsingError),
     /// The json-document contains a field that is not declared in the schema.
-    #[fail(display = "The json-document contains an unknown field: {:?}", _0)]
+    #[fail(
+        display = "The document contains a field that is not declared in the schema: {:?}",
+        _0
+    )]
     NoSuchFieldInSchema(String),
 }
@@ -381,6 +403,7 @@ mod tests {
 use crate::schema::*;
 use matches::{assert_matches, matches};
 use serde_json;
+use std::collections::BTreeMap;

 #[test]
 pub fn is_indexed_test() {
@@ -495,6 +518,54 @@ mod tests {
     assert_eq!(doc, doc_serdeser);
 }

+#[test]
+pub fn test_document_from_nameddoc() {
+    let mut schema_builder = Schema::builder();
+    let title = schema_builder.add_text_field("title", TEXT);
+    let val = schema_builder.add_i64_field("val", INDEXED);
+    let schema = schema_builder.build();
+    let mut named_doc_map = BTreeMap::default();
+    named_doc_map.insert(
+        "title".to_string(),
+        vec![Value::from("title1"), Value::from("title2")],
+    );
+    named_doc_map.insert(
+        "val".to_string(),
+        vec![Value::from(14u64), Value::from(-1i64)],
+    );
+    let doc = schema
+        .convert_named_doc(NamedFieldDocument(named_doc_map))
+        .unwrap();
+    assert_eq!(
+        doc.get_all(title),
+        vec![
+            &Value::from("title1".to_string()),
+            &Value::from("title2".to_string())
+        ]
+    );
+    assert_eq!(
+        doc.get_all(val),
+        vec![&Value::from(14u64), &Value::from(-1i64)]
+    );
+}
+
+#[test]
+pub fn test_document_from_nameddoc_error() {
+    let schema = Schema::builder().build();
+    let mut named_doc_map = BTreeMap::default();
+    named_doc_map.insert(
+        "title".to_string(),
+        vec![Value::from("title1"), Value::from("title2")],
+    );
+    let err = schema
+        .convert_named_doc(NamedFieldDocument(named_doc_map))
+        .unwrap_err();
+    assert_eq!(
+        err,
+        DocParsingError::NoSuchFieldInSchema("title".to_string())
+    );
+}
+
 #[test]
 pub fn test_parse_document() {
     let mut schema_builder = Schema::builder();
@@ -224,7 +224,12 @@ where

 impl fmt::Debug for Term {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "Term({:?})", &self.0[..])
+        write!(
+            f,
+            "Term(field={},bytes={:?})",
+            self.field().0,
+            self.value_bytes()
+        )
     }
 }
@@ -63,7 +63,7 @@ impl FragmentCandidate {
 fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
     self.stop_offset = token.offset_to;

-    if let Some(score) = terms.get(&token.text.to_lowercase()) {
+    if let Some(&score) = terms.get(&token.text.to_lowercase()) {
         self.score += score;
         self.highlighted
             .push(HighlightSection::new(token.offset_from, token.offset_to));
@@ -142,7 +142,7 @@ impl Snippet {
 /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
 /// has to be a valid string.
 fn search_fragments<'a>(
-    tokenizer: &dyn BoxedTokenizer,
+    tokenizer: &BoxedTokenizer,
     text: &'a str,
     terms: &BTreeMap<String, f32>,
     max_num_chars: usize,
@@ -150,7 +150,6 @@ fn search_fragments<'a>(
     let mut token_stream = tokenizer.token_stream(text);
     let mut fragment = FragmentCandidate::new(0);
     let mut fragments: Vec<FragmentCandidate> = vec![];
-
    while let Some(next) = token_stream.next() {
        if (next.offset_to - fragment.start_offset) > max_num_chars {
            if fragment.score > 0.0 {
@@ -254,7 +253,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
 /// ```
 pub struct SnippetGenerator {
     terms_text: BTreeMap<String, f32>,
-    tokenizer: Box<dyn BoxedTokenizer>,
+    tokenizer: BoxedTokenizer,
     field: Field,
     max_num_chars: usize,
 }
@@ -316,12 +315,8 @@ impl SnippetGenerator {

 /// Generates a snippet for the given text.
 pub fn snippet(&self, text: &str) -> Snippet {
-    let fragment_candidates = search_fragments(
-        &*self.tokenizer,
-        &text,
-        &self.terms_text,
-        self.max_num_chars,
-    );
+    let fragment_candidates =
+        search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars);
     select_best_fragment_combination(&fragment_candidates[..], &text)
 }
 }
@@ -331,7 +326,7 @@ mod tests {
 use super::{search_fragments, select_best_fragment_combination};
 use crate::query::QueryParser;
 use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
-use crate::tokenizer::{box_tokenizer, SimpleTokenizer};
+use crate::tokenizer::SimpleTokenizer;
 use crate::Index;
 use crate::SnippetGenerator;
 use maplit::btreemap;
@@ -355,12 +350,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
let terms = btreemap! {
|
||||
String::from("rust") => 1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 100);
|
||||
assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -382,13 +377,13 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_scored_fragment() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
{
|
||||
let terms = btreemap! {
|
||||
String::from("rust") =>1.0f32,
|
||||
String::from("language") => 0.9f32
|
||||
};
|
||||
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
@@ -397,13 +392,13 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
|
||||
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
|
||||
}
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
{
|
||||
let terms = btreemap! {
|
||||
String::from("rust") =>0.9f32,
|
||||
String::from("language") => 1.0f32
|
||||
};
|
||||
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
//assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -417,14 +412,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_in_second_fragment() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d e f g";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("c"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 1);
|
||||
{
|
||||
@@ -441,14 +436,14 @@ Survey in 2016, 2017, and 2018."#;

    #[test]
    fn test_snippet_with_term_at_the_end_of_fragment() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
        let boxed_tokenizer = SimpleTokenizer.into();

        let text = "a b c d e f f g";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("f"), 1.0);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);

        assert_eq!(fragments.len(), 2);
        {
@@ -465,7 +460,7 @@ Survey in 2016, 2017, and 2018."#;

    #[test]
    fn test_snippet_with_second_fragment_has_the_highest_score() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
        let boxed_tokenizer = SimpleTokenizer.into();

        let text = "a b c d e f g";

@@ -473,7 +468,7 @@ Survey in 2016, 2017, and 2018."#;
        terms.insert(String::from("f"), 1.0);
        terms.insert(String::from("a"), 0.9);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7);
        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 7);

        assert_eq!(fragments.len(), 2);
        {
@@ -490,14 +485,14 @@ Survey in 2016, 2017, and 2018."#;

    #[test]
    fn test_snippet_with_term_not_in_text() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
        let boxed_tokenizer = SimpleTokenizer.into();

        let text = "a b c d";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("z"), 1.0);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);

        assert_eq!(fragments.len(), 0);

@@ -508,12 +503,12 @@ Survey in 2016, 2017, and 2018."#;

    #[test]
    fn test_snippet_with_no_terms() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
        let boxed_tokenizer = SimpleTokenizer.into();

        let text = "a b c d";

        let terms = BTreeMap::new();
        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
        assert_eq!(fragments.len(), 0);

        let snippet = select_best_fragment_combination(&fragments[..], &text);

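For orientation, the tests above exercise crate-private helpers; the public entry point is the `SnippetGenerator` imported at the top of the test module. A hedged sketch of that path, assuming the 0.11-era `create` and `snippet_from_doc` signatures:

```rust
use tantivy::query::Query;
use tantivy::schema::Field;
use tantivy::{Document, Searcher, SnippetGenerator};

// `searcher`, `query`, `field`, and `doc` come from ordinary index setup,
// elided here; only the snippet-rendering calls are the point.
fn render_snippet(
    searcher: &Searcher,
    query: &dyn Query,
    field: Field,
    doc: &Document,
) -> tantivy::Result<String> {
    let generator = SnippetGenerator::create(searcher, query, field)?;
    Ok(generator.snippet_from_doc(doc).to_html())
}
```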
@@ -2,7 +2,7 @@ use super::TermDictionary;
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::map::{Stream, StreamBuilder};
use tantivy_fst::map::{Stream, StreamBuilder, StreamWithState};
use tantivy_fst::Automaton;
use tantivy_fst::{IntoStreamer, Streamer};

@@ -11,6 +11,7 @@ use tantivy_fst::{IntoStreamer, Streamer};
pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
where
    A: Automaton,
    A::State: Clone,
{
    fst_map: &'a TermDictionary,
    stream_builder: StreamBuilder<'a, A>,
@@ -19,6 +20,7 @@ where
impl<'a, A> TermStreamerBuilder<'a, A>
where
    A: Automaton,
    A::State: Clone + Default + Sized,
{
    pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self {
        TermStreamerBuilder {
@@ -56,10 +58,11 @@ where
    pub fn into_stream(self) -> TermStreamer<'a, A> {
        TermStreamer {
            fst_map: self.fst_map,
            stream: self.stream_builder.into_stream(),
            stream: self.stream_builder.with_state().into_stream(),
            term_ord: 0u64,
            current_key: Vec::with_capacity(100),
            current_value: TermInfo::default(),
            state: Default::default(),
        }
    }
}
@@ -69,27 +72,31 @@ where
pub struct TermStreamer<'a, A = AlwaysMatch>
where
    A: Automaton,
    A::State: Clone + Default + Sized,
{
    fst_map: &'a TermDictionary,
    stream: Stream<'a, A>,
    stream: StreamWithState<'a, A>,
    term_ord: TermOrdinal,
    current_key: Vec<u8>,
    current_value: TermInfo,
    state: A::State,
}

impl<'a, A> TermStreamer<'a, A>
where
    A: Automaton,
    A::State: Clone + Default + Sized,
{
    /// Advances the stream to the next item.
    /// Before the first call to `.advance()`, the stream
    /// is in an uninitialized state.
    pub fn advance(&mut self) -> bool {
        if let Some((term, term_ord)) = self.stream.next() {
        if let Some((term, term_ord, state)) = self.stream.next() {
            self.current_key.clear();
            self.current_key.extend_from_slice(term);
            self.term_ord = term_ord;
            self.current_value = self.fst_map.term_info_from_ord(term_ord);
            self.state = state;
            true
        } else {
            false
@@ -118,6 +125,10 @@ where
        &self.current_key
    }

    pub fn state(&self) -> &A::State {
        &self.state
    }

    /// Accesses the current value.
    ///
    /// Calling `.value()` after the end of the stream will return the

@@ -197,7 +197,11 @@ impl TermDictionary {

    /// Returns a search builder, to stream all of the terms
    /// matching the `Automaton`.
    pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
    pub fn search<'a, A>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A>
    where
        A: Automaton + 'a,
        A::State: Clone + Default + Sized,
    {
        let stream_builder = self.fst_index.search(automaton);
        TermStreamerBuilder::<A>::new(self, stream_builder)
    }

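Taken together, the hunks above thread the automaton state through the stream: the builder opts in via `with_state()`, `advance()` records the state for the current key, and `state()` exposes it. A minimal sketch of a caller, assuming it lives next to `TermDictionary` in this module and uses only the API shown here plus the usual `key()` accessor:

```rust
use tantivy_fst::Automaton;

// Collects every term accepted by `automaton`, inspecting the
// automaton state reached on each matching key.
fn matching_terms<A>(dict: &TermDictionary, automaton: A) -> Vec<Vec<u8>>
where
    A: Automaton,
    A::State: Clone + Default,
{
    let mut stream = dict.search(automaton).into_stream();
    let mut terms = Vec::new();
    while stream.advance() {
        // State reached after consuming the current key's bytes.
        let _state: &A::State = stream.state();
        terms.push(stream.key().to_vec());
    }
    terms
}
```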
@@ -155,7 +155,6 @@ pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub(crate) use self::tokenizer::box_tokenizer;
pub use self::tokenizer::BoxedTokenizer;

pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

@@ -56,8 +56,6 @@ pub trait Tokenizer<'a>: Sized + Clone {
    /// # Example
    ///
    /// ```rust
    /// # extern crate tantivy;
    ///
    /// use tantivy::tokenizer::*;
    ///
    /// # fn main() {
@@ -80,7 +78,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
}

/// A boxed tokenizer
pub trait BoxedTokenizer: Send + Sync {
trait BoxedTokenizerTrait: Send + Sync {
    /// Tokenize a `&str`
    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;

@@ -92,7 +90,41 @@ pub trait BoxedTokenizer: Send + Sync {
    fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b>;

    /// Return a boxed clone of the tokenizer
    fn boxed_clone(&self) -> Box<dyn BoxedTokenizer>;
    fn boxed_clone(&self) -> BoxedTokenizer;
}

/// A boxed tokenizer
pub struct BoxedTokenizer(Box<dyn BoxedTokenizerTrait>);

impl<T> From<T> for BoxedTokenizer
where
    T: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
    fn from(tokenizer: T) -> BoxedTokenizer {
        BoxedTokenizer(Box::new(BoxableTokenizer(tokenizer)))
    }
}

impl BoxedTokenizer {
    /// Tokenize a `&str`
    pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        self.0.token_stream(text)
    }

    /// Tokenize an array of `&str`
    ///
    /// The resulting `TokenStream` is equivalent to what would be obtained if the `&str` were
    /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
    /// to prevent an accidental `PhraseQuery` from matching across two terms.
    pub fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b> {
        self.0.token_stream_texts(texts)
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> BoxedTokenizer {
        self.0.boxed_clone()
    }
}

#[derive(Clone)]
@@ -100,7 +132,7 @@ struct BoxableTokenizer<A>(A)
where
    A: for<'a> Tokenizer<'a> + Send + Sync;

impl<A> BoxedTokenizer for BoxableTokenizer<A>
impl<A> BoxedTokenizerTrait for BoxableTokenizer<A>
where
    A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
@@ -125,18 +157,11 @@ where
        }
    }

    fn boxed_clone(&self) -> Box<dyn BoxedTokenizer> {
        Box::new(self.clone())
    fn boxed_clone(&self) -> BoxedTokenizer {
        self.0.clone().into()
    }
}

pub(crate) fn box_tokenizer<A>(a: A) -> Box<dyn BoxedTokenizer>
where
    A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
    Box::new(BoxableTokenizer(a))
}

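The net effect of this file's changes, sketched from the user's side (the sample text is illustrative):

```rust
use tantivy::tokenizer::{BoxedTokenizer, SimpleTokenizer, TokenStream};

fn main() {
    // The `From` impl accepts any `'static + Send + Sync` tokenizer.
    let tokenizer: BoxedTokenizer = SimpleTokenizer.into();

    // `token_stream` still returns a boxed `TokenStream`.
    let mut stream = tokenizer.token_stream("hello boxed tokenizer");
    while stream.advance() {
        println!("{}", stream.token().text);
    }

    // Cloning is a plain `Clone` impl now; `boxed_clone` is an
    // internal detail of the private `BoxedTokenizerTrait`.
    let _copy = tokenizer.clone();
}
```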
impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
    fn advance(&mut self) -> bool {
        let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -161,7 +186,6 @@ impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
    /// # Example
    ///
    /// ```
    /// extern crate tantivy;
    /// use tantivy::tokenizer::*;
    ///
    /// # fn main() {
@@ -203,7 +227,6 @@ pub trait TokenStream {
    /// and `.token()`.
    ///
    /// ```
    /// # extern crate tantivy;
    /// # use tantivy::tokenizer::*;
    /// #
    /// # fn main() {

@@ -1,4 +1,3 @@
use crate::tokenizer::box_tokenizer;
use crate::tokenizer::stemmer::Language;
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::LowerCaser;
@@ -8,7 +7,6 @@ use crate::tokenizer::SimpleTokenizer;
use crate::tokenizer::Stemmer;
use crate::tokenizer::Tokenizer;
use std::collections::HashMap;
use std::ops::Deref;
use std::sync::{Arc, RwLock};

/// The tokenizer manager serves as a store for
@@ -25,16 +23,16 @@ use std::sync::{Arc, RwLock};
/// search engine.
#[derive(Clone)]
pub struct TokenizerManager {
    tokenizers: Arc<RwLock<HashMap<String, Box<dyn BoxedTokenizer>>>>,
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerManager {
    /// Registers a new tokenizer associated with a given name.
    pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
    where
        A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
        A: Into<BoxedTokenizer>,
    {
        let boxed_tokenizer = box_tokenizer(tokenizer);
        let boxed_tokenizer = tokenizer.into();
        self.tokenizers
            .write()
            .expect("Acquiring the lock should never fail")
@@ -42,13 +40,12 @@ impl TokenizerManager {
    }

    /// Accesses a tokenizer given its name.
    pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn BoxedTokenizer>> {
    pub fn get(&self, tokenizer_name: &str) -> Option<BoxedTokenizer> {
        self.tokenizers
            .read()
            .expect("Acquiring the lock should never fail")
            .get(tokenizer_name)
            .map(Deref::deref)
            .map(BoxedTokenizer::boxed_clone)
            .cloned()
    }
}
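A short sketch of the updated manager API; `TokenizerManager::default()` is assumed to be the usual constructor:

```rust
use tantivy::tokenizer::{SimpleTokenizer, TokenStream, TokenizerManager};

fn main() {
    let manager = TokenizerManager::default();

    // `register` now takes `A: Into<BoxedTokenizer>`, so callers
    // hand over the tokenizer itself rather than boxing it first.
    manager.register("simple", SimpleTokenizer);

    // `get` clones the stored `BoxedTokenizer` out of the map,
    // replacing the old deref-then-`boxed_clone` dance.
    if let Some(tokenizer) = manager.get("simple") {
        let mut stream = tokenizer.token_stream("a b c");
        while stream.advance() {}
    }
}
```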