Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-28 04:52:55 +00:00)
Compare commits: python-bin...criterion (10 commits)

Commits: a854a60e2a, 92d73a6bfb, 04832a86eb, beb8e990cd, 001af3876f, f428f344da, 143f78eced, 754b55eee5, 280ea1209c, 0154dbe477
@@ -2,6 +2,10 @@ Tantivy 0.11.0
 =====================
+- Added an f64 field. Internally, it reuses the u64 code the same way i64 does. (@fdb-hiroshima)
+- Various bugfixes in the query parser.
+- Better handling of hyphens in the query parser. (#609)
+- Better handling of whitespaces.

 Tantivy 0.10.1
 =====================
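A minimal sketch of the new f64 field described in the changelog entry above, assuming the 0.11-era API and that the f64 fast-field accessor mirrors the u64 one used in the benches below:

use tantivy::schema::{Schema, FAST};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // f64 values are stored through a monotone f64 -> u64 mapping,
    // so they reuse the u64 fast-field machinery unchanged.
    let price = schema_builder.add_f64_field("price", FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
    index_writer.add_document(doc!(price => 4.5f64));
    index_writer.commit()?;
    let searcher = index.reader()?.searcher();
    let fast_field = searcher.segment_reader(0).fast_fields().f64(price).unwrap();
    assert_eq!(fast_field.get(0), 4.5f64);
    Ok(())
}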
Cargo.toml (22 changed lines)
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.10.1"
+version = "0.11.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -25,7 +25,6 @@ atomicwrites = {version="0.2.2", optional=true}
 tempfile = "3.0"
 log = "0.4"
 combine = ">=3.6.0,<4.0.0"
-tempdir = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
 serde_json = "1.0"
@@ -36,7 +35,7 @@ levenshtein_automata = {version="0.1", features=["fst_automaton"]}
 notify = {version="4", optional=true}
 bit-set = "0.5"
 uuid = { version = "0.7.2", features = ["v4", "serde"] }
-crossbeam = "0.5"
+crossbeam = "0.7"
 futures = "0.1"
 futures-cpupool = "0.1"
 owning_ref = "0.4"
@@ -63,6 +62,7 @@ rand = "0.7"
 maplit = "1"
 matches = "0.1.8"
 time = "0.1.42"
+criterion = "0.2"

 [profile.release]
 opt-level = 3
@@ -75,6 +75,7 @@ overflow-checks = true

 [features]
 default = ["mmap"]
+forbench = []
 mmap = ["atomicwrites", "fs2", "memmap", "notify"]
 lz4-compression = ["lz4"]
 failpoints = ["fail/failpoints"]
@@ -87,7 +88,6 @@ travis-ci = { repository = "tantivy-search/tantivy" }
 [dev-dependencies.fail]
 features = ["failpoints"]

-
 # Following the "fail" crate best practises, we isolate
 # tests that define specific behavior in fail check points
 # in a different binary.
@@ -98,4 +98,16 @@ features = ["failpoints"]
 [[test]]
 name = "failpoints"
 path = "tests/failpoints/mod.rs"
-required-features = ["fail/failpoints"]
+required-features = ["fail/failpoints"]
+
+[profile.bench]
+lto = true
+
+[[bench]]
+name = "vint"
+harness = false
+
+[[bench]]
+name = "fastfield"
+harness = false
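The new [[bench]] targets set harness = false because criterion supplies its own main function through criterion_main!; the built-in libtest harness would otherwise look for #[bench] functions and find none. A minimal skeleton of the pattern the new bench files follow (the workload here is a placeholder):

use criterion::{criterion_group, criterion_main, Criterion};

fn bench_example(c: &mut Criterion) {
    // Placeholder workload; the real benches below exercise tantivy
    // internals exposed through the `forbench` feature.
    c.bench_function("example", |b| b.iter(|| (0u64..100).sum::<u64>()));
}

criterion_group!(benches, bench_example);
criterion_main!(benches);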
benches/bitset.rs (new file, 73 lines)
@@ -0,0 +1,73 @@
use criterion::{criterion_group, criterion_main, Criterion};
use rand::distributions::{Bernoulli, Uniform};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::forbench::bitset::{BitSet, TinySet};
use tantivy::query::BitSetDocSet;
use tantivy::DocSet;

fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
    StdRng::from_seed([seed_val; 32])
        .sample_iter(&Bernoulli::new(ratio).unwrap())
        .take(n as usize)
        .enumerate()
        .filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
        .collect()
}

fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
    let seed: [u8; 32] = [1; 32];
    StdRng::from_seed(seed)
        .sample_iter(&Uniform::new(0u32, max_value))
        .take(n_elems)
        .collect::<Vec<u32>>()
}

fn bench_tinyset_pop(criterion: &mut Criterion) {
    criterion.bench_function("pop_lowest", |b| {
        b.iter(|| {
            let mut tinyset = TinySet::singleton(criterion::black_box(31u32));
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
        })
    });
}

fn bench_bitset_insert(criterion: &mut Criterion) {
    criterion.bench_function_over_inputs(
        "bitset_insert",
        |bench, (max_value, n_elems)| {
            let els = generate_nonunique_unsorted(*max_value, *n_elems);
            bench.iter(move || {
                let mut bitset = BitSet::with_max_value(1_000_000);
                for el in els.iter().cloned() {
                    bitset.insert(el);
                }
            });
        },
        vec![(1_000_000u32, 10_000)],
    );
}

fn bench_bitsetdocset_iterate(criterion: &mut Criterion) {
    // Build the bitset once; each iteration clones it and drains the docset.
    let mut bitset = BitSet::with_max_value(1_000_000);
    for el in sample_with_seed(1_000_000u32, 0.01, 0u8) {
        bitset.insert(el);
    }
    criterion.bench_function("bitsetdocset_iterate", move |b| {
        b.iter(|| {
            let mut docset = BitSetDocSet::from(bitset.clone());
            while docset.advance() {}
        })
    });
}

criterion_group!(
    benches,
    bench_tinyset_pop,
    bench_bitset_insert,
    bench_bitsetdocset_iterate
);
criterion_main!(benches);
benches/fastfield.rs (new file, 107 lines)
@@ -0,0 +1,107 @@
use criterion::criterion_group;
use criterion::criterion_main;
use criterion::Criterion;
use criterion::ParameterizedBenchmark;
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::SeedableRng;
use tantivy::schema::{Schema, FAST};
use tantivy::{doc, DocId, Index};

const NUM_LOOKUPS: usize = 1_000;

fn generate_permutation(stride: usize, bit_width: u8) -> Vec<u64> {
    let mut permutation: Vec<u64> = (0u64..(NUM_LOOKUPS * stride) as u64).collect();
    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
    permutation.push(1u64 << (bit_width as u64)); //< just to force the bit_width
    permutation
}

fn bench_linear_lookup(c: &mut Criterion) {
    c.bench(
        "lookup_stride",
        ParameterizedBenchmark::new(
            "baseline_vec",
            |bench, (stride, num_bits)| {
                let arr = generate_permutation(*stride, *num_bits);
                bench.iter(move || {
                    let mut a = 0u64;
                    for i in (0..NUM_LOOKUPS / stride).map(|v| v * 7) {
                        a ^= arr[i as usize];
                    }
                    a
                })
            },
            vec![(7, 1), (7, 5), (7, 20)],
        )
        .with_function("fastfield", |bench, (stride, num_bits)| {
            let mut schema_builder = Schema::builder();
            let val_field = schema_builder.add_u64_field("val", FAST);
            let schema = schema_builder.build();

            let index = Index::create_in_ram(schema);
            let mut index_writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
            for el in generate_permutation(*stride, *num_bits) {
                index_writer.add_document(doc!(val_field=>el));
            }
            index_writer.commit().unwrap();
            let reader = index.reader().unwrap();
            let searcher = reader.searcher();
            let segment_reader = searcher.segment_reader(0u32);
            let fast_field_reader = segment_reader.fast_fields().u64(val_field).unwrap();
            bench.iter(move || {
                let mut a = 0u64;
                for i in (0..NUM_LOOKUPS / stride).map(|v| v * 7) {
                    a ^= fast_field_reader.get(i as DocId);
                }
                a
            })
        }),
    );
}

fn bench_jumpy_lookup(c: &mut Criterion) {
    c.bench(
        "lookup_jumpy",
        ParameterizedBenchmark::new(
            "baseline_vec",
            |bench, (stride, num_bits)| {
                let arr = generate_permutation(*stride, *num_bits);
                bench.iter(move || {
                    let mut a = 0u64;
                    for _ in 0..NUM_LOOKUPS {
                        a = arr[a as usize];
                    }
                    a
                })
            },
            vec![(7, 1), (7, 5), (7, 20)],
        )
        .with_function("fastfield", |bench, (stride, num_bits)| {
            let mut schema_builder = Schema::builder();
            let val_field = schema_builder.add_u64_field("val", FAST);
            let schema = schema_builder.build();

            let index = Index::create_in_ram(schema);
            let mut index_writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
            for el in generate_permutation(*stride, *num_bits) {
                index_writer.add_document(doc!(val_field=>el));
            }
            index_writer.commit().unwrap();
            let reader = index.reader().unwrap();
            let searcher = reader.searcher();
            let segment_reader = searcher.segment_reader(0u32);
            let fast_field_reader = segment_reader.fast_fields().u64(val_field).unwrap();
            bench.iter(move || {
                let mut a = 0u64;
                for _ in 0..NUM_LOOKUPS {
                    a = fast_field_reader.get(a as DocId);
                }
                a
            })
        }),
    );
}

criterion_group!(benches, bench_linear_lookup, bench_jumpy_lookup);
criterion_main!(benches);
benches/union.rs (new file, 50 lines)
@@ -0,0 +1,50 @@
use criterion::{criterion_group, criterion_main, Criterion};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STRING};
use tantivy::{Document, Index};

fn bench_union(criterion: &mut Criterion) {
    criterion.bench_function_over_inputs(
        "union_docset_fulladvance",
        |bench, (ratio_left, ratio_right)| {
            let mut schema_builder = Schema::builder();
            let field = schema_builder.add_text_field("val", STRING);
            let schema = schema_builder.build();
            let index = Index::create_in_ram(schema);
            let mut index_writer = index.writer_with_num_threads(1, 80_000_000).unwrap();
            let mut stdrng = StdRng::from_seed([0u8; 32]);
            for _ in 0u32..100_000u32 {
                let mut doc = Document::default();
                if stdrng.gen_bool(*ratio_left) {
                    doc.add_text(field, "left");
                }
                if stdrng.gen_bool(*ratio_right) {
                    doc.add_text(field, "right");
                }
                index_writer.add_document(doc);
            }
            index_writer.commit().unwrap();
            let reader = index.reader().unwrap();
            let searcher = reader.searcher();

            let query = QueryParser::for_index(&index, vec![field])
                .parse_query("left right")
                .unwrap();

            bench.iter(move || {
                let weight = query.weight(&searcher, false).unwrap();
                let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
                let mut sum_docs = 0u64;
                scorer.for_each(&mut |doc_id, _score| {
                    sum_docs += doc_id as u64;
                });
            });
        },
        vec![(0.2, 0.1), (0.2, 0.02)],
    );
}

criterion_group!(benches, bench_union);
criterion_main!(benches);
benches/vint.rs (new file, 72 lines)
@@ -0,0 +1,72 @@
use criterion::{criterion_group, criterion_main, Criterion, ParameterizedBenchmark};
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
use tantivy::forbench::compression::{compressed_block_size, BlockDecoder};
use tantivy::forbench::compression::{BlockEncoder, VIntEncoder};
use tantivy::forbench::compression::{VIntDecoder, COMPRESSION_BLOCK_SIZE};

fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
    let seed: [u8; 32] = [seed_val; 32];
    let mut rng = StdRng::from_seed(seed);
    (0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
}

pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
    generate_array_with_seed(n, ratio, 4)
}

fn bench_compress(criterion: &mut Criterion) {
    criterion.bench(
        "compress_sorted",
        ParameterizedBenchmark::new(
            "bitpack",
            |bench, ratio| {
                let mut encoder = BlockEncoder::new();
                let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
                bench.iter(|| {
                    encoder.compress_block_sorted(&data, 0u32);
                });
            },
            vec![0.1],
        )
        .with_function("vint", |bench, ratio| {
            let mut encoder = BlockEncoder::new();
            let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
            bench.iter(|| {
                encoder.compress_vint_sorted(&data, 0u32);
            });
        }),
    );
}

fn bench_uncompress(criterion: &mut Criterion) {
    criterion.bench(
        "uncompress_sorted",
        ParameterizedBenchmark::new(
            "bitpack",
            |bench, ratio| {
                let mut encoder = BlockEncoder::new();
                let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
                let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
                let mut decoder = BlockDecoder::new();
                bench.iter(|| {
                    decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
                });
            },
            vec![0.1],
        )
        .with_function("vint", |bench, ratio| {
            let mut encoder = BlockEncoder::new();
            let data = generate_array(COMPRESSION_BLOCK_SIZE, *ratio);
            let compressed = encoder.compress_vint_sorted(&data, 0u32);
            let mut decoder = BlockDecoder::new();
            bench.iter(move || {
                decoder.uncompress_vint_sorted(compressed, 0u32, COMPRESSION_BLOCK_SIZE);
            });
        }),
    );
}

criterion_group!(benches, bench_compress, bench_uncompress);
criterion_main!(benches);
@@ -19,12 +19,12 @@ use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::Index;
 use tantivy::ReloadPolicy;
-use tempdir::TempDir;
+use tempfile::TempDir;

 fn main() -> tantivy::Result<()> {
     // Let's create a temporary directory for the
     // sake of this example
-    let index_path = TempDir::new("tantivy_example_dir")?;
+    let index_path = TempDir::new()?;

     // # Defining the schema
     //
@@ -18,11 +18,12 @@ use tantivy::collector::FacetCollector;
 use tantivy::query::AllQuery;
 use tantivy::schema::*;
 use tantivy::Index;
+use tempfile::TempDir;

 fn main() -> tantivy::Result<()> {
     // Let's create a temporary directory for the
     // sake of this example
-    let index_path = TempDir::new("tantivy_facet_example_dir")?;
+    let index_path = TempDir::new()?;
     let mut schema_builder = Schema::builder();

     schema_builder.add_text_field("name", TEXT | STORED);
@@ -74,5 +75,3 @@ fn main() -> tantivy::Result<()> {

     Ok(())
 }
-
-use tempdir::TempDir;
@@ -14,12 +14,12 @@ use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::Index;
 use tantivy::{Snippet, SnippetGenerator};
-use tempdir::TempDir;
+use tempfile::TempDir;

 fn main() -> tantivy::Result<()> {
     // Let's create a temporary directory for the
     // sake of this example
-    let index_path = TempDir::new("tantivy_example_dir")?;
+    let index_path = TempDir::new()?;

     // # Defining the schema
     let mut schema_builder = Schema::builder();
run-bench.rs (new executable file, 2 lines)
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
cargo bench --features forbench
@@ -13,6 +13,7 @@ use crate::Result;
 use crate::Score;
 use crate::SegmentLocalId;
 use crate::SegmentReader;
+use std::fmt;

 /// The Top Score Collector keeps track of the K documents
 /// sorted by their score.
@@ -68,6 +69,12 @@ use crate::SegmentReader;
 /// ```
 pub struct TopDocs(TopCollector<Score>);

+impl fmt::Debug for TopDocs {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "TopDocs({})", self.0.limit())
+    }
+}
+
 impl TopDocs {
     /// Creates a top score collector, with a number of documents equal to "limit".
     ///
@@ -584,7 +591,7 @@ mod tests {
     query_field: Field,
     schema: Schema,
     mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
-) -> (Index, Box<Query>) {
+) -> (Index, Box<dyn Query>) {
     let index = Index::create_in_ram(schema);

     let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
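A quick sketch of what the new Debug impl prints, assuming the public TopDocs::with_limit constructor:

use tantivy::collector::TopDocs;

fn main() {
    let collector = TopDocs::with_limit(10);
    // The Debug impl surfaces only the collector's limit.
    println!("{:?}", collector); // TopDocs(10)
}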
@@ -86,6 +86,7 @@ where
     }
 }

+    #[inline(always)]
     pub fn get(&self, idx: u64) -> u64 {
         if self.num_bits == 0 {
             return 0u64;
@@ -2,7 +2,7 @@ use std::fmt;
 use std::u64;

 #[derive(Clone, Copy, Eq, PartialEq)]
-pub(crate) struct TinySet(u64);
+pub struct TinySet(u64);

 impl fmt::Debug for TinySet {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -353,43 +353,3 @@ mod tests {
         }
     }
 }
-
-#[cfg(all(test, feature = "unstable"))]
-mod bench {
-
-    use super::BitSet;
-    use super::TinySet;
-    use test;
-
-    #[bench]
-    fn bench_tinyset_pop(b: &mut test::Bencher) {
-        b.iter(|| {
-            let mut tinyset = TinySet::singleton(test::black_box(31u32));
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-        });
-    }
-
-    #[bench]
-    fn bench_tinyset_sum(b: &mut test::Bencher) {
-        let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
-        b.iter(|| {
-            assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
-        });
-    }
-
-    #[bench]
-    fn bench_tinyarr_sum(b: &mut test::Bencher) {
-        let v = [10u32, 14u32, 21u32];
-        b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
-    }
-
-    #[bench]
-    fn bench_bitset_initialize(b: &mut test::Bencher) {
-        b.iter(|| BitSet::with_max_value(1_000_000));
-    }
-}
@@ -6,7 +6,7 @@ mod serialize;
 mod vint;

 pub use self::bitset::BitSet;
-pub(crate) use self::bitset::TinySet;
+pub use self::bitset::TinySet;
 pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
 pub use self::counting_writer::CountingWriter;
 pub use self::serialize::{BinarySerializable, FixedSize};
@@ -124,26 +124,24 @@ pub fn f64_to_u64(val: f64) -> u64 {
 /// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
 #[inline(always)]
 pub fn u64_to_f64(val: u64) -> f64 {
-    f64::from_bits(
-        if val & HIGHEST_BIT != 0 {
-            val ^ HIGHEST_BIT
-        } else {
-            !val
-        }
-    )
+    f64::from_bits(if val & HIGHEST_BIT != 0 {
+        val ^ HIGHEST_BIT
+    } else {
+        !val
+    })
 }

 #[cfg(test)]
 pub(crate) mod test {

     pub use super::serialize::test::fixed_size_test;
-    use super::{compute_num_bits, i64_to_u64, u64_to_i64, f64_to_u64, u64_to_f64};
+    use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
     use std::f64;

     fn test_i64_converter_helper(val: i64) {
         assert_eq!(u64_to_i64(i64_to_u64(val)), val);
     }

     fn test_f64_converter_helper(val: f64) {
         assert_eq!(u64_to_f64(f64_to_u64(val)), val);
     }
@@ -172,7 +170,8 @@ pub(crate) mod test {

     #[test]
     fn test_f64_order() {
-        assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY)).contains(&f64_to_u64(f64::NAN))); //nan is not a number
+        assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
+            .contains(&f64_to_u64(f64::NAN))); //nan is not a number
         assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
         assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
         assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
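For reference, u64_to_f64 above inverts the usual order-preserving float encoding: positive floats get their sign bit set, negative floats get all bits flipped, so the resulting u64s sort exactly like the original f64s (NaN excluded, as the test notes). A self-contained sketch of the forward mapping, assuming HIGHEST_BIT is 1 << 63:

const HIGHEST_BIT: u64 = 1 << 63;

fn f64_to_u64(val: f64) -> u64 {
    let bits = val.to_bits();
    if bits & HIGHEST_BIT == 0 {
        // Positive floats: setting the sign bit places them above all negatives.
        bits ^ HIGHEST_BIT
    } else {
        // Negative floats: flipping every bit reverses their order.
        !bits
    }
}

fn main() {
    assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
    assert!(f64_to_u64(-1.0) < f64_to_u64(0.0));
    assert!(f64_to_u64(1.0) < f64_to_u64(1.5));
}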
@@ -459,13 +459,13 @@ mod tests {

 use super::*;
 use std::path::PathBuf;
-use tempdir::TempDir;
+use tempfile::TempDir;

 #[test]
 fn test_index_on_commit_reload_policy_mmap() {
     let schema = throw_away_schema();
     let field = schema.get_field("num_likes").unwrap();
-    let tempdir = TempDir::new("index").unwrap();
+    let tempdir = TempDir::new().unwrap();
     let tempdir_path = PathBuf::from(tempdir.path());
     let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
     let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
@@ -504,7 +504,7 @@ mod tests {
 fn test_index_on_commit_reload_policy_different_directories() {
     let schema = throw_away_schema();
     let field = schema.get_field("num_likes").unwrap();
-    let tempdir = TempDir::new("index").unwrap();
+    let tempdir = TempDir::new().unwrap();
     let tempdir_path = PathBuf::from(tempdir.path());
     let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
     let read_index = Index::open_in_dir(&tempdir_path).unwrap();
@@ -48,14 +48,14 @@ impl RetryPolicy {
 ///
 /// It is transparently associated to a lock file, that gets deleted
 /// on `Drop.` The lock is released automatically on `Drop`.
-pub struct DirectoryLock(Box<dyn Drop + Send + Sync + 'static>);
+pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);

 struct DirectoryLockGuard {
     directory: Box<dyn Directory>,
     path: PathBuf,
 }

-impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
+impl<T: Send + Sync + 'static> From<Box<T>> for DirectoryLock {
     fn from(underlying: Box<T>) -> Self {
         DirectoryLock(underlying)
     }
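The explicit Drop bound could be removed because a boxed value's destructor runs when the box is dropped regardless, and dyn Drop is not a usable trait-object bound in any case. A minimal sketch of the guard pattern the lock relies on (the cleanup body is an assumption):

use std::path::PathBuf;

struct DirectoryLock(Box<dyn Send + Sync + 'static>);

struct LockGuard {
    path: PathBuf,
}

impl Drop for LockGuard {
    fn drop(&mut self) {
        // Hypothetical cleanup: remove the lock file when the guard dies.
        let _ = std::fs::remove_file(&self.path);
    }
}

fn main() {
    let _lock = DirectoryLock(Box::new(LockGuard {
        path: PathBuf::from(".tantivy-meta.lock"),
    }));
} // _lock dropped here; the boxed guard's destructor deletes the file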
@@ -263,11 +263,11 @@ mod tests_mmap_specific {
 use std::collections::HashSet;
 use std::io::Write;
 use std::path::{Path, PathBuf};
-use tempdir::TempDir;
+use tempfile::TempDir;

 #[test]
 fn test_managed_directory() {
-    let tempdir = TempDir::new("tantivy-test").unwrap();
+    let tempdir = TempDir::new().unwrap();
     let tempdir_path = PathBuf::from(tempdir.path());

     let test_path1: &'static Path = Path::new("some_path_for_test");
@@ -304,7 +304,7 @@ mod tests_mmap_specific {
 fn test_managed_directory_gc_while_mmapped() {
     let test_path1: &'static Path = Path::new("some_path_for_test");

-    let tempdir = TempDir::new("index").unwrap();
+    let tempdir = TempDir::new().unwrap();
     let tempdir_path = PathBuf::from(tempdir.path());
     let living_files = HashSet::new();
@@ -36,7 +36,7 @@ use std::sync::Mutex;
 use std::sync::RwLock;
 use std::sync::Weak;
 use std::thread;
-use tempdir::TempDir;
+use tempfile::TempDir;

 /// Create a default io error given a string.
 pub(crate) fn make_io_err(msg: String) -> io::Error {
@@ -294,7 +294,7 @@ impl MmapDirectory {
 /// This is mostly useful to test the MmapDirectory itself.
 /// For your unit tests, prefer the RAMDirectory.
 pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
-    let tempdir = TempDir::new("index").map_err(OpenDirectoryError::IoError)?;
+    let tempdir = TempDir::new().map_err(OpenDirectoryError::IoError)?;
     let tempdir_path = PathBuf::from(tempdir.path());
     MmapDirectory::new(tempdir_path, Some(tempdir))
 }
@@ -642,7 +642,7 @@ mod tests {
 fn test_watch_wrapper() {
     let counter: Arc<AtomicUsize> = Default::default();
     let counter_clone = counter.clone();
-    let tmp_dir: TempDir = tempdir::TempDir::new("test_watch_wrapper").unwrap();
+    let tmp_dir = tempfile::TempDir::new().unwrap();
     let tmp_dirpath = tmp_dir.path().to_owned();
     let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
     let tmp_file = tmp_dirpath.join("coucou");
@@ -177,7 +177,7 @@ impl Directory for RAMDirectory {
 fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
     fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
         io::ErrorKind::Other,
-        msg.unwrap_or("Undefined".to_string())
+        msg.unwrap_or_else(|| "Undefined".to_string())
     )));
     let path_buf = PathBuf::from(path);
@@ -431,111 +431,3 @@ mod tests {
     }

 }
-
-#[cfg(all(test, feature = "unstable"))]
-mod bench {
-    use super::tests::FIELD;
-    use super::tests::{generate_permutation, SCHEMA};
-    use super::*;
-    use common::CompositeFile;
-    use directory::{Directory, RAMDirectory, WritePtr};
-    use fastfield::FastFieldReader;
-    use std::collections::HashMap;
-    use std::path::Path;
-    use test::{self, Bencher};
-
-    #[bench]
-    fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        b.iter(|| {
-            let n = test::black_box(7000u32);
-            let mut a = 0u64;
-            for i in (0u32..n / 7).map(|v| v * 7) {
-                a ^= permutation[i as usize];
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_veclookup(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        b.iter(|| {
-            let n = test::black_box(1000u32);
-            let mut a = 0u64;
-            for _ in 0u32..n {
-                a = permutation[a as usize];
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
-        let path = Path::new("test");
-        let permutation = generate_permutation();
-        let mut directory: RAMDirectory = RAMDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
-            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let source = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data);
-
-            b.iter(|| {
-                let n = test::black_box(7000u32);
-                let mut a = 0u64;
-                for i in (0u32..n / 7).map(|val| val * 7) {
-                    a ^= fast_field_reader.get(i);
-                }
-                a
-            });
-        }
-    }
-
-    #[bench]
-    fn bench_intfastfield_fflookup(b: &mut Bencher) {
-        let path = Path::new("test");
-        let permutation = generate_permutation();
-        let mut directory: RAMDirectory = RAMDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
-            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let source = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data);
-
-            b.iter(|| {
-                let n = test::black_box(1000u32);
-                let mut a = 0u32;
-                for _ in 0u32..n {
-                    a = fast_field_reader.get(a) as u32;
-                }
-                a
-            });
-        }
-    }
-}
@@ -67,10 +67,12 @@ impl<Item: FastValue> FastFieldReader<Item> {
     ///
     /// May panic if `doc` is greater than the segment
     // `maxdoc`.
     #[inline(always)]
     pub fn get(&self, doc: DocId) -> Item {
         self.get_u64(u64::from(doc))
     }

     #[inline(always)]
     pub(crate) fn get_u64(&self, doc: u64) -> Item {
         Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
     }
@@ -31,7 +31,9 @@ impl FastFieldsWriter {
         _ => 0u64,
     };
     match *field_entry.field_type() {
-        FieldType::I64(ref int_options) | FieldType::U64(ref int_options) | FieldType::F64(ref int_options) => {
+        FieldType::I64(ref int_options)
+        | FieldType::U64(ref int_options)
+        | FieldType::F64(ref int_options) => {
             match int_options.get_fastfield_cardinality() {
                 Some(Cardinality::SingleValue) => {
                     let mut fast_field_writer = IntFastFieldWriter::new(field);
@@ -761,7 +761,6 @@ mod tests {
     use crate::Index;
     use crate::ReloadPolicy;
     use crate::Term;
-    use fail;

     #[test]
     fn test_operations_group() {
src/lib.rs (28 changed lines)
@@ -1,9 +1,9 @@
 #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
-#![recursion_limit = "100"]
 #![cfg_attr(all(feature = "unstable", test), feature(test))]
 #![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
 #![doc(test(attr(allow(unused_variables), deny(warnings))))]
 #![warn(missing_docs)]
+#![recursion_limit = "80"]

 //! # `tantivy`
 //!
@@ -12,7 +12,7 @@
 //!
 //! ```rust
-//! # extern crate tempdir;
+//! # extern crate tempfile;
 //! #
 //! #[macro_use]
 //! extern crate tantivy;
@@ -20,7 +20,7 @@
 //! // ...
 //!
 //! # use std::path::Path;
-//! # use tempdir::TempDir;
+//! # use tempfile::TempDir;
 //! # use tantivy::Index;
 //! # use tantivy::schema::*;
 //! # use tantivy::{Score, DocAddress};
@@ -30,7 +30,7 @@
 //! # fn main() {
 //! #     // Let's create a temporary directory for the
 //! #     // sake of this example
-//! #     if let Ok(dir) = TempDir::new("tantivy_example_dir") {
+//! #     if let Ok(dir) = TempDir::new() {
 //! #         run_example(dir.path()).unwrap();
 //! #         dir.close().unwrap();
 //! #     }
@@ -171,16 +171,16 @@ pub use self::snippet::{Snippet, SnippetGenerator};
 mod docset;
 pub use self::docset::{DocSet, SkipResult};

+pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
 pub use crate::core::SegmentComponent;
 pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
 pub use crate::core::{InvertedIndexReader, SegmentReader};
 pub use crate::directory::Directory;
 pub use crate::indexer::IndexWriter;
 pub use crate::postings::Postings;
 pub use crate::reader::LeasedItem;
 pub use crate::schema::{Document, Term};
-
-pub use crate::common::{i64_to_u64, u64_to_i64, f64_to_u64, u64_to_f64};

 /// Expose the current version of tantivy, as well
 /// whether it was compiled with the simd compression.
 pub fn version() -> &'static str {
@@ -249,7 +249,6 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);

 #[cfg(test)]
 mod tests {
-
     use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
     use crate::core::SegmentReader;
     use crate::docset::DocSet;
@@ -849,7 +848,8 @@ mod tests {
     let index = Index::create_in_ram(schema);
     let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
     {
-        let document = doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
+        let document =
+            doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
         index_writer.add_document(document);
         index_writer.commit().unwrap();
     }
@@ -894,3 +894,15 @@ mod tests {
     }
 }
 }
+
+#[cfg(feature = "forbench")]
+pub mod forbench {
+    pub mod compression {
+        pub use crate::postings::compression::*;
+    }
+
+    pub mod bitset {
+        pub use crate::common::BitSet;
+        pub use crate::common::TinySet;
+    }
+}
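The forbench module exists only so the external criterion benches can reach otherwise crate-private internals. A hedged sketch of a consumer (it compiles only when tantivy is built with the forbench feature, as run-bench.rs does):

use tantivy::forbench::bitset::BitSet;
use tantivy::forbench::compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};

fn main() {
    // Internals re-exported strictly for benchmarking purposes.
    let mut bitset = BitSet::with_max_value(1_000);
    bitset.insert(42);

    let data: Vec<u32> = (0..COMPRESSION_BLOCK_SIZE as u32).collect();
    let mut encoder = BlockEncoder::new();
    let (_num_bits, _compressed) = encoder.compress_block_sorted(&data, 0u32);
}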
@@ -160,9 +160,9 @@ impl VIntEncoder for BlockEncoder {
 }

 impl VIntDecoder for BlockDecoder {
-    fn uncompress_vint_sorted<'a>(
+    fn uncompress_vint_sorted(
         &mut self,
-        compressed_data: &'a [u8],
+        compressed_data: &[u8],
         offset: u32,
         num_els: usize,
     ) -> usize {
@@ -170,7 +170,7 @@ impl VIntDecoder for BlockDecoder {
         vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
     }

-    fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
+    fn uncompress_vint_unsorted(&mut self, compressed_data: &[u8], num_els: usize) -> usize {
         self.output_len = num_els;
         vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
     }
@@ -268,78 +268,17 @@ pub mod tests {
     }
 }
 }

-#[cfg(all(test, feature = "unstable"))]
-mod bench {
-
-    use super::*;
-    use rand::SeedableRng;
-    use rand::{Rng, XorShiftRng};
-    use test::Bencher;
-
-    fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
-        let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
-        let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
-        (0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
-    }
-
-    pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
-        generate_array_with_seed(n, ratio, 4)
-    }
-
-    #[bench]
-    fn bench_compress(b: &mut Bencher) {
-        let mut encoder = BlockEncoder::new();
-        let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
-        b.iter(|| {
-            encoder.compress_block_sorted(&data, 0u32);
-        });
-    }
-
-    #[bench]
-    fn bench_uncompress(b: &mut Bencher) {
-        let mut encoder = BlockEncoder::new();
-        let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
-        let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
-        let mut decoder = BlockDecoder::new();
-        b.iter(|| {
-            decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
-        });
-    }
-
     #[test]
     fn test_all_docs_compression_numbits() {
-        for expected_num_bits in 0u8.. {
+        for expected_num_bits in 0u8..33u8 {
             let mut data = [0u32; 128];
             if expected_num_bits > 0 {
-                data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
+                data[0] = (1u64 << (expected_num_bits as u64) - 1u64) as u32;
             }
             let mut encoder = BlockEncoder::new();
             let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
             assert_eq!(compressed.len(), compressed_block_size(num_bits));
         }
     }
-
-    const NUM_INTS_BENCH_VINT: usize = 10;
-
-    #[bench]
-    fn bench_compress_vint(b: &mut Bencher) {
-        let mut encoder = BlockEncoder::new();
-        let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
-        b.iter(|| {
-            encoder.compress_vint_sorted(&data, 0u32);
-        });
-    }
-
-    #[bench]
-    fn bench_uncompress_vint(b: &mut Bencher) {
-        let mut encoder = BlockEncoder::new();
-        let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
-        let compressed = encoder.compress_vint_sorted(&data, 0u32);
-        let mut decoder = BlockDecoder::new();
-        b.iter(|| {
-            decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
-        });
-    }
-}
@@ -3,6 +3,7 @@ Postings module (also called inverted index)
 */

 mod block_search;

 pub(crate) mod compression;
 /// Postings module
 ///
@@ -218,49 +218,3 @@ mod tests {
     }
 }

-#[cfg(all(test, feature = "unstable"))]
-mod bench {
-
-    use super::BitSet;
-    use super::BitSetDocSet;
-    use test;
-    use tests;
-    use DocSet;
-
-    #[bench]
-    fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
-        use tests;
-        let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
-        b.iter(|| {
-            let mut bitset = BitSet::with_max_value(1_000_000);
-            for el in els.iter().cloned() {
-                bitset.insert(el);
-            }
-        });
-    }
-
-    #[bench]
-    fn bench_bitset_1pct_clone(b: &mut test::Bencher) {
-        use tests;
-        let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
-        let mut bitset = BitSet::with_max_value(1_000_000);
-        for el in els {
-            bitset.insert(el);
-        }
-        b.iter(|| bitset.clone());
-    }
-
-    #[bench]
-    fn bench_bitset_1pct_clone_iterate(b: &mut test::Bencher) {
-        let els = tests::sample(1_000_000u32, 0.01);
-        let mut bitset = BitSet::with_max_value(1_000_000);
-        for el in els {
-            bitset.insert(el);
-        }
-        b.iter(|| {
-            let mut docset = BitSetDocSet::from(bitset.clone());
-            while docset.advance() {}
-        });
-    }
-}
@@ -1,3 +1,4 @@
+use crate::error::TantivyError::InvalidArgument;
 use crate::query::{AutomatonWeight, Query, Weight};
 use crate::schema::Term;
 use crate::Result;
@@ -5,11 +6,16 @@ use crate::Searcher;
 use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
 use once_cell::sync::Lazy;
 use std::collections::HashMap;
+use std::ops::Range;
+
+/// The range of Levenshtein distances that we will build DFAs for.
+/// The computation is exponential, so it is best kept to low single digits.
+const VALID_LEVENSHTEIN_DISTANCE_RANGE: Range<u8> = 0..3;

 static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Lazy::new(|| {
     let mut lev_builder_cache = HashMap::new();
     // TODO make population lazy on a `(distance, val)` basis
-    for distance in 0..3 {
+    for distance in VALID_LEVENSHTEIN_DISTANCE_RANGE {
         for &transposition in &[false, true] {
             let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition);
             lev_builder_cache.insert((distance, transposition), lev_automaton_builder);
@@ -100,10 +106,18 @@ impl FuzzyTermQuery {
 }

 fn specialized_weight(&self) -> Result<AutomatonWeight<DFA>> {
-    let automaton = LEV_BUILDER.get(&(self.distance, false))
-        .unwrap() // TODO return an error
-        .build_dfa(self.term.text());
-    Ok(AutomatonWeight::new(self.term.field(), automaton))
+    // LEV_BUILDER is a HashMap, whose `get` method returns an Option
+    match LEV_BUILDER.get(&(self.distance, false)) {
+        // Unwrap the option and build the Ok(AutomatonWeight)
+        Some(automaton_builder) => {
+            let automaton = automaton_builder.build_dfa(self.term.text());
+            Ok(AutomatonWeight::new(self.term.field(), automaton))
+        }
+        None => Err(InvalidArgument(format!(
+            "Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
+            self.distance, VALID_LEVENSHTEIN_DISTANCE_RANGE
+        ))),
+    }
 }
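A hedged usage sketch of the new validation path: distances inside VALID_LEVENSHTEIN_DISTANCE_RANGE have a prebuilt DFA, anything larger now surfaces InvalidArgument from weight() instead of panicking (constructor signature assumed from the 0.11-era API):

use tantivy::query::FuzzyTermQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    let term = Term::from_field_text(title, "tantivvy");
    // Distance 2 lies inside 0..3: a DFA exists in LEV_BUILDER.
    let _ok_query = FuzzyTermQuery::new(term.clone(), 2, true);
    // Distance 10 has no prebuilt DFA; its weight() now returns InvalidArgument.
    let _bad_query = FuzzyTermQuery::new(term, 10, true);
}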
@@ -18,7 +18,6 @@ pub enum LogicalLiteral {
     All,
 }

-#[derive(Clone)]
 pub enum LogicalAST {
     Clause(Vec<(Occur, LogicalAST)>),
     Leaf(Box<LogicalLiteral>),
@@ -1,4 +1,3 @@
-use super::query_grammar;
 use super::user_input_ast::*;
 use crate::query::occur::Occur;
 use crate::query::query_parser::user_input_ast::UserInputBound;
@@ -13,22 +12,25 @@ parser! {
     (
         letter(),
         many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
-    ).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
+    ).skip(char(':')).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
     }
 }

 parser! {
     fn word[I]()(I) -> String
     where [I: Stream<Item = char>] {
-        many1(satisfy(|c: char| c.is_alphanumeric() || c=='.'))
-            .and_then(|s: String| {
-                match s.as_str() {
-                    "OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
-                    "AND" => Err(StreamErrorFor::<I>::unexpected_static_message("AND")),
-                    "NOT" => Err(StreamErrorFor::<I>::unexpected_static_message("NOT")),
-                    _ => Ok(s)
-                }
-            })
+        (
+            satisfy(|c: char| !c.is_whitespace() && !['-', '`', ':', '{', '}', '"', '[', ']', '(',')'].contains(&c) ),
+            many(satisfy(|c: char| !c.is_whitespace() && ![':', '{', '}', '"', '[', ']', '(',')'].contains(&c)))
+        )
+            .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
+            .and_then(|s: String|
+                match s.as_str() {
+                    "OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
+                    "AND" => Err(StreamErrorFor::<I>::unexpected_static_message("AND")),
+                    "NOT" => Err(StreamErrorFor::<I>::unexpected_static_message("NOT")),
+                    _ => Ok(s)
+                })
     }
 }
@@ -37,12 +39,13 @@ parser! {
     where [I: Stream<Item = char>]
 {
     let term_val = || {
-        let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
+        let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
         phrase.or(word())
     };
     let term_val_with_field = negative_number().or(term_val());
     let term_query =
-        (field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
+        (field(), term_val_with_field)
+            .map(|(field_name, phrase)| UserInputLiteral {
             field_name: Some(field_name),
             phrase,
         });
@@ -60,8 +63,15 @@ parser! {
     fn negative_number[I]()(I) -> String
     where [I: Stream<Item = char>]
 {
-    (char('-'), many1(satisfy(char::is_numeric)))
-        .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
+    (char('-'), many1(satisfy(char::is_numeric)),
+     optional((char('.'), many1(satisfy(char::is_numeric)))))
+        .map(|(s1, s2, s3): (char, String, Option<(char, String)>)| {
+            if let Some(('.', s3)) = s3 {
+                format!("{}{}.{}", s1, s2, s3)
+            } else {
+                format!("{}{}", s1, s2)
+            }
+        })
 }
 }
@@ -75,27 +85,23 @@ parser! {
 parser! {
     fn range[I]()(I) -> UserInputLeaf
     where [I: Stream<Item = char>] {
-        let term_val = || {
-            word().or(negative_number()).or(char('*').map(|_| "*".to_string()))
-        };
-        let lower_bound = {
-            let excl = (char('{'), term_val()).map(|(_, w)| UserInputBound::Exclusive(w));
-            let incl = (char('['), term_val()).map(|(_, w)| UserInputBound::Inclusive(w));
-            attempt(excl).or(incl)
-        };
-        let upper_bound = {
-            let excl = (term_val(), char('}')).map(|(w, _)| UserInputBound::Exclusive(w));
-            let incl = (term_val(), char(']')).map(|(w, _)| UserInputBound::Inclusive(w));
-            attempt(excl).or(incl)
-        };
+        let range_term_val = || {
+            word().or(negative_number()).or(char('*').with(value("*".to_string())))
+        };
+        let lower_bound = (one_of("{[".chars()), range_term_val())
+            .map(|(boundary_char, lower_bound): (char, String)|
+                if boundary_char == '{' { UserInputBound::Exclusive(lower_bound) }
+                else { UserInputBound::Inclusive(lower_bound) });
+        let upper_bound = (range_term_val(), one_of("}]".chars()))
+            .map(|(higher_bound, boundary_char): (String, char)|
+                if boundary_char == '}' { UserInputBound::Exclusive(higher_bound) }
+                else { UserInputBound::Inclusive(higher_bound) });
        (
-            optional((field(), char(':')).map(|x| x.0)),
-            lower_bound,
-            spaces(),
-            string("TO"),
-            spaces(),
+            optional(field()),
+            lower_bound
+                .skip((spaces(), string("TO"), spaces())),
            upper_bound,
-        ).map(|(field, lower, _, _, _, upper)| UserInputLeaf::Range {
+        ).map(|(field, lower, upper)| UserInputLeaf::Range {
            field,
            lower,
            upper
@@ -103,25 +109,28 @@ parser! {
 }
 }

+fn negate(expr: UserInputAST) -> UserInputAST {
+    expr.unary(Occur::MustNot)
+}
+
+fn must(expr: UserInputAST) -> UserInputAST {
+    expr.unary(Occur::Must)
+}
+
 parser! {
     fn leaf[I]()(I) -> UserInputAST
     where [I: Stream<Item = char>] {
-        (char('-'), leaf()).map(|(_, expr)| expr.unary(Occur::MustNot) )
-            .or((char('+'), leaf()).map(|(_, expr)| expr.unary(Occur::Must) ))
-            .or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr))
-            .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) ))
-            .or(attempt(
-                (string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot))
-            ))
-            .or(attempt(
-                range().map(UserInputAST::from)
-            ))
-            .or(literal().map(|leaf| UserInputAST::Leaf(Box::new(leaf))))
+        char('-').with(leaf()).map(negate)
+            .or(char('+').with(leaf()).map(must))
+            .or(char('(').with(ast()).skip(char(')')))
+            .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All)))
+            .or(attempt(string("NOT").skip(spaces1()).with(leaf()).map(negate)))
+            .or(attempt(range().map(UserInputAST::from)))
+            .or(literal().map(UserInputAST::from))
     }
 }

 #[derive(Clone, Copy)]
 enum BinaryOperand {
     Or,
     And,
@@ -129,27 +138,54 @@ enum BinaryOperand {

 parser! {
     fn binary_operand[I]()(I) -> BinaryOperand
-    where [I: Stream<Item = char>] {
-        (spaces1(),
-         (
-             string("AND").map(|_| BinaryOperand::And)
-                 .or(string("OR").map(|_| BinaryOperand::Or))
-         ),
-         spaces1()).map(|(_, op,_)| op)
+    where [I: Stream<Item = char>]
+    {
+        string("AND").with(value(BinaryOperand::And))
+            .or(string("OR").with(value(BinaryOperand::Or)))
     }
 }

-enum Element {
-    SingleEl(UserInputAST),
-    NormalDisjunctive(Vec<Vec<UserInputAST>>),
-}
-
-impl Element {
-    pub fn into_dnf(self) -> Vec<Vec<UserInputAST>> {
-        match self {
-            Element::NormalDisjunctive(conjunctions) => conjunctions,
-            Element::SingleEl(el) => vec![vec![el]],
-        }
-    }
-}
+fn aggregate_binary_expressions(
+    left: UserInputAST,
+    others: Vec<(BinaryOperand, UserInputAST)>,
+) -> UserInputAST {
+    let mut dnf: Vec<Vec<UserInputAST>> = vec![vec![left]];
+    for (operator, operand_ast) in others {
+        match operator {
+            BinaryOperand::And => {
+                if let Some(last) = dnf.last_mut() {
+                    last.push(operand_ast);
+                }
+            }
+            BinaryOperand::Or => {
+                dnf.push(vec![operand_ast]);
+            }
+        }
+    }
+    if dnf.len() == 1 {
+        UserInputAST::and(dnf.into_iter().next().unwrap()) //< safe
+    } else {
+        let conjunctions = dnf.into_iter().map(UserInputAST::and).collect();
+        UserInputAST::or(conjunctions)
+    }
+}
+
+parser! {
+    pub fn ast[I]()(I) -> UserInputAST
+    where [I: Stream<Item = char>]
+    {
+        let operand_leaf = (binary_operand().skip(spaces()), leaf().skip(spaces()));
+        let boolean_expr = (leaf().skip(spaces().silent()), many1(operand_leaf)).map(
+            |(left, right)| aggregate_binary_expressions(left, right));
+        let whitespace_separated_leaves = many1(leaf().skip(spaces().silent()))
+            .map(|subqueries: Vec<UserInputAST>|
+                if subqueries.len() == 1 {
+                    subqueries.into_iter().next().unwrap()
+                } else {
+                    UserInputAST::Clause(subqueries.into_iter().collect())
+                });
+        let expr = attempt(boolean_expr).or(whitespace_separated_leaves);
+        spaces().with(expr).skip(spaces())
+    }
+}
@@ -157,56 +193,7 @@ parser! {
     pub fn parse_to_ast[I]()(I) -> UserInputAST
     where [I: Stream<Item = char>]
 {
-    (
-        attempt(
-            chainl1(
-                leaf().map(Element::SingleEl),
-                binary_operand().map(|op: BinaryOperand|
-                    move |left: Element, right: Element| {
-                        let mut dnf = left.into_dnf();
-                        if let Element::SingleEl(el) = right {
-                            match op {
-                                BinaryOperand::And => {
-                                    if let Some(last) = dnf.last_mut() {
-                                        last.push(el);
-                                    }
-                                }
-                                BinaryOperand::Or => {
-                                    dnf.push(vec!(el));
-                                }
-                            }
-                        } else {
-                            unreachable!("Please report.")
-                        }
-                        Element::NormalDisjunctive(dnf)
-                    }
-                )
-            )
-            .map(query_grammar::Element::into_dnf)
-            .map(|fnd| {
-                if fnd.len() == 1 {
-                    UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe
-                } else {
-                    let conjunctions = fnd
-                        .into_iter()
-                        .map(UserInputAST::and)
-                        .collect();
-                    UserInputAST::or(conjunctions)
-                }
-            })
-        )
-        .or(
-            sep_by(leaf(), spaces())
-                .map(|subqueries: Vec<UserInputAST>| {
-                    if subqueries.len() == 1 {
-                        subqueries.into_iter().next().unwrap()
-                    } else {
-                        UserInputAST::Clause(subqueries.into_iter().collect())
-                    }
-                })
-        )
-    )
+    spaces().with(optional(ast()).skip(eof())).map(|opt_ast| opt_ast.unwrap_or_else(UserInputAST::empty_query))
 }
 }
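A standalone sketch of the grouping logic aggregate_binary_expressions implements, with plain strings standing in for UserInputAST (a hypothetical simplification): AND extends the current conjunction, OR opens a new one, yielding disjunctive normal form.

#[derive(Clone, Copy)]
enum BinaryOperand {
    Or,
    And,
}

// Outer Vec = OR clauses, inner Vec = AND-ed terms (disjunctive normal form).
fn aggregate(left: &str, others: &[(BinaryOperand, &str)]) -> Vec<Vec<String>> {
    let mut dnf: Vec<Vec<String>> = vec![vec![left.to_string()]];
    for (op, operand) in others {
        match op {
            BinaryOperand::And => dnf.last_mut().unwrap().push(operand.to_string()),
            BinaryOperand::Or => dnf.push(vec![operand.to_string()]),
        }
    }
    dnf
}

fn main() {
    // "a AND b OR c" groups as (a AND b) OR (c).
    let dnf = aggregate("a", &[(BinaryOperand::And, "b"), (BinaryOperand::Or, "c")]);
    assert_eq!(dnf, vec![vec!["a".to_string(), "b".to_string()], vec!["c".to_string()]]);
}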
@@ -225,6 +212,18 @@ mod test {
     assert!(parse_to_ast().parse(query).is_err());
 }

+#[test]
+fn test_parse_empty_to_ast() {
+    test_parse_query_to_ast_helper("", "<emptyclause>");
+}
+
+#[test]
+fn test_parse_query_to_ast_hyphen() {
+    test_parse_query_to_ast_helper("\"www-form-encoded\"", "\"www-form-encoded\"");
+    test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
+}
+
 #[test]
 fn test_parse_query_to_ast_not_op() {
     assert_eq!(
@@ -259,8 +258,24 @@ mod test {
     );
 }

+#[test]
+fn test_parse_query_to_triming_spaces() {
+    test_parse_query_to_ast_helper(" abc", "\"abc\"");
+    test_parse_query_to_ast_helper("abc ", "\"abc\"");
+    test_parse_query_to_ast_helper("( a OR abc)", "(?(\"a\") ?(\"abc\"))");
+    test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))");
+    test_parse_query_to_ast_helper("a OR abc ", "(?(\"a\") ?(\"abc\"))");
+    test_parse_query_to_ast_helper("(a OR abc )", "(?(\"a\") ?(\"abc\"))");
+    test_parse_query_to_ast_helper("(a OR abc) ", "(?(\"a\") ?(\"abc\"))");
+}
+
 #[test]
 fn test_parse_query_to_ast() {
     test_parse_query_to_ast_helper("abc", "\"abc\"");
     test_parse_query_to_ast_helper("a b", "(\"a\" \"b\")");
     test_parse_query_to_ast_helper("+(a b)", "+((\"a\" \"b\"))");
     test_parse_query_to_ast_helper("+d", "+(\"d\")");
     test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))");
     test_parse_query_to_ast_helper("(+a +b) d", "((+(\"a\") +(\"b\")) \"d\")");
     test_parse_query_to_ast_helper("(+a)", "+(\"a\")");
@@ -18,42 +18,56 @@ use crate::schema::{FieldType, Term};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use combine::Parser;
|
||||
use std::borrow::Cow;
|
||||
use std::num::{ParseIntError, ParseFloatError};
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
#[derive(Debug, PartialEq, Eq, Fail)]
|
||||
pub enum QueryParserError {
|
||||
/// Error in the query syntax
|
||||
#[fail(display = "Syntax Error")]
|
||||
SyntaxError,
|
||||
/// `FieldDoesNotExist(field_name: String)`
|
||||
/// The query references a field that is not in the schema
|
||||
#[fail(display = "File does not exists: '{:?}'", _0)]
|
||||
FieldDoesNotExist(String),
|
||||
/// The query contains a term for a `u64` or `i64`-field, but the value
|
||||
/// is neither.
|
||||
#[fail(display = "Expected a valid integer: '{:?}'", _0)]
|
||||
ExpectedInt(ParseIntError),
|
||||
/// The query contains a term for a `f64`-field, but the value
|
||||
/// is not a f64.
|
||||
#[fail(display = "Invalid query: Only excluding terms given")]
|
||||
ExpectedFloat(ParseFloatError),
|
||||
/// It is forbidden queries that are only "excluding". (e.g. -title:pop)
|
||||
#[fail(display = "Invalid query: Only excluding terms given")]
|
||||
AllButQueryForbidden,
|
||||
/// If no default field is declared, running a query without any
|
||||
/// field specified is forbbidden.
|
||||
#[fail(display = "No default field declared and no field specified in query")]
|
||||
NoDefaultFieldDeclared,
|
||||
/// The field searched for is not declared
|
||||
/// as indexed in the schema.
|
||||
#[fail(display = "The field '{:?}' is not declared as indexed", _0)]
|
||||
FieldNotIndexed(String),
|
||||
/// A phrase query was requested for a field that does not
|
||||
/// have any positions indexed.
|
||||
#[fail(display = "The field '{:?}' does not have positions indexed", _0)]
|
||||
FieldDoesNotHavePositionsIndexed(String),
|
||||
/// The tokenizer for the given field is unknown
|
||||
/// The two argument strings are the name of the field, the name of the tokenizer
|
||||
#[fail(
|
||||
display = "The tokenizer '{:?}' for the field '{:?}' is unknown",
|
||||
_0, _1
|
||||
)]
|
||||
UnknownTokenizer(String, String),
|
||||
/// The query contains a range query with a phrase as one of the bounds.
|
||||
/// Only terms can be used as bounds.
|
||||
#[fail(display = "A range query cannot have a phrase as one of the bounds")]
|
||||
RangeMustNotHavePhrase,
|
||||
/// The format for the date field is not RFC 3339 compliant.
|
||||
#[fail(display = "The date field has an invalid format")]
|
||||
DateFormatError(chrono::ParseError),
|
||||
}
|
||||
|
||||
@@ -676,7 +690,7 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_disjunction() {
|
||||
pub fn test_parse_query_to_ast_single_term() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
@@ -700,6 +714,10 @@ mod test {
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_two_terms() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
|
||||
@@ -712,6 +730,10 @@ mod test {
|
||||
(1, Term([0, 0, 0, 0, 98]))]\"",
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_ranges() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:[a TO b]",
|
||||
"(Included(Term([0, 0, 0, 0, 97])) TO \
|
||||
@@ -743,6 +765,19 @@ mod test {
|
||||
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"signed:{-5 TO 3}",
|
||||
"(Excluded(Term([0, 0, 0, 2, 127, 255, 255, 255, 255, 255, 255, 251])) TO \
|
||||
Excluded(Term([0, 0, 0, 2, 128, 0, 0, 0, 0, 0, 0, 3])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"float:{-1.5 TO 1.5}",
|
||||
"(Excluded(Term([0, 0, 0, 10, 64, 7, 255, 255, 255, 255, 255, 255])) TO \
|
||||
Excluded(Term([0, 0, 0, 10, 191, 248, 0, 0, 0, 0, 0, 0])))",
|
||||
false,
|
||||
);
|
||||
|
||||
test_parse_query_to_logical_ast_helper("*", "*", false);
|
||||
}
|
||||
|
||||
@@ -879,4 +914,15 @@ mod test {
            true,
        );
    }

    #[test]
    pub fn test_query_parser_hyphen() {
        test_parse_query_to_logical_ast_helper(
            "title:www-form-encoded",
            "\"[(0, Term([0, 0, 0, 0, 119, 119, 119])), \
             (1, Term([0, 0, 0, 0, 102, 111, 114, 109])), \
             (2, Term([0, 0, 0, 0, 101, 110, 99, 111, 100, 101, 100]))]\"",
            false,
        );
    }
}

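This is the hyphen handling noted in the 0.11.0 changelog (#609): `title:www-form-encoded` now parses as a phrase of three tokens at positions 0, 1 and 2. Each `Term([...])` dump is again four field-id bytes followed by the token's UTF-8 bytes; a quick sketch of decoding one:

// Decode the Term([...]) dumps from the test above: 4 bytes of
// big-endian field id, then the token's UTF-8 bytes.
fn decode_term(bytes: &[u8]) -> (u32, String) {
    let field = u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
    (field, String::from_utf8(bytes[4..].to_vec()).unwrap())
}

fn main() {
    assert_eq!(
        decode_term(&[0, 0, 0, 0, 119, 119, 119]),
        (0, "www".to_string())
    );
    assert_eq!(
        decode_term(&[0, 0, 0, 0, 102, 111, 114, 109]),
        (0, "form".to_string())
    );
}
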
@@ -1,44 +0,0 @@
use std::sync::Arc;
use stemmer;

pub struct StemmerTokenStream<TailTokenStream>
    where TailTokenStream: TokenStream {
    tail: TailTokenStream,
    stemmer: Arc<stemmer::Stemmer>,
}

impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
    where TailTokenStream: TokenStream {

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }

    fn advance(&mut self) -> bool {
        if self.tail.advance() {
            // self.tail.token_mut().term.make_ascii_lowercase();
            let new_str = self.stemmer.stem_str(&self.token().term);
            true
        } else {
            false
        }
    }
}

impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
    where TailTokenStream: TokenStream {

    fn wrap(stemmer: Arc<stemmer::Stemmer>, tail: TailTokenStream) -> StemmerTokenStream<TailTokenStream> {
        StemmerTokenStream {
            tail,
            stemmer,
        }
    }
}

@@ -80,9 +80,6 @@ impl UserInputBound {
pub enum UserInputAST {
    Clause(Vec<UserInputAST>),
    Unary(Occur, Box<UserInputAST>),
    // Not(Box<UserInputAST>),
    // Should(Box<UserInputAST>),
    // Must(Box<UserInputAST>),
    Leaf(Box<UserInputLeaf>),
}

@@ -92,7 +89,7 @@ impl UserInputAST {
    }

    fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
        assert!(occur != Occur::MustNot);
        assert_ne!(occur, Occur::MustNot);
        assert!(!asts.is_empty());
        if asts.len() == 1 {
            asts.into_iter().next().unwrap() //< safe
@@ -105,6 +102,10 @@ impl UserInputAST {
        }
    }

    pub fn empty_query() -> UserInputAST {
        UserInputAST::Clause(Vec::default())
    }

    pub fn and(asts: Vec<UserInputAST>) -> UserInputAST {
        UserInputAST::compose(Occur::Must, asts)
    }

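The contract of `compose` is worth spelling out: it must never be called with `Occur::MustNot` (hence the `assert_ne!`), never with an empty list, and a singleton list collapses to its only element instead of allocating a clause. A toy model of that collapsing rule, with the `Occur` parameter left out for brevity (simplified types, not the real `UserInputAST`):

// Toy model of the compose/and behavior above: a singleton input
// collapses, anything longer is wrapped into a clause.
#[derive(Debug, PartialEq)]
enum Ast {
    Clause(Vec<Ast>),
    Leaf(&'static str),
}

fn compose(asts: Vec<Ast>) -> Ast {
    assert!(!asts.is_empty());
    if asts.len() == 1 {
        asts.into_iter().next().unwrap() // safe: length checked above
    } else {
        Ast::Clause(asts)
    }
}

fn main() {
    assert_eq!(compose(vec![Ast::Leaf("a")]), Ast::Leaf("a"));
    assert_eq!(
        compose(vec![Ast::Leaf("a"), Ast::Leaf("b")]),
        Ast::Clause(vec![Ast::Leaf("a"), Ast::Leaf("b")])
    );
}
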
@@ -114,42 +115,6 @@ impl UserInputAST {
    }
}

/*
impl UserInputAST {

    fn compose_occur(self, occur: Occur) -> UserInputAST {
        match self {
            UserInputAST::Not(other) => {
                let new_occur = compose_occur(Occur::MustNot, occur);
                other.simplify()
            }
            _ => {
                self
            }
        }
    }

    pub fn simplify(self) -> UserInputAST {
        match self {
            UserInputAST::Clause(els) => {
                if els.len() == 1 {
                    return els.into_iter().next().unwrap();
                } else {
                    return self;
                }
            }
            UserInputAST::Not(els) => {
                if els.len() == 1 {
                    return els.into_iter().next().unwrap();
                } else {
                    return self;
                }
            }
        }
    }
}
*/

impl From<UserInputLiteral> for UserInputLeaf {
    fn from(literal: UserInputLiteral) -> UserInputLeaf {
        UserInputLeaf::Literal(literal)

@@ -460,7 +460,10 @@ mod tests {
        let count_multiples =
            |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();

        assert_eq!(count_multiples(RangeQuery::new_f64(float_field, 10.0..11.0)), 9);
        assert_eq!(
            count_multiples(RangeQuery::new_f64(float_field, 10.0..11.0)),
            9
        );
        assert_eq!(
            count_multiples(RangeQuery::new_f64_bounds(
                float_field,

@@ -411,52 +411,3 @@ mod tests {
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {

    use query::score_combiner::DoNothingCombiner;
    use query::ConstScorer;
    use query::Union;
    use query::VecDocSet;
    use test::Bencher;
    use tests;
    use DocId;
    use DocSet;

    #[bench]
    fn bench_union_3_high(bench: &mut Bencher) {
        let union_docset: Vec<Vec<DocId>> = vec![
            tests::sample_with_seed(100_000, 0.1, 0),
            tests::sample_with_seed(100_000, 0.2, 1),
        ];
        bench.iter(|| {
            let mut v = Union::<_, DoNothingCombiner>::from(
                union_docset
                    .iter()
                    .map(|doc_ids| VecDocSet::from(doc_ids.clone()))
                    .map(ConstScorer::new)
                    .collect::<Vec<_>>(),
            );
            while v.advance() {}
        });
    }

    #[bench]
    fn bench_union_3_low(bench: &mut Bencher) {
        let union_docset: Vec<Vec<DocId>> = vec![
            tests::sample_with_seed(100_000, 0.01, 0),
            tests::sample_with_seed(100_000, 0.05, 1),
            tests::sample_with_seed(100_000, 0.001, 2),
        ];
        bench.iter(|| {
            let mut v = Union::<_, DoNothingCombiner>::from(
                union_docset
                    .iter()
                    .map(|doc_ids| VecDocSet::from(doc_ids.clone()))
                    .map(ConstScorer::new)
                    .collect::<Vec<_>>(),
            );
            while v.advance() {}
        });
    }
}

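These `#[bench]` functions required the unstable `test` feature, which is why the branch moves benchmarking to criterion (see the new `[[bench]]` targets and the `criterion = "0.2"` dev-dependency). A rough sketch of what a criterion port could look like; the crate-internal `Union`/`VecDocSet` plumbing is elided here and replaced by a plain summing loop as a stand-in workload:

use criterion::{black_box, criterion_group, criterion_main, Criterion};

// Hypothetical criterion port of bench_union_3_low; the real bench
// would drive Union::advance() over sampled doc-id sets instead.
fn bench_union(c: &mut Criterion) {
    let docsets: Vec<Vec<u32>> = vec![
        (0..100_000).step_by(100).collect(),
        (0..100_000).step_by(20).collect(),
    ];
    c.bench_function("union_3_low", move |b| {
        b.iter(|| {
            // Stand-in for exhausting the union of the doc sets.
            black_box(docsets.iter().flatten().map(|&d| d as u64).sum::<u64>())
        })
    });
}

criterion_group!(benches, bench_union);
criterion_main!(benches);
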
@@ -1,6 +1,7 @@
mod pool;

use self::pool::{LeasedItem, Pool};
pub use self::pool::LeasedItem;
use self::pool::Pool;
use crate::core::Segment;
use crate::directory::Directory;
use crate::directory::WatchHandle;

@@ -123,6 +123,10 @@ impl<T> Pool<T> {
    }
}

/// A LeasedItem holds an object borrowed from a Pool.
///
/// Upon drop, the object is automatically returned
/// into the pool.
pub struct LeasedItem<T> {
    gen_item: Option<GenerationItem<T>>,
    recycle_queue: Arc<Queue<GenerationItem<T>>>,

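In practice the `LeasedItem` most users meet is the pooled `Searcher` handed out by an `IndexReader`; dropping it returns the searcher to the pool. A small sketch against the 0.11-era API:

use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let reader = index.reader()?;
    {
        // `searcher` is a LeasedItem<Searcher> borrowed from the pool.
        let searcher = reader.searcher();
        assert_eq!(searcher.num_docs(), 0);
    } // dropped here: the searcher goes back into the pool
    Ok(())
}
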
@@ -120,9 +120,7 @@ impl Facet {

    /// Extract path from the `Facet`.
    pub fn to_path(&self) -> Vec<&str> {
        self.encoded_str()
            .split(|c| c == FACET_SEP_CHAR)
            .collect()
        self.encoded_str().split(|c| c == FACET_SEP_CHAR).collect()
    }
}

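`to_path` simply splits the encoded representation on the internal separator, so a facet round-trips into its steps; the facet path below is illustrative:

use tantivy::schema::Facet;

fn main() {
    let facet = Facet::from("/category/electronics");
    // The leading separator is dropped; each step is one element.
    assert_eq!(facet.to_path(), vec!["category", "electronics"]);
}
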
@@ -108,7 +108,9 @@ impl FieldEntry {
    /// Returns true iff the field is an int (signed or unsigned) fast field
    pub fn is_int_fast(&self) -> bool {
        match self.field_type {
            FieldType::U64(ref options) | FieldType::I64(ref options) | FieldType::F64(ref options) => options.is_fast(),
            FieldType::U64(ref options)
            | FieldType::I64(ref options)
            | FieldType::F64(ref options) => options.is_fast(),
            _ => false,
        }
    }

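With the new `F64` arm, an `f64` field declared with the fast flag now reports `is_int_fast() == true` as well. A sketch assuming the 0.11-era schema API (`FAST` flag, `add_f64_field` introduced with this release):

use tantivy::schema::{Schema, FAST};

fn main() {
    let mut schema_builder = Schema::builder();
    // An f64 fast field; the FAST flag is what is_int_fast() checks.
    schema_builder.add_f64_field("price", FAST);
    let schema = schema_builder.build();
    let field = schema.get_field("price").unwrap();
    assert!(schema.get_field_entry(field).is_int_fast());
}
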
@@ -83,9 +83,9 @@ impl FieldType {
    pub fn is_indexed(&self) -> bool {
        match *self {
            FieldType::Str(ref text_options) => text_options.get_indexing_options().is_some(),
            FieldType::U64(ref int_options) | FieldType::I64(ref int_options) | FieldType::F64(ref int_options) => {
                int_options.is_indexed()
            }
            FieldType::U64(ref int_options)
            | FieldType::I64(ref int_options)
            | FieldType::F64(ref int_options) => int_options.is_indexed(),
            FieldType::Date(ref date_options) => date_options.is_indexed(),
            FieldType::HierarchicalFacet => true,
            FieldType::Bytes => false,

@@ -125,9 +125,12 @@ impl FieldType {
        match *json {
            JsonValue::String(ref field_text) => match *self {
                FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
                FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => Err(
                    ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
                ),
                FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {
                    Err(ValueParsingError::TypeError(format!(
                        "Expected an integer, got {:?}",
                        json
                    )))
                }
                FieldType::HierarchicalFacet => Ok(Value::Facet(Facet::from(field_text))),
                FieldType::Bytes => decode(field_text).map(Value::Bytes).map_err(|_| {
                    ValueParsingError::InvalidBase64(format!(
@@ -152,7 +155,7 @@ impl FieldType {
                        let msg = format!("Expected a u64 int, got {:?}", json);
                        Err(ValueParsingError::OverflowError(msg))
                    }
                },
                }
                FieldType::F64(_) => {
                    if let Some(field_val_f64) = field_val_num.as_f64() {
                        Ok(Value::F64(field_val_f64))

@@ -360,13 +360,16 @@ impl<'de> Deserialize<'de> for Schema {

/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug)]
#[derive(Debug, Fail)]
pub enum DocParsingError {
    /// The payload given is not valid JSON.
    #[fail(display = "The provided string is not valid JSON")]
    NotJSON(String),
    /// One of the value nodes could not be parsed.
    #[fail(display = "The field '{:?}' could not be parsed: {:?}", _0, _1)]
    ValueError(String, ValueParsingError),
    /// The json-document contains a field that is not declared in the schema.
    #[fail(display = "The json-document contains an unknown field: {:?}", _0)]
    NoSuchFieldInSchema(String),
}

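Deriving `Fail` means these errors display cleanly wherever document parsing is driven through `Schema::parse_document`; an undeclared key, for instance, comes back as `NoSuchFieldInSchema`. A small sketch (field names are illustrative):

use tantivy::schema::{DocParsingError, Schema, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();
    // "author" is not declared in the schema.
    match schema.parse_document(r#"{"author": "whoever"}"#) {
        Err(DocParsingError::NoSuchFieldInSchema(field)) => {
            println!("unknown field: {}", field)
        }
        other => println!("unexpected: {:?}", other),
    }
}
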
@@ -2,7 +2,7 @@ use crate::schema::Facet;
use crate::DateTime;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::{fmt, cmp::Ordering};
use std::{cmp::Ordering, fmt};

/// Value represents the value of any field.
/// It is an enum over all of the possible field types.
@@ -27,7 +27,7 @@ pub enum Value {
impl Eq for Value {}
impl Ord for Value {
    fn cmp(&self, other: &Self) -> Ordering {
        match (self,other) {
        match (self, other) {
            (Value::Str(l), Value::Str(r)) => l.cmp(r),
            (Value::U64(l), Value::U64(r)) => l.cmp(r),
            (Value::I64(l), Value::I64(r)) => l.cmp(r),
@@ -35,7 +35,7 @@ impl Ord for Value {
            (Value::Facet(l), Value::Facet(r)) => l.cmp(r),
            (Value::Bytes(l), Value::Bytes(r)) => l.cmp(r),
            (Value::F64(l), Value::F64(r)) => {
                match (l.is_nan(),r.is_nan()) {
                match (l.is_nan(), r.is_nan()) {
                    (false, false) => l.partial_cmp(r).unwrap(), // only fail on NaN
                    (true, true) => Ordering::Equal,
                    (true, false) => Ordering::Less, // we define NaN as less than -∞

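These arms define a total order on `f64`: ordinary comparison when neither side is NaN, NaN equal to itself, and NaN below everything else; the symmetric `(false, true) => Ordering::Greater` arm presumably sits just past this hunk. The same rule as a standalone function:

use std::cmp::Ordering;

// Total order over f64 matching the arms above: NaN sorts below
// every other value and compares equal to itself.
fn total_cmp(l: f64, r: f64) -> Ordering {
    match (l.is_nan(), r.is_nan()) {
        (false, false) => l.partial_cmp(&r).unwrap(), // only fails on NaN
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Less,
        (false, true) => Ordering::Greater,
    }
}

fn main() {
    assert_eq!(total_cmp(f64::NAN, f64::NEG_INFINITY), Ordering::Less);
    assert_eq!(total_cmp(f64::NAN, f64::NAN), Ordering::Equal);
    assert_eq!(total_cmp(1.0, 2.0), Ordering::Less);
}
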
@@ -155,7 +155,7 @@ impl Value {
                Value::F64(ref value) => *value,
                _ => panic!("This is not a f64 field."),
            }
        }
    }

    /// Returns the Date-value, provided the value is of the `Date` type.
    ///
@@ -219,7 +219,7 @@ impl From<Vec<u8>> for Value {

mod binary_serialize {
    use super::Value;
    use crate::common::{BinarySerializable, f64_to_u64, u64_to_f64};
    use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
    use crate::schema::Facet;
    use chrono::{TimeZone, Utc};
    use std::io::{self, Read, Write};

@@ -8,7 +8,7 @@ use tantivy::{Index, Term};

#[test]
fn test_failpoints_managed_directory_gc_if_delete_fails() {
    let scenario = fail::FailScenario::setup();
    let _scenario = fail::FailScenario::setup();

    let test_path: &'static Path = Path::new("some_path_for_test");