Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-29 21:42:55 +00:00)

Compare commits: 0.12...elshize-bm (43 commits)
Commits in this range (SHA1):
6f4b24e30e, e12d6fe6f1, 6cad0e34ce, c3ccb8aa81, 61fc1e353a, ef77523145, a16497fb69,
c55db83609, 1e5ebdbf3c, 9a2090ab21, e4aaacdb86, 29acf1104d, c59dfd75e9, 2a41d62285,
3d34fa0b69, 77f363987a, c0be461191, a4f21691be, b86308ff19, 1fb562f44a, c591d0e591,
186d7fc20e, cfbdef5186, d04368b1d4, b167058028, 262957717b, 873a808321, 6fa8f9330e,
b3f0ef0878, 04304262ba, 920ced364a, e0499118e2, 50b5efae46, 486b8fa9c5, b2baed9bdd,
b591542c0b, a83fa00ac4, 7ff5c7c797, 1748602691, 6542dd5337, c64a44b9e1, fccc5b3bed,
98b9d5c6c4
CHANGELOG.md

@@ -1,3 +1,9 @@
Tantivy 0.13.0
======================
- Bugfix in `FuzzyTermQuery` not matching terms by prefix when it should (@Peachball)
- Relaxed constraints on the custom/tweak score functions. At the segment level, they can be mut, and they are not required to be Sync + Send.
- `MMapDirectory::open` does not return a `Result` anymore.

Tantivy 0.12.0
======================
- Removing static dispatch in tokenizers for simplicity. (#762)
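The relaxed score-function constraints are what allow a per-segment closure to own mutable scratch state. A minimal sketch of the idea, using the `TopDocs::tweak_score` API that also appears in the new example later in this diff; the dampening formula itself is made up purely for illustration:

```rust
use tantivy::collector::TopDocs;
use tantivy::{DocId, Score, SegmentReader};

fn main() {
    // The segment-level closure returned here captures `seen_docs` mutably,
    // which is permitted now that only FnMut (not Fn + Sync + Send) is required.
    let _collector = TopDocs::with_limit(10).tweak_score(move |_segment: &SegmentReader| {
        let mut seen_docs: u32 = 0; // mutable per-segment state
        move |_doc: DocId, original_score: Score| {
            seen_docs += 1;
            // Illustrative only: slightly dampen documents visited later.
            original_score * (1.0 / (1.0 + 0.001 * seen_docs as f32))
        }
    });
}
```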
Cargo.toml (21 changed lines)

@@ -5,7 +5,7 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
documentation = "https://docs.rs/tantivy/"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
readme = "README.md"
@@ -13,25 +13,23 @@ keywords = ["search", "information", "retrieval"]
edition = "2018"

[dependencies]
base64 = "0.11.0"
base64 = "0.12.0"
byteorder = "1.0"
crc32fast = "1.2.0"
once_cell = "1.0"
regex ={version = "1.3.0", default-features = false, features = ["std"]}
tantivy-fst = "0.2.1"
tantivy-fst = "0.3"
memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true}
snap = "1"
atomicwrites = {version="0.2.2", optional=true}
tempfile = "3.0"
log = "0.4"
serde = "1.0"
serde_derive = "1.0"
serde = {version="1.0", features=["derive"]}
serde_json = "1.0"
num_cpus = "1.2"
fs2={version="0.4", optional=true}
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
levenshtein_automata = "0.2"
notify = {version="4", optional=true}
uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.7"
@@ -40,18 +38,19 @@ owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1.2"
downcast-rs = { version="1.0" }
tantivy-query-grammar = { version="0.12", path="./query-grammar" }
tantivy-query-grammar = { version="0.13", path="./query-grammar" }
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
census = "0.4"
fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
htmlescape = "0.3.1"
fail = "0.3"
fail = "0.4"
murmurhash32 = "0.2"
chrono = "0.4"
smallvec = "1.0"
rayon = "1"
# ordered-float = "1"

[target.'cfg(windows)'.dependencies]
winapi = "0.3"
@@ -60,9 +59,11 @@ winapi = "0.3"
rand = "0.7"
maplit = "1"
matches = "0.1.8"
proptest = "0.9"
float-cmp = "0.6"

[dev-dependencies.fail]
version = "0.3"
version = "0.4"
features = ["failpoints"]

[profile.release]
README.md (19 changed lines)

@@ -31,16 +31,20 @@ Tantivy is, in fact, strongly inspired by Lucene's design.

# Benchmark

Tantivy is typically faster than Lucene, but the results depend on
the nature of the queries in your workload.

The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
performance for different types of queries / collections.

In general, Tantivy tends to be
- slower than Lucene on unions with a Top-K collector, due to Lucene's Block-WAND optimization.
- faster than Lucene on intersection and phrase queries.

Your mileage WILL vary depending on the nature of queries and their load.

# Features

- Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter))
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy) and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools
- BM25 scoring (the same as Lucene)
@@ -59,18 +63,17 @@ performance for different type of queries / collection.
- Configurable indexing (optional term frequency and position indexing)
- Cheesy logo with a horse

# Non-features
## Non-features

- Distributed search is out of the scope of Tantivy. That being said, Tantivy is a
library upon which one could build a distributed search engine. Serializable/mergeable collector state,
for instance, is within the scope of Tantivy.

# Supported OS and compiler

Tantivy works on stable Rust (>= 1.27) and supports Linux, MacOS, and Windows.

# Getting started

Tantivy works on stable Rust (>= 1.27) and supports Linux, MacOS, and Windows.

- [Tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli) - `tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
index documents, and search via the CLI or a small server with a REST API.
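For readers landing here without the linked examples, a minimal end-to-end indexing-and-search sketch. The schema/index/collector calls mirror the example file shipped in this change set; the `QueryParser` usage is assumed from tantivy's public API and is not part of this diff:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // Schema with one indexed + stored text field.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    // Index two documents in RAM.
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer(30_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.commit()?;

    // Parse a query against `title` and fetch the best hit.
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("sea")?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
    for (_score, doc_address) in top_docs {
        let retrieved = searcher.doc(doc_address)?;
        println!("{:?}", retrieved.get_first(title).and_then(|value| value.text()));
    }
    Ok(())
}
```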
@@ -18,5 +18,5 @@ install:
build: false

test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features mmap
- REM SET RUST_BACKTRACE=1 & cargo build --examples
examples/faceted_search_with_tweaked_score.rs (new file, 98 lines)

@@ -0,0 +1,98 @@
use std::collections::HashSet;
use tantivy::collector::TopDocs;
use tantivy::doc;
use tantivy::query::BooleanQuery;
use tantivy::schema::*;
use tantivy::{DocId, Index, Score, SegmentReader};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();

    let title = schema_builder.add_text_field("title", STORED);
    let ingredient = schema_builder.add_facet_field("ingredient");

    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(30_000_000)?;

    index_writer.add_document(doc!(
        title => "Fried egg",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/oil"),
    ));
    index_writer.add_document(doc!(
        title => "Scrambled egg",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/butter"),
        ingredient => Facet::from("/ingredient/milk"),
        ingredient => Facet::from("/ingredient/salt"),
    ));
    index_writer.add_document(doc!(
        title => "Egg rolls",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/garlic"),
        ingredient => Facet::from("/ingredient/salt"),
        ingredient => Facet::from("/ingredient/oil"),
        ingredient => Facet::from("/ingredient/tortilla-wrap"),
        ingredient => Facet::from("/ingredient/mushroom"),
    ));
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    {
        let facets = vec![
            Facet::from("/ingredient/egg"),
            Facet::from("/ingredient/oil"),
            Facet::from("/ingredient/garlic"),
            Facet::from("/ingredient/mushroom"),
        ];
        let query = BooleanQuery::new_multiterms_query(
            facets
                .iter()
                .map(|key| Term::from_facet(ingredient, &key))
                .collect(),
        );
        let top_docs_by_custom_score =
            TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
                let mut ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
                let facet_dict = ingredient_reader.facet_dict();

                let query_ords: HashSet<u64> = facets
                    .iter()
                    .filter_map(|key| facet_dict.term_ord(key.encoded_str()))
                    .collect();

                let mut facet_ords_buffer: Vec<u64> = Vec::with_capacity(20);

                move |doc: DocId, original_score: Score| {
                    ingredient_reader.facet_ords(doc, &mut facet_ords_buffer);
                    let missing_ingredients = facet_ords_buffer
                        .iter()
                        .filter(|ord| !query_ords.contains(ord))
                        .count();
                    let tweak = 1.0 / 4_f32.powi(missing_ingredients as i32);

                    original_score * tweak
                }
            });
        let top_docs = searcher.search(&query, &top_docs_by_custom_score)?;

        let titles: Vec<String> = top_docs
            .iter()
            .map(|(_, doc_id)| {
                searcher
                    .doc(*doc_id)
                    .unwrap()
                    .get_first(title)
                    .unwrap()
                    .text()
                    .unwrap()
                    .to_owned()
            })
            .collect();
        assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);
    }
    Ok(())
}
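The tweak in the example above penalizes each ingredient a recipe uses that is not part of the query:

```latex
\text{tweaked\_score} = \frac{\text{original\_score}}{4^{\,\text{missing\_ingredients}}}
```

For instance, "Scrambled egg" uses three ingredients outside the query (butter, milk, salt), so its score is divided by 4^3 = 64, whereas "Egg rolls" is only divided by 4^2 = 16 (salt and tortilla-wrap are missing); that is why the final assertion expects "Fried egg" and "Egg rolls" as the top two.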
@@ -9,11 +9,10 @@
// - import tokenized text straight from json,
// - perform a search on documents with pre-tokenized text

use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};

use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
(query-grammar/Cargo.toml)

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.12.0"
version = "0.13.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -13,4 +13,4 @@ keywords = ["search", "information", "retrieval"]
edition = "2018"

[dependencies]
combine = "4"
combine = {version="4", default-features=false, features=[] }
(query-grammar/src/lib.rs)

@@ -1,5 +1,3 @@
#![recursion_limit = "100"]

mod occur;
mod query_grammar;
mod user_input_ast;
@@ -1,215 +1,209 @@
|
||||
use super::user_input_ast::*;
|
||||
use super::user_input_ast::{UserInputAST, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use crate::Occur;
|
||||
use combine::error::StreamError;
|
||||
use combine::error::StringStreamError;
|
||||
use combine::parser::char::{char, digit, letter, space, spaces, string};
|
||||
use combine::stream::StreamErrorFor;
|
||||
use combine::parser::Parser;
|
||||
use combine::{
|
||||
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value, Stream,
|
||||
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
|
||||
};
|
||||
|
||||
parser! {
|
||||
fn field[I]()(I) -> String
|
||||
where [I: Stream<Token = char>] {
|
||||
(
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
).skip(char(':')).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn word[I]()(I) -> String
|
||||
where [I: Stream<Token = char>] {
|
||||
(
|
||||
satisfy(|c: char| !c.is_whitespace() && !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(',')'].contains(&c) ),
|
||||
many(satisfy(|c: char| !c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(',')'].contains(&c)))
|
||||
)
|
||||
fn field<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
(
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
)
|
||||
.skip(char(':'))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
.and_then(|s: String|
|
||||
match s.as_str() {
|
||||
"OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
|
||||
"AND" => Err(StreamErrorFor::<I>::unexpected_static_message("AND")),
|
||||
"NOT" => Err(StreamErrorFor::<I>::unexpected_static_message("NOT")),
|
||||
_ => Ok(s)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn literal[I]()(I) -> UserInputLeaf
|
||||
where [I: Stream<Token = char>]
|
||||
{
|
||||
let term_val = || {
|
||||
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
|
||||
phrase.or(word())
|
||||
};
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
let term_query =
|
||||
(field(), term_val_with_field)
|
||||
.map(|(field_name, phrase)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
});
|
||||
let term_default_field = term_val().map(|phrase| UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
});
|
||||
attempt(term_query)
|
||||
.or(term_default_field)
|
||||
.map(UserInputLeaf::from)
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn negative_number[I]()(I) -> String
|
||||
where [I: Stream<Token = char>]
|
||||
{
|
||||
(char('-'), many1(digit()),
|
||||
optional((char('.'), many1(digit()))))
|
||||
.map(|(s1, s2, s3): (char, String, Option<(char, String)>)| {
|
||||
if let Some(('.', s3)) = s3 {
|
||||
format!("{}{}.{}", s1, s2, s3)
|
||||
} else {
|
||||
format!("{}{}", s1, s2)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn spaces1[I]()(I) -> ()
|
||||
where [I: Stream<Token = char>] {
|
||||
skip_many1(space())
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
/// Function that parses a range out of a Stream
|
||||
/// Supports ranges like:
|
||||
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
|
||||
/// [a TO *], [a TO c], [abc TO bcd}
|
||||
fn range[I]()(I) -> UserInputLeaf
|
||||
where [I: Stream<Token = char>] {
|
||||
let range_term_val = || {
|
||||
word().or(negative_number()).or(char('*').with(value("*".to_string())))
|
||||
};
|
||||
|
||||
// check for unbounded range in the form of <5, <=10, >5, >=5
|
||||
let elastic_unbounded_range = (choice([attempt(string(">=")),
|
||||
attempt(string("<=")),
|
||||
attempt(string("<")),
|
||||
attempt(string(">"))])
|
||||
.skip(spaces()),
|
||||
range_term_val()).
|
||||
map(|(comparison_sign, bound): (&str, String)|
|
||||
match comparison_sign {
|
||||
">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
|
||||
"<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)),
|
||||
"<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)),
|
||||
">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded),
|
||||
// default case
|
||||
_ => (UserInputBound::Unbounded, UserInputBound::Unbounded)
|
||||
});
|
||||
let lower_bound = (one_of("{[".chars()), range_term_val())
|
||||
.map(|(boundary_char, lower_bound): (char, String)|
|
||||
if lower_bound == "*" {
|
||||
UserInputBound::Unbounded
|
||||
} else if boundary_char == '{' {
|
||||
UserInputBound::Exclusive(lower_bound)
|
||||
} else {
|
||||
UserInputBound::Inclusive(lower_bound)
|
||||
});
|
||||
let upper_bound = (range_term_val(), one_of("}]".chars()))
|
||||
.map(|(higher_bound, boundary_char): (String, char)|
|
||||
if higher_bound == "*" {
|
||||
UserInputBound::Unbounded
|
||||
} else if boundary_char == '}' {
|
||||
UserInputBound::Exclusive(higher_bound)
|
||||
} else {
|
||||
UserInputBound::Inclusive(higher_bound)
|
||||
});
|
||||
// return only lower and upper
|
||||
let lower_to_upper = (lower_bound.
|
||||
skip((spaces(),
|
||||
string("TO"),
|
||||
spaces())),
|
||||
upper_bound);
|
||||
|
||||
(optional(field()).skip(spaces()),
|
||||
// try elastic first, if it matches, the range is unbounded
|
||||
attempt(elastic_unbounded_range).or(lower_to_upper))
|
||||
.map(|(field, (lower, upper))|
|
||||
// Construct the leaf from extracted field (optional)
|
||||
// and bounds
|
||||
UserInputLeaf::Range {
|
||||
field,
|
||||
lower,
|
||||
upper
|
||||
fn word<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
(
|
||||
satisfy(|c: char| {
|
||||
!c.is_whitespace()
|
||||
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
|
||||
}),
|
||||
many(satisfy(|c: char| {
|
||||
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
|
||||
})),
|
||||
)
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
.and_then(|s: String| match s.as_str() {
|
||||
"OR" | "AND " | "NOT" => Err(StringStreamError::UnexpectedParse),
|
||||
_ => Ok(s),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
|
||||
phrase.or(word())
|
||||
}
|
||||
|
||||
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
(field(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
})
|
||||
}
|
||||
|
||||
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
let term_default_field = term_val().map(|phrase| UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
});
|
||||
attempt(term_query())
|
||||
.or(term_default_field)
|
||||
.map(UserInputLeaf::from)
|
||||
}
|
||||
|
||||
fn negative_number<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
(
|
||||
char('-'),
|
||||
many1(digit()),
|
||||
optional((char('.'), many1(digit()))),
|
||||
)
|
||||
.map(|(s1, s2, s3): (char, String, Option<(char, String)>)| {
|
||||
if let Some(('.', s3)) = s3 {
|
||||
format!("{}{}.{}", s1, s2, s3)
|
||||
} else {
|
||||
format!("{}{}", s1, s2)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn spaces1<'a>() -> impl Parser<&'a str, Output = ()> {
|
||||
skip_many1(space())
|
||||
}
|
||||
|
||||
/// Function that parses a range out of a Stream
|
||||
/// Supports ranges like:
|
||||
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
|
||||
/// [a TO *], [a TO c], [abc TO bcd}
|
||||
fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
let range_term_val = || {
|
||||
word()
|
||||
.or(negative_number())
|
||||
.or(char('*').with(value("*".to_string())))
|
||||
};
|
||||
|
||||
// check for unbounded range in the form of <5, <=10, >5, >=5
|
||||
let elastic_unbounded_range = (
|
||||
choice([
|
||||
attempt(string(">=")),
|
||||
attempt(string("<=")),
|
||||
attempt(string("<")),
|
||||
attempt(string(">")),
|
||||
])
|
||||
.skip(spaces()),
|
||||
range_term_val(),
|
||||
)
|
||||
.map(
|
||||
|(comparison_sign, bound): (&str, String)| match comparison_sign {
|
||||
">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
|
||||
"<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)),
|
||||
"<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)),
|
||||
">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded),
|
||||
// default case
|
||||
_ => (UserInputBound::Unbounded, UserInputBound::Unbounded),
|
||||
},
|
||||
);
|
||||
let lower_bound = (one_of("{[".chars()), range_term_val()).map(
|
||||
|(boundary_char, lower_bound): (char, String)| {
|
||||
if lower_bound == "*" {
|
||||
UserInputBound::Unbounded
|
||||
} else if boundary_char == '{' {
|
||||
UserInputBound::Exclusive(lower_bound)
|
||||
} else {
|
||||
UserInputBound::Inclusive(lower_bound)
|
||||
}
|
||||
},
|
||||
);
|
||||
let upper_bound = (range_term_val(), one_of("}]".chars())).map(
|
||||
|(higher_bound, boundary_char): (String, char)| {
|
||||
if higher_bound == "*" {
|
||||
UserInputBound::Unbounded
|
||||
} else if boundary_char == '}' {
|
||||
UserInputBound::Exclusive(higher_bound)
|
||||
} else {
|
||||
UserInputBound::Inclusive(higher_bound)
|
||||
}
|
||||
},
|
||||
);
|
||||
// return only lower and upper
|
||||
let lower_to_upper = (
|
||||
lower_bound.skip((spaces(), string("TO"), spaces())),
|
||||
upper_bound,
|
||||
);
|
||||
|
||||
(
|
||||
optional(field()).skip(spaces()),
|
||||
// try elastic first, if it matches, the range is unbounded
|
||||
attempt(elastic_unbounded_range).or(lower_to_upper),
|
||||
)
|
||||
.map(|(field, (lower, upper))|
|
||||
// Construct the leaf from extracted field (optional)
|
||||
// and bounds
|
||||
UserInputLeaf::Range {
|
||||
field,
|
||||
lower,
|
||||
upper
|
||||
})
|
||||
}
|
||||
|
||||
fn negate(expr: UserInputAST) -> UserInputAST {
|
||||
expr.unary(Occur::MustNot)
|
||||
}
|
||||
|
||||
fn must(expr: UserInputAST) -> UserInputAST {
|
||||
expr.unary(Occur::Must)
|
||||
fn leaf<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
parser(|input| {
|
||||
char('(')
|
||||
.with(ast())
|
||||
.skip(char(')'))
|
||||
.or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All)))
|
||||
.or(attempt(
|
||||
string("NOT").skip(spaces1()).with(leaf()).map(negate),
|
||||
))
|
||||
.or(attempt(range().map(UserInputAST::from)))
|
||||
.or(literal().map(UserInputAST::from))
|
||||
.parse_stream(input)
|
||||
.into_result()
|
||||
})
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn leaf[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Token = char>] {
|
||||
char('-').with(leaf()).map(negate)
|
||||
.or(char('+').with(leaf()).map(must))
|
||||
.or(char('(').with(ast()).skip(char(')')))
|
||||
.or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All)))
|
||||
.or(attempt(string("NOT").skip(spaces1()).with(leaf()).map(negate)))
|
||||
.or(attempt(range().map(UserInputAST::from)))
|
||||
.or(literal().map(UserInputAST::from))
|
||||
}
|
||||
fn occur_symbol<'a>() -> impl Parser<&'a str, Output = Occur> {
|
||||
char('-')
|
||||
.map(|_| Occur::MustNot)
|
||||
.or(char('+').map(|_| Occur::Must))
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn positive_float_number[I]()(I) -> f32
|
||||
where [I: Stream<Token = char>] {
|
||||
(
|
||||
many1(digit()),
|
||||
optional(
|
||||
(char('.'), many1(digit()))
|
||||
)
|
||||
)
|
||||
.map(|(int_part, decimal_part_opt): (String, Option<(char, String)>)| {
|
||||
fn occur_leaf<'a>() -> impl Parser<&'a str, Output = (Option<Occur>, UserInputAST)> {
|
||||
(optional(occur_symbol()), boosted_leaf())
|
||||
}
|
||||
|
||||
fn positive_float_number<'a>() -> impl Parser<&'a str, Output = f32> {
|
||||
(many1(digit()), optional((char('.'), many1(digit())))).map(
|
||||
|(int_part, decimal_part_opt): (String, Option<(char, String)>)| {
|
||||
let mut float_str = int_part;
|
||||
if let Some((chr, decimal_str)) = decimal_part_opt {
|
||||
float_str.push(chr);
|
||||
float_str.push_str(&decimal_str);
|
||||
}
|
||||
float_str.parse::<f32>().unwrap()
|
||||
})
|
||||
}
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn boost[I]()(I) -> f32
|
||||
where [I: Stream<Token = char>] {
|
||||
(char('^'), positive_float_number())
|
||||
.map(|(_, boost)| boost)
|
||||
}
|
||||
fn boost<'a>() -> impl Parser<&'a str, Output = f32> {
|
||||
(char('^'), positive_float_number()).map(|(_, boost)| boost)
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn boosted_leaf[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Token = char>] {
|
||||
(leaf(), optional(boost()))
|
||||
.map(|(leaf, boost_opt)|
|
||||
match boost_opt {
|
||||
Some(boost) if (boost - 1.0).abs() > std::f32::EPSILON =>
|
||||
UserInputAST::Boost(Box::new(leaf), boost),
|
||||
_ => leaf
|
||||
}
|
||||
)
|
||||
}
|
||||
fn boosted_leaf<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
(leaf(), optional(boost())).map(|(leaf, boost_opt)| match boost_opt {
|
||||
Some(boost) if (boost - 1.0).abs() > std::f32::EPSILON => {
|
||||
UserInputAST::Boost(Box::new(leaf), boost)
|
||||
}
|
||||
_ => leaf,
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
@@ -218,13 +212,10 @@ enum BinaryOperand {
|
||||
And,
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn binary_operand[I]()(I) -> BinaryOperand
|
||||
where [I: Stream<Token = char>]
|
||||
{
|
||||
string("AND").with(value(BinaryOperand::And))
|
||||
.or(string("OR").with(value(BinaryOperand::Or)))
|
||||
}
|
||||
fn binary_operand<'a>() -> impl Parser<&'a str, Output = BinaryOperand> {
|
||||
string("AND")
|
||||
.with(value(BinaryOperand::And))
|
||||
.or(string("OR").with(value(BinaryOperand::Or)))
|
||||
}
|
||||
|
||||
fn aggregate_binary_expressions(
|
||||
@@ -252,31 +243,37 @@ fn aggregate_binary_expressions(
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
pub fn ast[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Token = char>]
|
||||
{
|
||||
let operand_leaf = (binary_operand().skip(spaces()), boosted_leaf().skip(spaces()));
|
||||
let boolean_expr = (boosted_leaf().skip(spaces().silent()), many1(operand_leaf)).map(
|
||||
|(left, right)| aggregate_binary_expressions(left,right));
|
||||
let whitespace_separated_leaves = many1(boosted_leaf().skip(spaces().silent()))
|
||||
.map(|subqueries: Vec<UserInputAST>|
|
||||
if subqueries.len() == 1 {
|
||||
subqueries.into_iter().next().unwrap()
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().collect())
|
||||
});
|
||||
let expr = attempt(boolean_expr).or(whitespace_separated_leaves);
|
||||
spaces().with(expr).skip(spaces())
|
||||
}
|
||||
fn operand_leaf<'a>() -> impl Parser<&'a str, Output = (BinaryOperand, UserInputAST)> {
|
||||
(
|
||||
binary_operand().skip(spaces()),
|
||||
boosted_leaf().skip(spaces()),
|
||||
)
|
||||
}
|
||||
|
||||
parser! {
|
||||
pub fn parse_to_ast[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Token = char>]
|
||||
{
|
||||
spaces().with(optional(ast()).skip(eof())).map(|opt_ast| opt_ast.unwrap_or_else(UserInputAST::empty_query))
|
||||
}
|
||||
pub fn ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
let boolean_expr = (boosted_leaf().skip(spaces()), many1(operand_leaf()))
|
||||
.map(|(left, right)| aggregate_binary_expressions(left, right));
|
||||
let whitespace_separated_leaves = many1(occur_leaf().skip(spaces().silent())).map(
|
||||
|subqueries: Vec<(Option<Occur>, UserInputAST)>| {
|
||||
if subqueries.len() == 1 {
|
||||
let (occur_opt, ast) = subqueries.into_iter().next().unwrap();
|
||||
match occur_opt.unwrap_or(Occur::Should) {
|
||||
Occur::Must | Occur::Should => ast,
|
||||
Occur::MustNot => UserInputAST::Clause(vec![(Some(Occur::MustNot), ast)]),
|
||||
}
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().collect())
|
||||
}
|
||||
},
|
||||
);
|
||||
let expr = attempt(boolean_expr).or(whitespace_separated_leaves);
|
||||
spaces().with(expr).skip(spaces())
|
||||
}
|
||||
|
||||
pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
spaces()
|
||||
.with(optional(ast()).skip(eof()))
|
||||
.map(|opt_ast| opt_ast.unwrap_or_else(UserInputAST::empty_query))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -298,6 +295,12 @@ mod test {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_occur_symbol() {
|
||||
assert_eq!(super::occur_symbol().parse("-"), Ok((Occur::MustNot, "")));
|
||||
assert_eq!(super::occur_symbol().parse("+"), Ok((Occur::Must, "")));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_positive_float_number() {
|
||||
fn valid_parse(float_str: &str, expected_val: f32, expected_remaining: &str) {
|
||||
@@ -345,7 +348,7 @@ mod test {
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
|
||||
test_parse_query_to_ast_helper("NOT a", "-(\"a\")");
|
||||
test_parse_query_to_ast_helper("NOT a", "(-\"a\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -353,16 +356,16 @@ mod test {
|
||||
assert!(parse_to_ast().parse("a^2^3").is_err());
|
||||
assert!(parse_to_ast().parse("a^2^").is_err());
|
||||
test_parse_query_to_ast_helper("a^3", "(\"a\")^3");
|
||||
test_parse_query_to_ast_helper("a^3 b^2", "((\"a\")^3 (\"b\")^2)");
|
||||
test_parse_query_to_ast_helper("a^3 b^2", "(*(\"a\")^3 *(\"b\")^2)");
|
||||
test_parse_query_to_ast_helper("a^1", "\"a\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast_binary_op() {
|
||||
test_parse_query_to_ast_helper("a AND b", "(+(\"a\") +(\"b\"))");
|
||||
test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))");
|
||||
test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))");
|
||||
test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))");
|
||||
test_parse_query_to_ast_helper("a AND b", "(+\"a\" +\"b\")");
|
||||
test_parse_query_to_ast_helper("a OR b", "(?\"a\" ?\"b\")");
|
||||
test_parse_query_to_ast_helper("a OR b AND c", "(?\"a\" ?(+\"b\" +\"c\"))");
|
||||
test_parse_query_to_ast_helper("a AND b AND c", "(+\"a\" +\"b\" +\"c\")");
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("a OR b aaa")),
|
||||
"Err(UnexpectedParse)"
|
||||
@@ -400,6 +403,13 @@ mod test {
|
||||
test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_occur_leaf() {
|
||||
let ((occur, ast), _) = super::occur_leaf().parse("+abc").unwrap();
|
||||
assert_eq!(occur, Some(Occur::Must));
|
||||
assert_eq!(format!("{:?}", ast), "\"abc\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_parser() {
|
||||
// testing the range() parser separately
|
||||
@@ -428,32 +438,67 @@ mod test {
|
||||
fn test_parse_query_to_triming_spaces() {
|
||||
test_parse_query_to_ast_helper(" abc", "\"abc\"");
|
||||
test_parse_query_to_ast_helper("abc ", "\"abc\"");
|
||||
test_parse_query_to_ast_helper("( a OR abc)", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("a OR abc ", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("(a OR abc )", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("(a OR abc) ", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("( a OR abc)", "(?\"a\" ?\"abc\")");
|
||||
test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
|
||||
test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
|
||||
test_parse_query_to_ast_helper("a OR abc ", "(?\"a\" ?\"abc\")");
|
||||
test_parse_query_to_ast_helper("(a OR abc )", "(?\"a\" ?\"abc\")");
|
||||
test_parse_query_to_ast_helper("(a OR abc) ", "(?\"a\" ?\"abc\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast() {
|
||||
fn test_parse_query_single_term() {
|
||||
test_parse_query_to_ast_helper("abc", "\"abc\"");
|
||||
test_parse_query_to_ast_helper("a b", "(\"a\" \"b\")");
|
||||
test_parse_query_to_ast_helper("+(a b)", "+((\"a\" \"b\"))");
|
||||
test_parse_query_to_ast_helper("+d", "+(\"d\")");
|
||||
test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))");
|
||||
test_parse_query_to_ast_helper("(+a +b) d", "((+(\"a\") +(\"b\")) \"d\")");
|
||||
test_parse_query_to_ast_helper("(+a)", "+(\"a\")");
|
||||
test_parse_query_to_ast_helper("(+a +b)", "(+(\"a\") +(\"b\"))");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_default_clause() {
|
||||
test_parse_query_to_ast_helper("a b", "(*\"a\" *\"b\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_must_default_clause() {
|
||||
test_parse_query_to_ast_helper("+(a b)", "(*\"a\" *\"b\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_must_single_term() {
|
||||
test_parse_query_to_ast_helper("+d", "\"d\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_term_with_field() {
|
||||
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_term_with_float() {
|
||||
test_parse_query_to_ast_helper("abc:1.1", "abc:\"1.1\"");
|
||||
test_parse_query_to_ast_helper("+abc:toto", "+(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+(abc:\"toto\") -(\"titi\"))");
|
||||
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_must_clause() {
|
||||
test_parse_query_to_ast_helper("(+a +b)", "(+\"a\" +\"b\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_test_query_plus_a_b_plus_d() {
|
||||
test_parse_query_to_ast_helper("+(a b) +d", "(+(*\"a\" *\"b\") +\"d\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_test_query_other() {
|
||||
test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
|
||||
test_parse_query_to_ast_helper("+abc:toto", "abc:\"toto\"");
|
||||
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+abc:\"toto\" -\"titi\")");
|
||||
test_parse_query_to_ast_helper("-abc:toto", "(-abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(*abc:\"a\" *\"b\")");
|
||||
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_with_range() {
|
||||
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
|
||||
|
||||
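The rewritten grammar keeps the behaviour exercised by the unit tests above. As a quick illustration of how a query string maps onto the new `(Option<Occur>, UserInputAST)` clause shape, here is a sketch of an extra test written in the same style as the existing ones; the expected strings follow the Debug formatting defined in user_input_ast.rs (`+`, `-`, `?` for an explicit occur, `*` when none was given):

```rust
#[test]
fn test_parse_query_sketch() {
    // A fielded term parses to a single leaf, exactly like `abc:toto` above.
    test_parse_query_to_ast_helper("title:diary", "title:\"diary\"");
    // A negated fielded term becomes a one-element clause carrying MustNot.
    test_parse_query_to_ast_helper("-title:diary", "(-title:\"diary\")");
    // Boosted leaves keep the `(..)^boost` notation, with `*` for the default occur.
    test_parse_query_to_ast_helper("title:diary^2 ocean", "(*(title:\"diary\")^2 *\"ocean\")");
}
```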
(query-grammar/src/user_input_ast.rs)

@@ -85,15 +85,14 @@ impl UserInputBound {
}

pub enum UserInputAST {
    Clause(Vec<UserInputAST>),
    Unary(Occur, Box<UserInputAST>),
    Clause(Vec<(Option<Occur>, UserInputAST)>),
    Leaf(Box<UserInputLeaf>),
    Boost(Box<UserInputAST>, f32),
}

impl UserInputAST {
    pub fn unary(self, occur: Occur) -> UserInputAST {
        UserInputAST::Unary(occur, Box::new(self))
        UserInputAST::Clause(vec![(Some(occur), self)])
    }

    fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
@@ -104,7 +103,7 @@ impl UserInputAST {
        } else {
            UserInputAST::Clause(
                asts.into_iter()
                    .map(|ast: UserInputAST| ast.unary(occur))
                    .map(|ast: UserInputAST| (Some(occur), ast))
                    .collect::<Vec<_>>(),
            )
        }
@@ -135,25 +134,36 @@ impl From<UserInputLeaf> for UserInputAST {
    }
}

fn print_occur_ast(
    occur_opt: Option<Occur>,
    ast: &UserInputAST,
    formatter: &mut fmt::Formatter,
) -> fmt::Result {
    if let Some(occur) = occur_opt {
        write!(formatter, "{}{:?}", occur, ast)?;
    } else {
        write!(formatter, "*{:?}", ast)?;
    }
    Ok(())
}

impl fmt::Debug for UserInputAST {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            UserInputAST::Clause(ref subqueries) => {
                if subqueries.is_empty() {
                    write!(formatter, "<emptyclause>")?;
                } else {
                    write!(formatter, "(")?;
                    write!(formatter, "{:?}", &subqueries[0])?;
                    print_occur_ast(subqueries[0].0, &subqueries[0].1, formatter)?;
                    for subquery in &subqueries[1..] {
                        write!(formatter, " {:?}", subquery)?;
                        write!(formatter, " ")?;
                        print_occur_ast(subquery.0, &subquery.1, formatter)?;
                    }
                    write!(formatter, ")")?;
                }
                Ok(())
            }
            UserInputAST::Unary(ref occur, ref subquery) => {
                write!(formatter, "{}({:?})", occur, subquery)
            }
            UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
            UserInputAST::Boost(ref leaf, boost) => write!(formatter, "({:?})^{}", leaf, boost),
        }
(src/collector/custom_score_top_collector.rs)

@@ -28,7 +28,7 @@
/// It is the segment local version of the [`CustomScorer`](./trait.CustomScorer.html).
pub trait CustomSegmentScorer<TScore>: 'static {
    /// Computes the score of a specific `doc`.
    fn score(&self, doc: DocId) -> TScore;
    fn score(&mut self, doc: DocId) -> TScore;
}

/// `CustomScorer` makes it possible to define any kind of score.
@@ -117,9 +117,9 @@ where

impl<F, TScore> CustomSegmentScorer<TScore> for F
where
    F: 'static + Sync + Send + Fn(DocId) -> TScore,
    F: 'static + FnMut(DocId) -> TScore,
{
    fn score(&self, doc: DocId) -> TScore {
    fn score(&mut self, doc: DocId) -> TScore {
        (self)(doc)
    }
}
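With the `&mut self` signature above, a `CustomSegmentScorer` can keep per-segment mutable state between calls. A small sketch; the struct and the counter are illustrative and not part of the diff:

```rust
use tantivy::collector::CustomSegmentScorer;
use tantivy::DocId;

/// Illustrative only: a scorer that counts how many documents it has scored
/// in this segment and folds that counter into the score.
struct CountingScorer {
    scored_so_far: u64,
}

impl CustomSegmentScorer<u64> for CountingScorer {
    fn score(&mut self, doc: DocId) -> u64 {
        // `&mut self` (new in 0.13) lets the scorer update its own state.
        self.scored_so_far += 1;
        u64::from(doc) + self.scored_so_far
    }
}
```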
(src/collector/mod.rs)

@@ -84,7 +84,7 @@ See the `custom_collector` example.

*/

use crate::DocId;
use crate::{DocId, Searcher, Executor};
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentReader;
@@ -100,6 +100,9 @@ mod top_collector;

mod top_score_collector;
pub use self::top_score_collector::TopDocs;
#[cfg(test)]
pub(crate) use self::top_score_collector::TopScoreSegmentCollector;

mod custom_score_top_collector;
pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
@@ -109,6 +112,9 @@ pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};

mod facet_collector;
pub use self::facet_collector::FacetCollector;
use crate::fastfield::DeleteBitSet;
use crate::query::{Scorer, Weight};
use std::borrow::BorrowMut;

/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
@@ -116,6 +122,8 @@ pub trait Fruit: Send + downcast_rs::Downcast {}

impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}

/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
///
@@ -154,6 +162,20 @@ pub trait Collector: Sync {
    /// Combines the fruit associated to the collection of each segments
    /// into one fruit.
    fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> crate::Result<Self::Fruit>;

    fn collect_weight(&self, searcher: &Searcher, weight: &dyn Weight, executor: &Executor) -> crate::Result<Self::Fruit> {
        let segment_readers = searcher.segment_readers();
        let fruits = executor.map(
            |(segment_ord, segment_reader)| {
                let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
                let segment_collector =
                    self.for_segment(segment_ord as u32, segment_reader)?;
                Ok(segment_collector.collect_scorer(scorer.borrow_mut(), segment_reader.delete_bitset()))
            },
            segment_readers.iter().enumerate(),
        )?;
        self.merge_fruits(fruits)
    }
}

/// The `SegmentCollector` is the trait in charge of defining the
@@ -161,7 +183,7 @@ pub trait Collector: Sync {
///
/// `.collect(doc, score)` will be called for every documents
/// matching the query.
pub trait SegmentCollector: 'static {
pub trait SegmentCollector: 'static + Sized {
    /// `Fruit` is the type for the result of our collection.
    /// e.g. `usize` for the `Count` collector.
    type Fruit: Fruit;
@@ -171,6 +193,19 @@ pub trait SegmentCollector: 'static {

    /// Extract the fruit of the collection from the `SegmentCollector`.
    fn harvest(self) -> Self::Fruit;

    fn collect_scorer(mut self, scorer: &mut dyn Scorer, delete_bitset: Option<&DeleteBitSet>) -> Self::Fruit {
        if let Some(delete_bitset) = delete_bitset {
            scorer.for_each(&mut |doc, score| {
                if delete_bitset.is_alive(doc) {
                    self.collect(doc, score);
                }
            });
        } else {
            scorer.for_each(&mut |doc, score| self.collect(doc, score));
        }
        self.harvest()
    }
}

// -----------------------------------------------
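Because deletion handling now lives in the default `collect_scorer`, a custom collector only has to implement the per-segment `collect`/`harvest` pair plus the merge step. A sketch of a minimal document-counting collector against the 0.13 trait shape; the `Child` associated type and `requires_scoring` method are assumed from the public `Collector` trait, which this hunk does not show in full:

```rust
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::{DocId, Score, SegmentLocalId, SegmentReader};

/// Illustrative collector that simply counts matching documents.
struct DocCount;
struct SegmentDocCount(usize);

impl Collector for DocCount {
    type Fruit = usize;
    type Child = SegmentDocCount;

    fn for_segment(&self, _ord: SegmentLocalId, _reader: &SegmentReader) -> tantivy::Result<SegmentDocCount> {
        Ok(SegmentDocCount(0))
    }

    fn requires_scoring(&self) -> bool {
        false
    }

    fn merge_fruits(&self, counts: Vec<usize>) -> tantivy::Result<usize> {
        Ok(counts.into_iter().sum())
    }
}

impl SegmentCollector for SegmentDocCount {
    type Fruit = usize;

    fn collect(&mut self, _doc: DocId, _score: Score) {
        self.0 += 1;
    }

    // Deleted documents are already filtered out by the default `collect_scorer`.
    fn harvest(self) -> usize {
        self.0
    }
}
```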
@@ -56,7 +56,7 @@ impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
|
||||
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
|
||||
|
||||
pub(crate) struct TopCollector<T> {
|
||||
limit: usize,
|
||||
pub limit: usize,
|
||||
_marker: PhantomData<T>,
|
||||
}
|
||||
|
||||
@@ -69,9 +69,7 @@ where
|
||||
/// # Panics
|
||||
/// The method panics if limit is 0
|
||||
pub fn with_limit(limit: usize) -> TopCollector<T> {
|
||||
if limit < 1 {
|
||||
panic!("Limit must be strictly greater than 0.");
|
||||
}
|
||||
assert!(limit > 0, "Limit must be strictly greater than 0.");
|
||||
TopCollector {
|
||||
limit,
|
||||
_marker: PhantomData,
|
||||
@@ -124,13 +122,13 @@ where
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
pub(crate) struct TopSegmentCollector<T> {
|
||||
limit: usize,
|
||||
pub limit: usize,
|
||||
heap: BinaryHeap<ComparableDoc<T, DocId>>,
|
||||
segment_id: u32,
|
||||
}
|
||||
|
||||
impl<T: PartialOrd> TopSegmentCollector<T> {
|
||||
fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
|
||||
pub fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
|
||||
TopSegmentCollector {
|
||||
limit,
|
||||
heap: BinaryHeap::with_capacity(limit),
|
||||
@@ -161,6 +159,10 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
self.heap.len() >= self.limit
|
||||
}
|
||||
|
||||
pub fn pruning_score(&self) -> Option<T> {
|
||||
self.heap.peek().map(|head| head.feature.clone())
|
||||
}
|
||||
|
||||
/// Collects a document scored by the given feature
|
||||
///
|
||||
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
|
||||
|
||||
@@ -8,12 +8,13 @@ use crate::collector::{
|
||||
};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Field;
|
||||
use crate::DocAddress;
|
||||
use crate::{DocAddress, Executor, Searcher};
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
use std::fmt;
|
||||
use crate::query::{Weight, PruningScorerIfPossible};
|
||||
|
||||
/// The `TopDocs` collector keeps track of the top `K` documents
|
||||
/// sorted by their score.
|
||||
@@ -66,7 +67,7 @@ struct ScorerByFastFieldReader {
|
||||
}
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
fn score(&self, doc: DocId) -> u64 {
|
||||
fn score(&mut self, doc: DocId) -> u64 {
|
||||
self.ff_reader.get_u64(u64::from(doc))
|
||||
}
|
||||
}
|
||||
@@ -417,6 +418,42 @@ impl Collector for TopDocs {
|
||||
true
|
||||
}
|
||||
|
||||
fn collect_weight(&self, searcher: &Searcher, weight: &dyn Weight, executor: &Executor) -> crate::Result<Self::Fruit> {
|
||||
let segment_readers = searcher.segment_readers();
|
||||
let fruits = executor.map(
|
||||
|(segment_ord, segment_reader)| {
|
||||
match weight.pruning_scorer(segment_reader, 1.0f32)? {
|
||||
PruningScorerIfPossible::NonPruning(mut scorer) => {
|
||||
let segment_collector =
|
||||
self.for_segment(segment_ord as u32, segment_reader)?;
|
||||
let fruit =
|
||||
segment_collector.collect_scorer(scorer.as_mut(), segment_reader.delete_bitset());
|
||||
Ok(fruit)
|
||||
}
|
||||
PruningScorerIfPossible::Pruning(mut pruning_scorer) => {
|
||||
let limit = self.0.limit;
|
||||
let mut segment_collector =
|
||||
self.for_segment(segment_ord as u32, segment_reader)?;
|
||||
for _ in 0..limit {
|
||||
if !pruning_scorer.advance() {
|
||||
return Ok(segment_collector.harvest());
|
||||
}
|
||||
segment_collector.collect(pruning_scorer.doc(), pruning_scorer.score());
|
||||
}
|
||||
let mut pruning_score = segment_collector.0.pruning_score().unwrap_or(0.0f32);
|
||||
while pruning_scorer.advance_with_pruning(pruning_score) {
|
||||
segment_collector.0.collect(pruning_scorer.doc(), pruning_scorer.score());
|
||||
pruning_score = segment_collector.0.pruning_score().unwrap_or(0.0f32);
|
||||
}
|
||||
Ok(segment_collector.harvest())
|
||||
}
|
||||
}
|
||||
},
|
||||
segment_readers.iter().enumerate(),
|
||||
)?;
|
||||
self.merge_fruits(fruits)
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
child_fruits: Vec<Vec<(Score, DocAddress)>>,
|
||||
@@ -428,6 +465,12 @@ impl Collector for TopDocs {
|
||||
/// Segment Collector associated to `TopDocs`.
|
||||
pub struct TopScoreSegmentCollector(TopSegmentCollector<Score>);
|
||||
|
||||
impl TopScoreSegmentCollector {
|
||||
pub fn new(segment_id: SegmentLocalId, limit: usize) -> Self {
|
||||
TopScoreSegmentCollector(TopSegmentCollector::new(segment_id, limit))
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for TopScoreSegmentCollector {
|
||||
type Fruit = Vec<(Score, DocAddress)>;
|
||||
|
||||
@@ -450,7 +493,6 @@ mod tests {
|
||||
use crate::Index;
|
||||
use crate::IndexWriter;
|
||||
use crate::Score;
|
||||
use itertools::Itertools;
|
||||
|
||||
fn make_index() -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -524,8 +566,8 @@ mod tests {
|
||||
|
||||
// precondition for the test to be meaningful: we did get documents
|
||||
// with the same score
|
||||
assert!(page_1.iter().map(|result| result.0).all_equal());
|
||||
assert!(page_2.iter().map(|result| result.0).all_equal());
|
||||
assert!(page_1.iter().all(|result| result.0 == page_1[0].0));
|
||||
assert!(page_2.iter().all(|result| result.0 == page_2[0].0));
|
||||
|
||||
// sanity check since we're relying on make_index()
|
||||
assert_eq!(page_1.len(), 2);
|
||||
|
||||
@@ -29,7 +29,7 @@ where
|
||||
/// It is the segment local version of the [`ScoreTweaker`](./trait.ScoreTweaker.html).
|
||||
pub trait ScoreSegmentTweaker<TScore>: 'static {
|
||||
/// Tweak the given `score` for the document `doc`.
|
||||
fn score(&self, doc: DocId, score: Score) -> TScore;
|
||||
fn score(&mut self, doc: DocId, score: Score) -> TScore;
|
||||
}
|
||||
|
||||
/// `ScoreTweaker` makes it possible to tweak the score
|
||||
@@ -121,9 +121,9 @@ where
|
||||
|
||||
impl<F, TScore> ScoreSegmentTweaker<TScore> for F
|
||||
where
|
||||
F: 'static + Sync + Send + Fn(DocId, Score) -> TScore,
|
||||
F: 'static + FnMut(DocId, Score) -> TScore,
|
||||
{
|
||||
fn score(&self, doc: DocId, score: Score) -> TScore {
|
||||
fn score(&mut self, doc: DocId, score: Score) -> TScore {
|
||||
(self)(doc, score)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,19 @@ pub use byteorder::LittleEndian as Endianness;
|
||||
/// We do not allow segments with more than
|
||||
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
||||
|
||||
pub fn minmax<I, T>(mut vals: I) -> Option<(T, T)>
|
||||
where
|
||||
I: Iterator<Item = T>,
|
||||
T: Copy + Ord,
|
||||
{
|
||||
if let Some(first_el) = vals.next() {
|
||||
return Some(vals.fold((first_el, first_el), |(min_val, max_val), el| {
|
||||
(min_val.min(el), max_val.max(el))
|
||||
}));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
///
|
||||
/// In general the target is the minimum number of bits
|
||||
@@ -134,6 +147,7 @@ pub fn u64_to_f64(val: u64) -> f64 {
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
pub use super::minmax;
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use std::f64;
|
||||
@@ -199,4 +213,21 @@ pub(crate) mod test {
|
||||
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
|
||||
assert!((super::MAX_DOC_LIMIT as i32) < 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_empty() {
|
||||
let vals: Vec<u32> = vec![];
|
||||
assert_eq!(minmax(vals.into_iter()), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_one() {
|
||||
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_two() {
|
||||
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
|
||||
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use crate::core::Executor;
|
||||
use crate::core::IndexMeta;
|
||||
@@ -22,7 +21,6 @@ use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::IndexWriter;
|
||||
use num_cpus;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
@@ -337,7 +335,7 @@ impl Index {
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
|
||||
create_segment(self.clone(), segment_meta)
|
||||
Segment::for_index(self.clone(), segment_meta)
|
||||
}
|
||||
|
||||
/// Creates a new segment.
|
||||
|
||||
@@ -3,8 +3,7 @@ use crate::core::SegmentId;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use serde;
|
||||
use serde_json;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -4,7 +4,6 @@ use crate::core::Executor;
|
||||
use crate::core::InvertedIndexReader;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::query::Query;
|
||||
use crate::query::Scorer;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Schema;
|
||||
@@ -24,17 +23,9 @@ fn collect_segment<C: Collector>(
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<C::Fruit> {
|
||||
let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
|
||||
let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
|
||||
if let Some(delete_bitset) = segment_reader.delete_bitset() {
|
||||
scorer.for_each(&mut |doc, score| {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
segment_collector.collect(doc, score);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
|
||||
}
|
||||
Ok(segment_collector.harvest())
|
||||
let segment_collector =
|
||||
collector.for_segment(segment_ord as u32, segment_reader)?;
|
||||
Ok(segment_collector.collect_scorer(&mut scorer, segment_reader.delete_bitset()))
|
||||
}
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
|
||||
@@ -24,15 +24,12 @@ impl fmt::Debug for Segment {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new segment given an `Index` and a `SegmentId`
|
||||
///
|
||||
/// The function is here to make it private outside `tantivy`.
|
||||
/// #[doc(hidden)]
|
||||
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment { index, meta }
|
||||
}
|
||||
|
||||
impl Segment {
|
||||
/// Creates a new segment given an `Index` and a `SegmentId`
|
||||
pub(crate) fn for_index(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment { index, meta }
|
||||
}
|
||||
|
||||
/// Returns the index the segment belongs to.
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
|
||||
@@ -4,6 +4,7 @@ use uuid::Uuid;
|
||||
|
||||
#[cfg(test)]
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::error::Error;
|
||||
use std::str::FromStr;
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -8,6 +8,8 @@ use crc32fast::Hasher;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
const FOOTER_MAX_LEN: usize = 10_000;
|
||||
|
||||
type CrcHashU32 = u32;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
@@ -143,12 +145,23 @@ impl BinarySerializable for VersionedFooter {
|
||||
}
|
||||
}
|
||||
BinarySerializable::serialize(&VInt(buf.len() as u64), writer)?;
|
||||
assert!(buf.len() <= FOOTER_MAX_LEN);
|
||||
writer.write_all(&buf[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let len = VInt::deserialize(reader)?.0 as usize;
|
||||
if len > FOOTER_MAX_LEN {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy.",
|
||||
len
|
||||
),
|
||||
));
|
||||
}
|
||||
let mut buf = vec![0u8; len];
|
||||
reader.read_exact(&mut buf[..])?;
|
||||
let mut cursor = &buf[..];
|
||||
@@ -221,11 +234,12 @@ mod tests {
|
||||
|
||||
use super::CrcHashU32;
|
||||
use super::FooterProxy;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::directory::footer::{Footer, VersionedFooter};
|
||||
use crate::directory::TerminatingWrite;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use regex::Regex;
|
||||
use std::io;
|
||||
|
||||
#[test]
|
||||
fn test_versioned_footer() {
|
||||
@@ -336,4 +350,20 @@ mod tests {
|
||||
let res = footer.is_compatible();
|
||||
assert!(res.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_deserialize_too_large_footer() {
|
||||
let mut buf = vec![];
|
||||
assert!(FooterProxy::new(&mut buf).terminate().is_ok());
|
||||
let mut long_len_buf = [0u8; 10];
|
||||
let num_bytes = VInt(super::FOOTER_MAX_LEN as u64 + 1u64).serialize_into(&mut long_len_buf);
|
||||
buf[0..num_bytes].copy_from_slice(&long_len_buf[..num_bytes]);
|
||||
let err = Footer::deserialize(&mut &buf[..]).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Footer seems invalid as it suggests a footer len of 10001. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,6 @@ use crate::error::DataCorruption;
|
||||
use crate::Directory;
|
||||
|
||||
use crc32fast::Hasher;
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -1,10 +1,3 @@
|
||||
use fs2;
|
||||
use notify;
|
||||
|
||||
use self::fs2::FileExt;
|
||||
use self::notify::RawEvent;
|
||||
use self::notify::RecursiveMode;
|
||||
use self::notify::Watcher;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::directory::error::{
|
||||
@@ -20,8 +13,12 @@ use crate::directory::WatchCallback;
|
||||
use crate::directory::WatchCallbackList;
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::{TerminatingWrite, WritePtr};
|
||||
use atomicwrites;
|
||||
use fs2::FileExt;
|
||||
use memmap::Mmap;
|
||||
use notify::RawEvent;
|
||||
use notify::RecursiveMode;
|
||||
use notify::Watcher;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
use std::fmt;
|
||||
@@ -223,17 +220,13 @@ struct MmapDirectoryInner {
|
||||
}
|
||||
|
||||
impl MmapDirectoryInner {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
) -> Result<MmapDirectoryInner, OpenDirectoryError> {
|
||||
let mmap_directory_inner = MmapDirectoryInner {
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
|
||||
MmapDirectoryInner {
|
||||
root_path,
|
||||
mmap_cache: Default::default(),
|
||||
_temp_directory: temp_directory,
|
||||
watcher: RwLock::new(None),
|
||||
};
|
||||
Ok(mmap_directory_inner)
|
||||
}
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
@@ -267,14 +260,11 @@ impl fmt::Debug for MmapDirectory {
|
||||
}
|
||||
|
||||
impl MmapDirectory {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory)?;
|
||||
Ok(MmapDirectory {
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectory {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory);
|
||||
MmapDirectory {
|
||||
inner: Arc::new(inner),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new MmapDirectory in a temporary directory.
|
||||
@@ -284,7 +274,7 @@ impl MmapDirectory {
|
||||
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let tempdir = TempDir::new().map_err(OpenDirectoryError::IoError)?;
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
MmapDirectory::new(tempdir_path, Some(tempdir))
|
||||
Ok(MmapDirectory::new(tempdir_path, Some(tempdir)))
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory.
|
||||
@@ -302,7 +292,7 @@ impl MmapDirectory {
|
||||
directory_path,
|
||||
)))
|
||||
} else {
|
||||
Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
|
||||
Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -144,6 +144,22 @@ impl RAMDirectory {
|
||||
pub fn total_mem_usage(&self) -> usize {
|
||||
self.fs.read().unwrap().total_mem_usage()
|
||||
}
|
||||
|
||||
    /// Writes a copy of all of the files saved in the RAMDirectory into the target `Directory`.
    ///
    /// Files are all written using the `Directory::open_write` API, even if they were
    /// originally written using the `atomic_write` API.
    ///
    /// If an error is encountered, files may be persisted partially.
|
||||
pub fn persist(&self, dest: &mut dyn Directory) -> crate::Result<()> {
|
||||
let wlock = self.fs.write().unwrap();
|
||||
for (path, source) in wlock.fs.iter() {
|
||||
let mut dest_wrt = dest.open_write(path)?;
|
||||
dest_wrt.write_all(source.as_slice())?;
|
||||
dest_wrt.terminate()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for RAMDirectory {
|
||||
@@ -204,3 +220,28 @@ impl Directory for RAMDirectory {
|
||||
Ok(self.fs.write().unwrap().watch(watch_callback))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::RAMDirectory;
|
||||
use crate::Directory;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
|
||||
fn test_persist() {
|
||||
let msg_atomic: &'static [u8] = b"atomic is the way";
|
||||
let msg_seq: &'static [u8] = b"sequential is the way";
|
||||
let path_atomic: &'static Path = Path::new("atomic");
|
||||
let path_seq: &'static Path = Path::new("seq");
|
||||
let mut directory = RAMDirectory::create();
|
||||
assert!(directory.atomic_write(path_atomic, msg_atomic).is_ok());
|
||||
let mut wrt = directory.open_write(path_seq).unwrap();
|
||||
assert!(wrt.write_all(msg_seq).is_ok());
|
||||
assert!(wrt.flush().is_ok());
|
||||
let mut directory_copy = RAMDirectory::create();
|
||||
assert!(directory.persist(&mut directory_copy).is_ok());
|
||||
assert_eq!(directory_copy.atomic_read(path_atomic).unwrap(), msg_atomic);
|
||||
assert_eq!(directory_copy.atomic_read(path_seq).unwrap(), msg_seq);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ use crate::directory::error::{Incompatibility, LockError};
|
||||
use crate::fastfield::FastFieldNotAvailableError;
|
||||
use crate::query;
|
||||
use crate::schema;
|
||||
use serde_json;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
|
||||
@@ -6,7 +6,6 @@ use crate::schema::{Document, Field};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
use std::io;
|
||||
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
@@ -151,8 +150,8 @@ impl MultiValueIntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let val_min_max = self.vals.iter().cloned().minmax();
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
|
||||
let val_min_max = crate::common::minmax(self.vals.iter().cloned());
|
||||
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for &val in &self.vals {
|
||||
|
||||
@@ -155,6 +155,8 @@ pub(crate) fn advance_deletes(
|
||||
None => BitSet::with_max_value(max_doc),
|
||||
};
|
||||
|
||||
let num_deleted_docs_before = segment.meta().num_deleted_docs();
|
||||
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
@@ -164,6 +166,8 @@ pub(crate) fn advance_deletes(
|
||||
)?;
|
||||
|
||||
// TODO optimize
|
||||
    // It should be possible to do something smarter by manipulating bitsets directly
|
||||
// to compute this union.
|
||||
if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
|
||||
for doc in 0u32..max_doc {
|
||||
if seg_delete_bitset.is_deleted(doc) {
|
||||
@@ -172,8 +176,9 @@ pub(crate) fn advance_deletes(
|
||||
}
|
||||
}
|
||||
|
||||
let num_deleted_docs = delete_bitset.len();
|
||||
if num_deleted_docs > 0 {
|
||||
let num_deleted_docs: u32 = delete_bitset.len() as u32;
|
||||
if num_deleted_docs > num_deleted_docs_before {
|
||||
// There are new deletes. We need to write a new delete file.
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||
@@ -803,6 +808,46 @@ mod tests {
|
||||
assert_eq!(batch_opstamp1, 2u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_no_need_to_rewrite_delete_file_if_no_new_deletes() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field => "hello1"));
|
||||
index_writer.add_document(doc!(text_field => "hello2"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0);
|
||||
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
assert!(reader.reload().is_ok());
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
|
||||
|
||||
let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
|
||||
|
||||
        // All docs containing hello1 have already been removed.
|
||||
// We should not update the delete meta.
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
assert!(reader.reload().is_ok());
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
|
||||
|
||||
let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
|
||||
assert_eq!(after_delete_opstamp, previous_delete_opstamp);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ordered_batched_operations() {
|
||||
// * one delete for `doc!(field=>"a")`
|
||||
|
||||
@@ -21,7 +21,6 @@ use crate::store::StoreWriter;
|
||||
use crate::termdict::TermMerger;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use itertools::Itertools;
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -70,11 +69,11 @@ fn compute_min_max_val(
|
||||
Some(delete_bitset) => {
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
(0..max_doc)
|
||||
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
|
||||
.map(|doc_id| u64_reader.get(doc_id))
|
||||
.minmax()
|
||||
.into_option()
|
||||
crate::common::minmax(
|
||||
(0..max_doc)
|
||||
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
|
||||
.map(|doc_id| u64_reader.get(doc_id)),
|
||||
)
|
||||
}
|
||||
None => {
|
||||
// no deleted documents,
|
||||
|
||||
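The two hunks above replace itertools' `minmax()` with an in-crate helper, `crate::common::minmax`. Here is a minimal sketch of such a helper, under the assumption that it takes an iterator and returns `Option<(min, max)>`; the real signature may differ.

fn minmax<I, T>(mut vals: I) -> Option<(T, T)>
where
    I: Iterator<Item = T>,
    T: Copy + Ord,
{
    // Single pass: track the running minimum and maximum.
    let first = vals.next()?;
    let mut min = first;
    let mut max = first;
    for val in vals {
        if val < min {
            min = val;
        }
        if val > max {
            max = val;
        }
    }
    Some((min, max))
}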
@@ -23,7 +23,6 @@ use futures::channel::oneshot;
|
||||
use futures::executor::{ThreadPool, ThreadPoolBuilder};
|
||||
use futures::future::Future;
|
||||
use futures::future::TryFutureExt;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -98,9 +98,6 @@
|
||||
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
|
||||
//! [source code](https://github.com/tantivy-search/tantivy/blob/master/examples/basic_search.rs))
|
||||
|
||||
#[macro_use]
|
||||
extern crate serde_derive;
|
||||
|
||||
#[cfg_attr(test, macro_use)]
|
||||
extern crate serde_json;
|
||||
|
||||
@@ -173,6 +170,7 @@ pub use crate::schema::{Document, Term};
|
||||
use std::fmt;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 1;
|
||||
|
||||
16
src/postings/block_max_postings.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
use crate::postings::Postings;
|
||||
use crate::DocId;
|
||||
|
||||
/// Inverted list with additional information about the maximum term frequency
|
||||
/// within a block, as well as globally within the list.
|
||||
pub trait BlockMaxPostings: Postings {
|
||||
/// Returns the maximum frequency in the entire list.
|
||||
fn max_term_freq(&self) -> u32;
|
||||
/// Returns the maximum frequency in the current block.
|
||||
fn block_max_term_freq(&mut self) -> u32;
|
||||
/// Returns the document with the largest frequency.
|
||||
fn max_doc(&self) -> DocId;
|
||||
/// Returns the document with the largest frequency within the current
|
||||
/// block.
|
||||
fn block_max_doc(&self) -> DocId;
|
||||
}
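A hedged sketch of how a caller might exploit this trait for frequency-based pruning. The function name and the `threshold` parameter are illustrative only; the import mirrors paths used elsewhere in this diff.

use crate::docset::DocSet;

// Counts documents whose term frequency reaches `threshold`, consulting the
// block-level maximum before looking at individual documents.
fn count_docs_with_freq_at_least<P: BlockMaxPostings>(postings: &mut P, threshold: u32) -> u32 {
    // If even the global maximum is below the threshold, nothing can qualify.
    if postings.max_term_freq() < threshold {
        return 0;
    }
    let mut count = 0u32;
    while postings.advance() {
        // Only read the per-document frequency when the current block's
        // maximum says the block could contain a qualifying document.
        if postings.block_max_term_freq() >= threshold && postings.term_freq() >= threshold {
            count += 1;
        }
    }
    count
}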
|
||||
76
src/postings/block_max_segment_postings.rs
Normal file
@@ -0,0 +1,76 @@
|
||||
use crate::postings::{BlockMaxPostings, Postings, SegmentPostings};
|
||||
use crate::{DocId, DocSet, SkipResult};
|
||||
|
||||
/// A wrapper over [`SegmentPostings`](./struct.SegmentPostings.html)
|
||||
/// with max block frequencies.
|
||||
pub struct BlockMaxSegmentPostings {
|
||||
postings: SegmentPostings,
|
||||
max_blocks: SegmentPostings,
|
||||
doc_with_max_term_freq: DocId,
|
||||
max_term_freq: u32,
|
||||
}
|
||||
|
||||
impl BlockMaxSegmentPostings {
|
||||
/// Constructs a new segment postings with block-max information.
|
||||
pub fn new(
|
||||
postings: SegmentPostings,
|
||||
max_blocks: SegmentPostings,
|
||||
doc_with_max_term_freq: DocId,
|
||||
max_term_freq: u32,
|
||||
) -> Self {
|
||||
Self {
|
||||
postings,
|
||||
max_blocks,
|
||||
doc_with_max_term_freq,
|
||||
max_term_freq,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for BlockMaxSegmentPostings {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.postings.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.postings.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.postings.size_hint()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.postings.skip_next(target)
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for BlockMaxSegmentPostings {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.postings.term_freq()
|
||||
}
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
self.postings.positions_with_offset(offset, output);
|
||||
}
|
||||
fn positions(&mut self, output: &mut Vec<u32>) {
|
||||
self.postings.positions(output);
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockMaxPostings for BlockMaxSegmentPostings {
|
||||
fn max_term_freq(&self) -> u32 {
|
||||
self.max_term_freq
|
||||
}
|
||||
fn block_max_term_freq(&mut self) -> u32 {
|
||||
if let SkipResult::End = self.max_blocks.skip_next(self.doc()) {
|
||||
panic!("Max blocks corrupted: reached end of max block");
|
||||
}
|
||||
self.max_blocks.term_freq()
|
||||
}
|
||||
fn max_doc(&self) -> DocId {
|
||||
self.doc_with_max_term_freq
|
||||
}
|
||||
fn block_max_doc(&self) -> DocId {
|
||||
self.max_blocks.doc()
|
||||
}
|
||||
}
|
||||
@@ -106,7 +106,7 @@ impl BlockSearcher {
|
||||
/// the target.
|
||||
///
|
||||
/// The results should be equivalent to
|
||||
/// ```ignore
|
||||
/// ```compile_fail
|
||||
/// block[..]
|
||||
// .iter()
|
||||
// .take_while(|&&val| val < target)
|
||||
|
||||
316
src/postings/block_segment_postings.rs
Normal file
@@ -0,0 +1,316 @@
|
||||
use crate::DocId;
|
||||
use tantivy_fst::Streamer;
|
||||
use crate::postings::{SkipReader, FreqReadingOption, USE_SKIP_INFO_LIMIT};
|
||||
use owned_read::OwnedRead;
|
||||
use crate::postings::compression::{BlockDecoder, COMPRESSION_BLOCK_SIZE, VIntDecoder, compressed_block_size, AlignedBuffer};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::common::{VInt, BinarySerializable};
|
||||
|
||||
|
||||
fn split_into_skips_and_postings(
|
||||
doc_freq: u32,
|
||||
mut data: OwnedRead,
|
||||
) -> (Option<OwnedRead>, OwnedRead) {
|
||||
if doc_freq >= USE_SKIP_INFO_LIMIT {
|
||||
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
|
||||
let mut postings_data = data.clone();
|
||||
postings_data.advance(skip_len);
|
||||
data.clip(skip_len);
|
||||
(Some(data), postings_data)
|
||||
} else {
|
||||
(None, data)
|
||||
}
|
||||
}
|
||||
|
||||
/// `BlockSegmentPostings` is a cursor iterating over blocks
|
||||
/// of documents.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// While it is useful for some very specific high-performance
|
||||
/// use cases, you should prefer using `SegmentPostings` for most usage.
|
||||
pub struct BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder,
|
||||
freq_decoder: BlockDecoder,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
|
||||
doc_freq: usize,
|
||||
doc_offset: DocId,
|
||||
|
||||
num_vint_docs: usize,
|
||||
|
||||
remaining_data: OwnedRead,
|
||||
skip_reader: SkipReader,
|
||||
}
|
||||
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub enum BlockSegmentPostingsSkipResult {
|
||||
Terminated,
|
||||
Success(u32), //< number of term freqs to skip
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: u32,
|
||||
data: OwnedRead,
|
||||
record_option: IndexRecordOption,
|
||||
requested_option: IndexRecordOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let freq_reading_option = match (record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
||||
None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
|
||||
};
|
||||
let doc_freq = doc_freq as usize;
|
||||
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
|
||||
BlockSegmentPostings {
|
||||
num_vint_docs,
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option,
|
||||
doc_offset: 0,
|
||||
doc_freq,
|
||||
remaining_data: postings_data,
|
||||
skip_reader,
|
||||
}
|
||||
}
|
||||
|
||||
// Resets the block segment postings on another position
|
||||
// in the postings file.
|
||||
//
|
||||
// This is useful for enumerating through a list of terms,
|
||||
// and consuming the associated posting lists while avoiding
|
||||
// reallocating a `BlockSegmentPostings`.
|
||||
//
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedRead) {
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
|
||||
let num_vint_docs = (doc_freq as usize) & (COMPRESSION_BLOCK_SIZE - 1);
|
||||
self.num_vint_docs = num_vint_docs;
|
||||
self.remaining_data = postings_data;
|
||||
if let Some(skip_data) = skip_data_opt {
|
||||
self.skip_reader.reset(skip_data);
|
||||
} else {
|
||||
self.skip_reader.reset(OwnedRead::new(&[][..]))
|
||||
}
|
||||
self.doc_offset = 0;
|
||||
self.doc_freq = doc_freq as usize;
|
||||
}
|
||||
|
||||
/// Returns the document frequency associated to this block postings.
|
||||
///
|
||||
    /// This `doc_freq` is simply the sum of the lengths of all of the blocks,
    /// and it does not take deleted documents into account.
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
/// Returns the array of docs in the current block.
|
||||
///
|
||||
/// Before the first call to `.advance()`, the block
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Return the array of `term freq` in the block.
|
||||
#[inline]
|
||||
pub fn freqs(&self) -> &[u32] {
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the frequency at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
    /// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
    /// except the last block, which may have a length
    /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`.
|
||||
#[inline]
|
||||
pub(crate) fn block_len(&self) -> usize {
|
||||
self.doc_decoder.output_len
|
||||
}
|
||||
|
||||
    /// Positions the cursor on a block that may contain `doc_id`.
    /// Always advances the current block.
|
||||
///
|
||||
/// Returns true if a block that has an element greater or equal to the target is found.
|
||||
/// Returning true does not guarantee that the smallest element of the block is smaller
|
||||
/// than the target. It only guarantees that the last element is greater or equal.
|
||||
///
|
||||
    /// Returns false iff all of the remaining documents are smaller than
    /// `doc_id`. In that case, all of these documents are consumed.
|
||||
///
|
||||
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
|
||||
let mut skip_freqs = 0u32;
|
||||
while self.skip_reader.advance() {
|
||||
if self.skip_reader.doc() >= target_doc {
|
||||
// the last document of the current block is larger
|
||||
// than the target.
|
||||
//
|
||||
// We found our block!
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
return BlockSegmentPostingsSkipResult::Success(skip_freqs);
|
||||
} else {
|
||||
skip_freqs += self.skip_reader.tf_sum();
|
||||
let advance_len = self.skip_reader.total_block_len();
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
self.remaining_data.advance(advance_len);
|
||||
}
|
||||
}
|
||||
|
||||
// we are now on the last, incomplete, variable encoded block.
|
||||
if self.num_vint_docs > 0 {
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
return self
|
||||
.docs()
|
||||
.last()
|
||||
.map(|last_doc| {
|
||||
if *last_doc >= target_doc {
|
||||
BlockSegmentPostingsSkipResult::Success(skip_freqs)
|
||||
} else {
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
})
|
||||
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
|
||||
}
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
|
||||
/// Advance to the next block.
|
||||
///
|
||||
    /// Returns false iff there are no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.skip_reader.advance() {
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
// it will be used as the next offset.
|
||||
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
|
||||
true
|
||||
} else if self.num_vint_docs > 0 {
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> BlockSegmentPostings {
|
||||
BlockSegmentPostings {
|
||||
num_vint_docs: 0,
|
||||
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
|
||||
doc_offset: 0,
|
||||
doc_freq: 0,
|
||||
|
||||
remaining_data: OwnedRead::new(vec![]),
|
||||
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Streamer<'a> for BlockSegmentPostings {
|
||||
type Item = &'a [DocId];
|
||||
|
||||
fn next(&'a mut self) -> Option<&'a [DocId]> {
|
||||
if self.advance() {
|
||||
Some(self.docs())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
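A small usage sketch for the cursor above, assuming term frequencies were recorded at indexing time; the function name is illustrative.

// Sums the term frequencies of an entire posting list, one decoded block at a time.
fn total_term_freq(block_postings: &mut BlockSegmentPostings) -> u64 {
    let mut total = 0u64;
    // Each `advance()` decodes the next block; `docs()` and `freqs()` then
    // expose that block's doc ids and term frequencies.
    while block_postings.advance() {
        total += block_postings.freqs().iter().map(|&tf| u64::from(tf)).sum::<u64>();
    }
    total
}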
|
||||
@@ -2,6 +2,8 @@
|
||||
Postings module (also called inverted index)
|
||||
*/
|
||||
|
||||
mod block_max_postings;
|
||||
mod block_max_segment_postings;
|
||||
mod block_search;
|
||||
pub(crate) mod compression;
|
||||
/// Postings module
|
||||
@@ -12,6 +14,7 @@ mod postings;
|
||||
mod postings_writer;
|
||||
mod recorder;
|
||||
mod segment_postings;
|
||||
mod block_segment_postings;
|
||||
mod serializer;
|
||||
mod skip;
|
||||
mod stacker;
|
||||
@@ -27,7 +30,11 @@ pub use self::postings::Postings;
|
||||
pub(crate) use self::skip::SkipReader;
|
||||
pub use self::term_info::TermInfo;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
|
||||
pub use self::block_segment_postings::BlockSegmentPostings;
|
||||
pub use self::block_max_postings::BlockMaxPostings;
|
||||
pub use self::block_max_segment_postings::BlockMaxSegmentPostings;
|
||||
|
||||
pub(crate) use self::stacker::compute_table_size;
|
||||
|
||||
|
||||
@@ -1,21 +1,16 @@
|
||||
use crate::common::BitSet;
|
||||
use crate::common::HasLen;
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::compression::{compressed_block_size, AlignedBuffer};
|
||||
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::serializer::PostingsSerializer;
|
||||
use crate::postings::BlockSearcher;
|
||||
use crate::postings::FreqReadingOption;
|
||||
use crate::postings::{BlockSearcher, BlockSegmentPostings};
|
||||
use crate::postings::Postings;
|
||||
use crate::postings::SkipReader;
|
||||
use crate::postings::USE_SKIP_INFO_LIMIT;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::DocId;
|
||||
use owned_read::OwnedRead;
|
||||
use std::cmp::Ordering;
|
||||
use tantivy_fst::Streamer;
|
||||
use crate::postings::block_segment_postings::BlockSegmentPostingsSkipResult;
|
||||
|
||||
struct PositionComputer {
|
||||
// store the amount of position int
|
||||
@@ -299,313 +294,6 @@ impl Postings for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
/// `BlockSegmentPostings` is a cursor iterating over blocks
|
||||
/// of documents.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// While it is useful for some very specific high-performance
|
||||
/// use cases, you should prefer using `SegmentPostings` for most usage.
|
||||
pub struct BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder,
|
||||
freq_decoder: BlockDecoder,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
|
||||
doc_freq: usize,
|
||||
doc_offset: DocId,
|
||||
|
||||
num_vint_docs: usize,
|
||||
|
||||
remaining_data: OwnedRead,
|
||||
skip_reader: SkipReader,
|
||||
}
|
||||
|
||||
fn split_into_skips_and_postings(
|
||||
doc_freq: u32,
|
||||
mut data: OwnedRead,
|
||||
) -> (Option<OwnedRead>, OwnedRead) {
|
||||
if doc_freq >= USE_SKIP_INFO_LIMIT {
|
||||
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
|
||||
let mut postings_data = data.clone();
|
||||
postings_data.advance(skip_len);
|
||||
data.clip(skip_len);
|
||||
(Some(data), postings_data)
|
||||
} else {
|
||||
(None, data)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub enum BlockSegmentPostingsSkipResult {
|
||||
Terminated,
|
||||
Success(u32), //< number of term freqs to skip
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: u32,
|
||||
data: OwnedRead,
|
||||
record_option: IndexRecordOption,
|
||||
requested_option: IndexRecordOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let freq_reading_option = match (record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
||||
None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
|
||||
};
|
||||
let doc_freq = doc_freq as usize;
|
||||
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
|
||||
BlockSegmentPostings {
|
||||
num_vint_docs,
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option,
|
||||
doc_offset: 0,
|
||||
doc_freq,
|
||||
remaining_data: postings_data,
|
||||
skip_reader,
|
||||
}
|
||||
}
|
||||
|
||||
// Resets the block segment postings on another position
|
||||
// in the postings file.
|
||||
//
|
||||
// This is useful for enumerating through a list of terms,
|
||||
// and consuming the associated posting lists while avoiding
|
||||
// reallocating a `BlockSegmentPostings`.
|
||||
//
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedRead) {
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
|
||||
let num_vint_docs = (doc_freq as usize) & (COMPRESSION_BLOCK_SIZE - 1);
|
||||
self.num_vint_docs = num_vint_docs;
|
||||
self.remaining_data = postings_data;
|
||||
if let Some(skip_data) = skip_data_opt {
|
||||
self.skip_reader.reset(skip_data);
|
||||
} else {
|
||||
self.skip_reader.reset(OwnedRead::new(&[][..]))
|
||||
}
|
||||
self.doc_offset = 0;
|
||||
self.doc_freq = doc_freq as usize;
|
||||
}
|
||||
|
||||
/// Returns the document frequency associated to this block postings.
|
||||
///
|
||||
/// This `doc_freq` is simply the sum of the length of all of the blocks
|
||||
/// length, and it does not take in account deleted documents.
|
||||
pub fn doc_freq(&self) -> usize {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
/// Returns the array of docs in the current block.
|
||||
///
|
||||
/// Before the first call to `.advance()`, the block
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Return the array of `term freq` in the block.
|
||||
#[inline]
|
||||
pub fn freqs(&self) -> &[u32] {
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the frequency at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Returns the length of the current block.
|
||||
///
|
||||
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
|
||||
/// except the last block that may have a length
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
#[inline]
|
||||
fn block_len(&self) -> usize {
|
||||
self.doc_decoder.output_len
|
||||
}
|
||||
|
||||
/// position on a block that may contains `doc_id`.
|
||||
/// Always advance the current block.
|
||||
///
|
||||
/// Returns true if a block that has an element greater or equal to the target is found.
|
||||
/// Returning true does not guarantee that the smallest element of the block is smaller
|
||||
/// than the target. It only guarantees that the last element is greater or equal.
|
||||
///
|
||||
/// Returns false iff all of the document remaining are smaller than
|
||||
/// `doc_id`. In that case, all of these document are consumed.
|
||||
///
|
||||
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
|
||||
let mut skip_freqs = 0u32;
|
||||
while self.skip_reader.advance() {
|
||||
if self.skip_reader.doc() >= target_doc {
|
||||
// the last document of the current block is larger
|
||||
// than the target.
|
||||
//
|
||||
// We found our block!
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
return BlockSegmentPostingsSkipResult::Success(skip_freqs);
|
||||
} else {
|
||||
skip_freqs += self.skip_reader.tf_sum();
|
||||
let advance_len = self.skip_reader.total_block_len();
|
||||
self.doc_offset = self.skip_reader.doc();
|
||||
self.remaining_data.advance(advance_len);
|
||||
}
|
||||
}
|
||||
|
||||
// we are now on the last, incomplete, variable encoded block.
|
||||
if self.num_vint_docs > 0 {
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
return self
|
||||
.docs()
|
||||
.last()
|
||||
.map(|last_doc| {
|
||||
if *last_doc >= target_doc {
|
||||
BlockSegmentPostingsSkipResult::Success(skip_freqs)
|
||||
} else {
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
})
|
||||
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
|
||||
}
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
|
||||
/// Advance to the next block.
|
||||
///
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.skip_reader.advance() {
|
||||
let num_bits = self.skip_reader.doc_num_bits();
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
num_bits,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
}
|
||||
// it will be used as the next offset.
|
||||
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
|
||||
true
|
||||
} else if self.num_vint_docs > 0 {
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> BlockSegmentPostings {
|
||||
BlockSegmentPostings {
|
||||
num_vint_docs: 0,
|
||||
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
|
||||
doc_offset: 0,
|
||||
doc_freq: 0,
|
||||
|
||||
remaining_data: OwnedRead::new(vec![]),
|
||||
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
type Item = &'b [DocId];
|
||||
|
||||
fn next(&'b mut self) -> Option<&'b [DocId]> {
|
||||
if self.advance() {
|
||||
Some(self.docs())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::BlockSegmentPostings;
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
use murmurhash32;
|
||||
|
||||
use self::murmurhash32::murmurhash2;
|
||||
use murmurhash32::murmurhash2;
|
||||
|
||||
use super::{Addr, MemoryArena};
|
||||
use crate::postings::stacker::memory_arena::store;
|
||||
|
||||
31
src/query/block_max_scorer.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
use crate::docset::DocSet;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use downcast_rs::impl_downcast;
|
||||
use crate::query::Scorer;
|
||||
|
||||
/// A set of documents matching a query within a specific segment
|
||||
/// and having a maximum score within certain blocks.
|
||||
///
|
||||
/// See [`Query`](./trait.Query.html) and [`Scorer`](./trait.Scorer.html).
|
||||
pub trait BlockMaxScorer: downcast_rs::Downcast + DocSet + Scorer + 'static {
|
||||
/// Returns the maximum score within the current block.
|
||||
///
|
||||
    /// The blocks are defined when indexing. For example, blocks can
    /// have a specific number of postings each, or can be optimized for
|
||||
/// retrieval speed. Read more in
|
||||
/// [Faster BlockMax WAND with Variable-sized Blocks][vbmw]
|
||||
///
|
||||
/// This method will perform a bit of computation and is not cached.
|
||||
///
|
||||
/// [vbmw]: https://dl.acm.org/doi/abs/10.1145/3077136.3080780
|
||||
fn block_max_score(&mut self) -> Score;
|
||||
|
||||
/// Returns the last document in the current block.
|
||||
fn block_max_doc(&mut self) -> DocId;
|
||||
|
||||
/// Returns the maximum possible score within the entire document set.
|
||||
fn max_score(&self) -> Score;
|
||||
}
|
||||
|
||||
impl_downcast!(BlockMaxScorer);
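A hedged sketch of the kind of pruning this trait enables; `collect_above_threshold` and its threshold handling are illustrative, not tantivy APIs. `SkipResult` is imported from the same module the other new files in this diff use it from.

use crate::docset::SkipResult;

// Collects (doc, score) pairs scoring strictly above `threshold`, skipping
// past blocks whose block-level upper bound cannot beat it.
fn collect_above_threshold<S: BlockMaxScorer>(scorer: &mut S, threshold: Score) -> Vec<(DocId, Score)> {
    let mut hits = Vec::new();
    if scorer.max_score() <= threshold {
        // The whole list is bounded below the threshold: nothing can qualify.
        return hits;
    }
    while scorer.advance() {
        if scorer.block_max_score() <= threshold {
            // No document in the current block can qualify: jump past it.
            let block_last = scorer.block_max_doc();
            if scorer.skip_next(block_last + 1) == SkipResult::End {
                break;
            }
        }
        let score = scorer.score();
        if score > threshold {
            hits.push((scorer.doc(), score));
        }
    }
    hits
}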
|
||||
613
src/query/block_max_wand.rs
Normal file
@@ -0,0 +1,613 @@
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::query::score_combiner::ScoreCombiner;
|
||||
use crate::query::{BlockMaxScorer, Scorer};
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::query::weight::PruningScorer;
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
struct Pivot {
|
||||
position: usize,
|
||||
first_occurrence: usize,
|
||||
doc: DocId,
|
||||
}
|
||||
|
||||
|
||||
/// Find the position in the sorted list of posting lists of the **pivot**.
|
||||
///
|
||||
/// The docsets need to have been advanced, and are required to be sorted by the doc they point to.
|
||||
///
|
||||
/// The pivot is then defined as the lowest DocId that has a chance of matching our condition.
|
||||
fn find_pivot_position<'a, TScorer>(
|
||||
mut docsets: impl Iterator<Item = &'a TScorer>,
|
||||
lower_bound_score: Score,
|
||||
) -> Option<Pivot>
|
||||
where TScorer: BlockMaxScorer
|
||||
{
|
||||
let mut position = 0;
|
||||
let mut upper_bound = Score::default();
|
||||
while let Some(docset) = docsets.next() {
|
||||
upper_bound += docset.max_score();
|
||||
if lower_bound_score < upper_bound {
|
||||
let pivot_doc = docset.doc();
|
||||
let first_occurrence = position;
|
||||
while let Some(docset) = docsets.next() {
|
||||
if docset.doc() != pivot_doc {
|
||||
break;
|
||||
} else {
|
||||
position += 1;
|
||||
}
|
||||
}
|
||||
return Some(Pivot {
|
||||
position,
|
||||
doc: pivot_doc,
|
||||
first_occurrence,
|
||||
});
|
||||
}
|
||||
position += 1;
|
||||
}
|
||||
None
|
||||
}
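// Worked example (illustrative, not taken from the tests below): with the
// docsets currently positioned on docs [2, 5, 5, 9], per-list max scores
// [2.0, 4.0, 3.0, 4.0], and lower_bound_score = 5.0, the running upper bound
// reaches 6.0 at position 1, so doc 5 becomes the pivot; the trailing scan
// then extends `position` over the second docset sitting on doc 5, giving
// Pivot { position: 2, doc: 5, first_occurrence: 1 }.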
|
||||
|
||||
/// Given an iterator over all ordered lists up to the pivot (inclusive) and the following list (if
/// it exists), it returns the next document ID that can possibly be relevant, based on the block max
/// scores.
|
||||
fn find_next_relevant_doc<T, TScorer>(
|
||||
docsets_up_to_pivot: &mut [T],
|
||||
pivot_docset: &mut T,
|
||||
docset_after_pivot: Option<&mut T>,
|
||||
) -> DocId
|
||||
where
|
||||
T: AsMut<TScorer>,
|
||||
TScorer: BlockMaxScorer + Scorer,
|
||||
{
|
||||
let pivot_docset = pivot_docset.as_mut();
|
||||
let mut next_doc = 1 + docsets_up_to_pivot
|
||||
.iter_mut()
|
||||
.map(|docset| docset.as_mut().block_max_doc())
|
||||
.chain(std::iter::once(pivot_docset.block_max_doc()))
|
||||
.min()
|
||||
.unwrap();
|
||||
if let Some(docset) = docset_after_pivot {
|
||||
let doc = docset.as_mut().doc();
|
||||
if doc < next_doc {
|
||||
next_doc = doc;
|
||||
}
|
||||
}
|
||||
if next_doc <= pivot_docset.doc() {
|
||||
pivot_docset.doc() + 1
|
||||
} else {
|
||||
next_doc
|
||||
}
|
||||
}
|
||||
|
||||
/// Sifts down the first element of the slice.
|
||||
///
|
||||
/// `docsets[1..]` are assumed sorted.
|
||||
/// This function swaps `docsets[0]` with its right
|
||||
/// neighbor successively, bubble-sort style, until it reaches the first
|
||||
/// position such that `docsets` is sorted.
|
||||
fn sift_down<T, TScorer>(docsets: &mut [T])
|
||||
where
|
||||
T: AsRef<TScorer>,
|
||||
TScorer: BlockMaxScorer + Scorer,
|
||||
{
|
||||
for idx in 1..docsets.len() {
|
||||
if docsets[idx].as_ref().doc() >= docsets[idx - 1].as_ref().doc() {
|
||||
return;
|
||||
}
|
||||
docsets.swap(idx, idx - 1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a `DocSet` that iterates through the union of two or more `DocSet`s,
|
||||
/// applying [BlockMaxWand] dynamic pruning.
|
||||
///
|
||||
/// [BlockMaxWand]: https://dl.acm.org/doi/10.1145/2009916.2010048
|
||||
pub struct BlockMaxWand<TScorer, TScoreCombiner> {
|
||||
docsets: Vec<Box<TScorer>>,
|
||||
doc: DocId,
|
||||
score: Score,
|
||||
combiner: TScoreCombiner,
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> BlockMaxWand<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
TScorer: BlockMaxScorer + Scorer,
|
||||
{
|
||||
fn new(
|
||||
docsets: Vec<TScorer>,
|
||||
combiner: TScoreCombiner,
|
||||
) -> BlockMaxWand<TScorer, TScoreCombiner> {
|
||||
let mut non_empty_docsets: Vec<_> = docsets
|
||||
.into_iter()
|
||||
.flat_map(|mut docset| {
|
||||
if docset.advance() {
|
||||
Some(Box::new(docset))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
non_empty_docsets.sort_by_key(Box::<TScorer>::doc);
|
||||
BlockMaxWand {
|
||||
docsets: non_empty_docsets,
|
||||
combiner,
|
||||
doc: 0u32,
|
||||
score: 0f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Find the position in the sorted list of posting lists of the **pivot**.
|
||||
fn find_pivot_position(&self, lower_bound_score: Score) -> Option<Pivot> {
|
||||
find_pivot_position(
|
||||
self.docsets.iter().map(|docset| docset.as_ref()),
|
||||
lower_bound_score)
|
||||
}
|
||||
|
||||
fn advance_with_pivot(&mut self, pivot: Pivot, lower_bound_score: Score) -> SkipResult {
|
||||
let block_upper_bound: Score = self.docsets[..=pivot.position]
|
||||
.iter_mut()
|
||||
.map(|docset| docset.block_max_score())
|
||||
.sum();
|
||||
if block_upper_bound > lower_bound_score {
|
||||
if pivot.doc == self.docsets[0].doc() {
|
||||
// Since self.docsets is sorted by their current doc, in this branch, all
|
||||
// docsets in [0..=pivot] are positioned on pivot.doc.
|
||||
//
|
||||
// Lets compute the actual score for this doc.
|
||||
//
|
||||
// NOTE(elshize): One additional check needs to be done to improve performance:
|
||||
// update block-wise bound while accumulating score with the actual score,
|
||||
// and check each time if still above threshold.
|
||||
self.combiner.clear();
|
||||
for idx in (0..=pivot.position).rev() {
|
||||
self.combiner.update(self.docsets[idx].as_mut());
|
||||
if !self.docsets[idx].advance() {
|
||||
self.docsets.swap_remove(idx);
|
||||
}
|
||||
}
|
||||
self.score = self.combiner.score();
|
||||
self.doc = pivot.doc;
|
||||
self.docsets.sort_by_key(Box::<TScorer>::doc);
|
||||
SkipResult::Reached
|
||||
} else {
|
||||
                // The subtraction does not underflow because otherwise we would go to the other
|
||||
// branch.
|
||||
//
|
||||
                // `advanced_idx` is the last idx that is not yet positioned on the pivot.
|
||||
let advanced_idx = pivot.first_occurrence - 1;
|
||||
if !self.docsets[advanced_idx].advance() {
|
||||
self.docsets.swap_remove(advanced_idx);
|
||||
}
|
||||
if self.docsets.is_empty() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
sift_down(&mut self.docsets[advanced_idx..]);
|
||||
SkipResult::OverStep
|
||||
}
|
||||
} else {
|
||||
let (up_to_pivot, pivot_and_rest) = self.docsets.split_at_mut(pivot.position as usize);
|
||||
let (pivot, after_pivot) = pivot_and_rest.split_first_mut().unwrap();
|
||||
let next_doc = find_next_relevant_doc(up_to_pivot, pivot, after_pivot.first_mut());
|
||||
// NOTE(elshize): It might be more efficient to advance the list with the higher
|
||||
// max score, but let's advance the first one for now for simplicity.
|
||||
if self.docsets[0].skip_next(next_doc) == SkipResult::End {
|
||||
self.docsets.swap_remove(0);
|
||||
}
|
||||
if self.docsets.is_empty() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
sift_down(&mut self.docsets[..]);
|
||||
SkipResult::OverStep
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> PruningScorer
|
||||
for BlockMaxWand<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
TScorer: Scorer + BlockMaxScorer,
|
||||
{
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn score(&self) -> Score {
|
||||
self.score
|
||||
}
|
||||
|
||||
fn advance_with_pruning(&mut self, lower_bound_score: f32) -> bool {
|
||||
while let Some(pivot) = self.find_pivot_position(lower_bound_score) {
|
||||
match self.advance_with_pivot(pivot, lower_bound_score) {
|
||||
SkipResult::End => { return false },
|
||||
SkipResult::Reached=> { return true; }
|
||||
SkipResult::OverStep => {}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
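A minimal sketch of driving a `PruningScorer` such as `BlockMaxWand` from the outside, assuming the trait is visible from the calling module. A real top-k collector would keep a heap of the k best hits; here only the single best hit is tracked and its score is fed back as the pruning lower bound. The function name is illustrative.

// Returns the single best (doc, score) pair, tightening the pruning
// threshold as better hits are found.
fn best_hit<S: PruningScorer>(scorer: &mut S) -> Option<(DocId, Score)> {
    let mut best: Option<(DocId, Score)> = None;
    let mut lower_bound = std::f32::MIN;
    while scorer.advance_with_pruning(lower_bound) {
        let score = scorer.score();
        if best.map_or(true, |(_, best_score)| score > best_score) {
            best = Some((scorer.doc(), score));
            lower_bound = score;
        }
    }
    best
}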
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::common::HasLen;
|
||||
use crate::docset::DocSet;
|
||||
use crate::query::score_combiner::SumCombiner;
|
||||
use crate::query::Union;
|
||||
use crate::query::{BlockMaxScorer, Scorer};
|
||||
use crate::{DocId, Score};
|
||||
use float_cmp::approx_eq;
|
||||
use proptest::strategy::Strategy;
|
||||
use std::cmp::Ordering;
|
||||
use std::num::Wrapping;
|
||||
use crate::collector::{SegmentCollector, TopScoreSegmentCollector};
|
||||
|
||||
/*
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct VecDocSet {
|
||||
postings: Vec<(DocId, Score)>,
|
||||
cursor: Wrapping<usize>,
|
||||
block_max_scores: Vec<(DocId, Score)>,
|
||||
max_score: Score,
|
||||
block_size: usize,
|
||||
}
|
||||
|
||||
impl VecDocSet {
|
||||
fn new(postings: Vec<(DocId, Score)>, block_size: usize) -> VecDocSet {
|
||||
let block_max_scores: Vec<(DocId, f32)> = postings
|
||||
.chunks(block_size)
|
||||
.into_iter()
|
||||
.map(|block| {
|
||||
(
|
||||
block.iter().last().unwrap().0,
|
||||
block
|
||||
.iter()
|
||||
.map(|(_, s)| *s)
|
||||
.fold(-f32::INFINITY, |left, right| left.max(right))
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
let max_score = block_max_scores
|
||||
.iter()
|
||||
.copied()
|
||||
.map(|(_, s)| s)
|
||||
.fold(-f32::INFINITY, |left, right| left.max(right));
|
||||
VecDocSet {
|
||||
postings,
|
||||
cursor: Wrapping(0_usize) - Wrapping(1_usize),
|
||||
block_max_scores,
|
||||
max_score,
|
||||
block_size,
|
||||
}
|
||||
}
|
||||
/// Constructs a new set and advances it.
|
||||
fn started(postings: Vec<(DocId, Score)>, block_size: usize) -> VecDocSet {
|
||||
let mut docset = VecDocSet::new(postings, block_size);
|
||||
docset.advance();
|
||||
docset
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for VecDocSet {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.cursor += Wrapping(1);
|
||||
self.postings.len() > self.cursor.0
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.postings[self.cursor.0].0
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.len() as u32
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for VecDocSet {
|
||||
fn len(&self) -> usize {
|
||||
self.postings.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockMaxScorer for VecDocSet {
|
||||
fn max_score(&self) -> Score {
|
||||
self.max_score
|
||||
}
|
||||
fn block_max_score(&mut self) -> Score {
|
||||
self.block_max_scores[self.cursor.0 / self.block_size].1
|
||||
}
|
||||
fn block_max_doc(&mut self) -> DocId {
|
||||
self.block_max_scores[self.cursor.0 / self.block_size].0
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for VecDocSet {
|
||||
fn score(&mut self) -> Score {
|
||||
self.postings[self.cursor.0].1
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ComparableDoc<T, D> {
|
||||
feature: T,
|
||||
doc: D,
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
|
||||
#[inline]
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
// Reversed to make BinaryHeap work as a min-heap
|
||||
let by_feature = other
|
||||
.feature
|
||||
.partial_cmp(&self.feature)
|
||||
.unwrap_or(Ordering::Equal);
|
||||
|
||||
let lazy_by_doc_address =
|
||||
|| self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
|
||||
|
||||
// In case of a tie on the feature, we sort by ascending
|
||||
// `DocAddress` in order to ensure a stable sorting of the
|
||||
// documents.
|
||||
by_feature.then_with(lazy_by_doc_address)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
|
||||
|
||||
fn union_vs_bmw(posting_lists: Vec<VecDocSet>) {
|
||||
let mut union = Union::<VecDocSet, SumCombiner>::from(posting_lists.clone());
|
||||
let mut top_union = TopScoreSegmentCollector::new(0, 10);
|
||||
while union.advance() {
|
||||
top_union.collect(union.doc(), union.score());
|
||||
}
|
||||
let top_bmw = TopScoreSegmentCollector::new(0, 10 );
|
||||
let mut bmw = BlockMaxWand::new(posting_lists, SumCombiner::default());
|
||||
let top_docs_bnw = top_bmw.collect_scorer(&mut bmw, None);
|
||||
for ((expected_score, expected_doc), (actual_score, actual_doc)) in
|
||||
top_union.harvest().into_iter().zip( top_docs_bnw )
|
||||
{
|
||||
assert!(approx_eq!(
|
||||
f32,
|
||||
expected_score,
|
||||
actual_score,
|
||||
epsilon = 0.0001
|
||||
));
|
||||
assert_eq!(expected_doc, actual_doc);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bmw_0() {
|
||||
union_vs_bmw(vec![
|
||||
VecDocSet {
|
||||
postings: vec![
|
||||
(0, 1.0),
|
||||
(23, 1.0),
|
||||
(28, 1.0),
|
||||
(56, 1.0),
|
||||
(59, 1.0),
|
||||
(66, 1.0),
|
||||
(93, 1.0),
|
||||
],
|
||||
cursor: Wrapping(0_usize) - Wrapping(1_usize),
|
||||
block_max_scores: vec![(93, 1.0)],
|
||||
max_score: 1.0,
|
||||
block_size: 16,
|
||||
},
|
||||
VecDocSet {
|
||||
postings: vec![
|
||||
(2, 1.6549665),
|
||||
(43, 2.6958032),
|
||||
(53, 3.5309567),
|
||||
(71, 2.7688136),
|
||||
(87, 3.4279852),
|
||||
(96, 3.9028034),
|
||||
],
|
||||
cursor: Wrapping(0_usize) - Wrapping(1_usize),
|
||||
block_max_scores: vec![(96, 3.9028034)],
|
||||
max_score: 3.9028034,
|
||||
block_size: 16,
|
||||
},
|
||||
])
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bmw_1() {
|
||||
union_vs_bmw(vec![
|
||||
VecDocSet {
|
||||
postings: vec![(73, 1.0), (82, 1.0)],
|
||||
cursor: Wrapping(0_usize) - Wrapping(1_usize),
            block_max_scores: vec![(82, 1.0)],
            max_score: 1.0,
            block_size: 16,
        },
        VecDocSet {
            postings: vec![
                (21, 3.582513),
                (23, 1.6928024),
                (27, 3.887647),
                (42, 1.5469292),
                (61, 1.7317574),
                (62, 1.2968783),
                (82, 2.4040694),
                (85, 3.1487892),
            ],
            cursor: Wrapping(0_usize) - Wrapping(1_usize),
            block_max_scores: vec![(85, 3.887647)],
            max_score: 3.887647,
            block_size: 16,
        },
    ])
}

proptest::proptest! {
    #[test]
    fn test_union_vs_bmw(postings in proptest::collection::vec(
        proptest::collection::vec(0_u32..100, 1..10)
            .prop_flat_map(|v| {
                let scores = proptest::collection::vec(1_f32..4_f32, v.len()..=v.len());
                scores.prop_map(move |s| {
                    let mut postings: Vec<_> = v.iter().copied().zip(s.iter().copied()).collect();
                    postings.sort_by_key(|p| p.0);
                    postings.dedup_by_key(|p| p.0);
                    VecDocSet::new(postings, 16)
                })
            }),
        2..5)
    ) {
        union_vs_bmw(postings);
    }
}

#[test]
fn test_find_pivot_position() {
    let postings = vec![
        VecDocSet::started(vec![(0, 2.0)], 1),
        VecDocSet::started(vec![(1, 3.0)], 1),
        VecDocSet::started(vec![(2, 4.0)], 1),
        VecDocSet::started(vec![(3, 5.0)], 1),
        VecDocSet::started(vec![(3, 6.0)], 1),
    ];
    assert_eq!(
        find_pivot_position(postings.iter(), 2.0f32),
        Some(Pivot {
            position: 1,
            doc: 1,
            first_occurrence: 1,
        })
    );
    assert_eq!(
        find_pivot_position(postings.iter(), 5.0f32),
        Some(Pivot {
            position: 2,
            doc: 2,
            first_occurrence: 2,
        })
    );
    assert_eq!(
        find_pivot_position(postings.iter(), 9.0f32),
        Some(Pivot {
            position: 4,
            doc: 3,
            first_occurrence: 3,
        })
    );
    assert_eq!(
        find_pivot_position(postings.iter(), 20.0f32),
        None
    );
}

#[test]
fn test_find_next_relevant_doc_before_pivot() {
    let mut postings = vec![
        Box::new(VecDocSet::started(vec![(0, 0.0), (3, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(1, 0.0), (4, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(2, 0.0), (6, 0.0)], 2)), // pivot
        Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
    ];
    let (up_to_pivot, rest) = postings.split_at_mut(2);
    let (pivot, after_pivot) = rest.split_first_mut().unwrap();
    let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
    assert_eq!(next_doc, 4);
}

#[test]
fn test_find_next_relevant_doc_prefix_smaller_than_pivot() {
    let mut postings = vec![
        Box::new(VecDocSet::started(vec![(0, 0.0), (3, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(1, 0.0), (4, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(5, 0.0), (8, 0.0)], 2)), // pivot
        Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)),
    ];
    let (up_to_pivot, rest) = postings.split_at_mut(2);
    let (pivot, after_pivot) = rest.split_first_mut().unwrap();
    let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
    assert_eq!(next_doc, 6);
}

#[test]
fn test_find_next_relevant_doc_after_pivot() {
    let mut postings = vec![
        Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(2, 0.0), (8, 0.0)], 2)), // pivot
        Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
    ];
    let (up_to_pivot, rest) = postings.split_at_mut(2);
    let (pivot, after_pivot) = rest.split_first_mut().unwrap();
    let next_doc = find_next_relevant_doc(up_to_pivot, pivot, Some(&mut after_pivot[0]));
    assert_eq!(next_doc, 5);
}

#[test]
fn test_sift_down_already_sifted() {
    let mut postings = vec![
        Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(2, 0.0), (8, 0.0)], 2)), // pivot
        Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
    ];
    sift_down(&mut postings[2..]);
    assert_eq!(
        postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
        vec![0, 1, 2, 5, 6]
    );
}

#[test]
fn test_sift_down_sift_one_down() {
    let mut postings = vec![
        Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(6, 0.0), (8, 0.0)], 2)), // pivot
        Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(7, 0.0), (7, 0.0)], 2)),
    ];
    sift_down(&mut postings[2..]);
    assert_eq!(
        postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
        vec![0, 1, 5, 6, 7]
    );
}

#[test]
fn test_sift_down_to_bottom() {
    let mut postings = vec![
        Box::new(VecDocSet::started(vec![(0, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(1, 0.0), (8, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(7, 0.0), (8, 0.0)], 2)), // pivot
        Box::new(VecDocSet::started(vec![(5, 0.0), (7, 0.0)], 2)),
        Box::new(VecDocSet::started(vec![(6, 0.0), (7, 0.0)], 2)),
    ];
    sift_down(&mut postings[2..]);
    assert_eq!(
        postings.into_iter().map(|p| p.doc()).collect::<Vec<_>>(),
        vec![0, 1, 5, 6, 7]
    );
}

*/
}

@@ -31,24 +31,11 @@ mod tests {
        // writing the segment
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        {
            let doc = doc!(text_field => "a b c");
            index_writer.add_document(doc);
        }
        {
            let doc = doc!(text_field => "a c");
            index_writer.add_document(doc);
        }
        {
            let doc = doc!(text_field => "b c");
            index_writer.add_document(doc);
        }
        {
            let doc = doc!(text_field => "a b c d");
            index_writer.add_document(doc);
        }
        {
            let doc = doc!(text_field => "d");
            index_writer.add_document(doc);
        index_writer.add_document(doc!(text_field => "a b c"));
        index_writer.add_document(doc!(text_field => "a c"));
        index_writer.add_document(doc!(text_field => "b c"));
        index_writer.add_document(doc!(text_field => "a b c d"));
        index_writer.add_document(doc!(text_field => "d"));
        }
        assert!(index_writer.commit().is_ok());
    }

@@ -1,4 +1,5 @@
use crate::{DocId, TantivyError};
use serde::Serialize;

pub(crate) fn does_not_match(doc: DocId) -> TantivyError {
    TantivyError::InvalidArgument(format!("Document #({}) does not match", doc))

@@ -2,14 +2,40 @@ use crate::query::{AutomatonWeight, Query, Weight};
use crate::schema::Term;
use crate::Searcher;
use crate::TantivyError::InvalidArgument;
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::ops::Range;
use tantivy_fst::Automaton;

pub(crate) struct DFAWrapper(pub DFA);

impl Automaton for DFAWrapper {
    type State = u32;

    fn start(&self) -> Self::State {
        self.0.initial_state()
    }

    fn is_match(&self, state: &Self::State) -> bool {
        match self.0.distance(*state) {
            Distance::Exact(_) => true,
            Distance::AtLeast(_) => false,
        }
    }

    fn can_match(&self, state: &u32) -> bool {
        *state != levenshtein_automata::SINK_STATE
    }

    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
        self.0.transition(*state, byte)
    }
}

/// A range of Levenshtein distances that we will build DFAs for our terms
/// The computation is exponential, so best keep it to low single digits
const VALID_LEVENSHTEIN_DISTANCE_RANGE: Range<u8> = (0..3);
const VALID_LEVENSHTEIN_DISTANCE_RANGE: Range<u8> = 0..3;

static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Lazy::new(|| {
    let mut lev_builder_cache = HashMap::new();
@@ -101,13 +127,20 @@ impl FuzzyTermQuery {
        }
    }

    fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DFA>> {
    fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DFAWrapper>> {
        // LEV_BUILDER is a HashMap, whose `get` method returns an Option
        match LEV_BUILDER.get(&(self.distance, false)) {
            // Unwrap the option and build the Ok(AutomatonWeight)
            Some(automaton_builder) => {
                let automaton = automaton_builder.build_dfa(self.term.text());
                Ok(AutomatonWeight::new(self.term.field(), automaton))
                let automaton = if self.prefix {
                    automaton_builder.build_prefix_dfa(self.term.text())
                } else {
                    automaton_builder.build_dfa(self.term.text())
                };
                Ok(AutomatonWeight::new(
                    self.term.field(),
                    DFAWrapper(automaton),
                ))
            }
            None => Err(InvalidArgument(format!(
                "Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
@@ -166,5 +199,17 @@ mod test {
            let (score, _) = top_docs[0];
            assert_nearly_equals(1f32, score);
        }

        {
            let term = Term::from_field_text(country_field, "jap");

            let fuzzy_query = FuzzyTermQuery::new_prefix(term, 1, true);
            let top_docs = searcher
                .search(&fuzzy_query, &TopDocs::with_limit(2))
                .unwrap();
            assert_eq!(top_docs.len(), 1, "Expected only 1 document");
            let (score, _) = top_docs[0];
            assert_nearly_equals(1f32, score);
        }
    }
}

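
This is the prefix-matching fix for `FuzzyTermQuery`: when the query is built with `new_prefix`, the weight above now builds a prefix DFA instead of a plain one. Below is a minimal, self-contained sketch of the difference, assuming the `levenshtein_automata` builder/eval API used in this diff; the strings "jap" and "japan" are arbitrary examples.

use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder};

fn main() {
    let builder = LevenshteinAutomatonBuilder::new(1, true);

    // Plain DFA: the whole candidate must stay within distance 1 of "jap",
    // so the longer term "japan" is rejected.
    let dfa = builder.build_dfa("jap");
    assert!(matches!(dfa.eval("japan"), Distance::AtLeast(_)));

    // Prefix DFA: only a prefix of the candidate needs to be within distance 1,
    // so "japan" is accepted. This is what FuzzyTermQuery::new_prefix relies on.
    let prefix_dfa = builder.build_prefix_dfa("jap");
    assert!(matches!(prefix_dfa.eval("japan"), Distance::Exact(_)));
}
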
@@ -1,10 +1,10 @@
/*!
Query
*/
/*! Query Module */

mod all_query;
mod automaton_weight;
mod bitset;
mod block_max_scorer;
mod block_max_wand;
mod bm25;
mod boolean_query;
mod boost_query;
@@ -37,11 +37,14 @@ pub use self::vec_docset::VecDocSet;
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
pub use self::automaton_weight::AutomatonWeight;
pub use self::bitset::BitSetDocSet;
pub use self::block_max_scorer::BlockMaxScorer;
pub use self::boolean_query::BooleanQuery;
pub use self::boost_query::BoostQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::explanation::Explanation;
#[cfg(test)]
pub(crate) use self::fuzzy_query::DFAWrapper;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::phrase_query::PhraseQuery;
@@ -56,6 +59,8 @@ pub use self::scorer::Scorer;
pub use self::term_query::TermQuery;
pub use self::weight::Weight;
pub use tantivy_query_grammar::Occur;
pub use self::weight::PruningScorerIfPossible;


#[cfg(test)]
mod tests {

@@ -12,7 +12,6 @@ pub mod tests {
    use super::*;
    use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
    use crate::core::Index;
    use crate::error::TantivyError;
    use crate::schema::{Schema, Term, TEXT};
    use crate::tests::assert_nearly_equals;
    use crate::DocAddress;
@@ -127,21 +126,16 @@ pub mod tests {
            Term::from_field_text(text_field, "a"),
            Term::from_field_text(text_field, "b"),
        ]);
        match searcher

        let search_result = searcher
            .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
            .map(|_| ())
            .unwrap_err()
        {
            TantivyError::SchemaError(ref msg) => {
                assert_eq!(
                    "Applied phrase query on field \"text\", which does not have positions indexed",
                    msg.as_str()
                );
            }
            _ => {
                panic!("Should have returned an error");
            }
        }
            .map(|_| ());
        assert!(matches!(
            search_result,
            Err(crate::TantivyError::SchemaError(msg))
            if msg == "Applied phrase query on field \"text\", which does not have positions \
            indexed"
        ));
    }

    #[test]

@@ -174,6 +174,16 @@ pub struct QueryParser {
    boost: HashMap<Field, f32>,
}

fn all_negative(ast: &LogicalAST) -> bool {
    match ast {
        LogicalAST::Leaf(_) => false,
        LogicalAST::Boost(ref child_ast, _) => all_negative(&*child_ast),
        LogicalAST::Clause(children) => children
            .iter()
            .all(|(ref occur, child)| (*occur == Occur::MustNot) || all_negative(child)),
    }
}

impl QueryParser {
    /// Creates a `QueryParser`, given
    /// * schema - index Schema
@@ -253,8 +263,13 @@ impl QueryParser {
        &self,
        user_input_ast: UserInputAST,
    ) -> Result<LogicalAST, QueryParserError> {
        let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
        if occur == Occur::MustNot {
        let ast = self.compute_logical_ast_with_occur(user_input_ast)?;
        if let LogicalAST::Clause(children) = &ast {
            if children.is_empty() {
                return Ok(ast);
            }
        }
        if all_negative(&ast) {
            return Err(QueryParserError::AllButQueryForbidden);
        }
        Ok(ast)
@@ -410,31 +425,23 @@ impl QueryParser {
    fn compute_logical_ast_with_occur(
        &self,
        user_input_ast: UserInputAST,
    ) -> Result<(Occur, LogicalAST), QueryParserError> {
    ) -> Result<LogicalAST, QueryParserError> {
        match user_input_ast {
            UserInputAST::Clause(sub_queries) => {
                let default_occur = self.default_occur();
                let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new();
                for sub_query in sub_queries {
                    let (occur, sub_ast) = self.compute_logical_ast_with_occur(sub_query)?;
                    let new_occur = Occur::compose(default_occur, occur);
                    logical_sub_queries.push((new_occur, sub_ast));
                for (occur_opt, sub_ast) in sub_queries {
                    let sub_ast = self.compute_logical_ast_with_occur(sub_ast)?;
                    let occur = occur_opt.unwrap_or(default_occur);
                    logical_sub_queries.push((occur, sub_ast));
                }
                Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
            }
            UserInputAST::Unary(left_occur, subquery) => {
                let (right_occur, logical_sub_queries) =
                    self.compute_logical_ast_with_occur(*subquery)?;
                Ok((Occur::compose(left_occur, right_occur), logical_sub_queries))
                Ok(LogicalAST::Clause(logical_sub_queries))
            }
            UserInputAST::Boost(ast, boost) => {
                let (occur, ast_without_occur) = self.compute_logical_ast_with_occur(*ast)?;
                Ok((occur, ast_without_occur.boost(boost)))
            }
            UserInputAST::Leaf(leaf) => {
                let result_ast = self.compute_logical_ast_from_leaf(*leaf)?;
                Ok((Occur::Should, result_ast))
                let ast = self.compute_logical_ast_with_occur(*ast)?;
                Ok(ast.boost(boost))
            }
            UserInputAST::Leaf(leaf) => self.compute_logical_ast_from_leaf(*leaf),
        }
    }
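
The `all_negative` helper above makes the parser reject queries whose clauses are all `MustNot`, regardless of nesting or boosts. A hedged sketch of how that surfaces through the public `QueryParser` API (the `title` field and the query strings are just illustrative):

use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let query_parser = QueryParser::for_index(&index, vec![title]);

    // A query made only of negative clauses can never match anything on its own,
    // so the parser is expected to reject it with AllButQueryForbidden.
    assert!(query_parser.parse_query("-title:toto").is_err());

    // Adding at least one positive clause makes the query acceptable again.
    assert!(query_parser.parse_query("title:tata -title:toto").is_ok());
}
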
@@ -782,6 +789,20 @@ mod test {
        );
    }

    #[test]
    fn test_parse_query_to_ast_ab_c() {
        test_parse_query_to_logical_ast_helper(
            "(+title:a +title:b) title:c",
            "((+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98])) Term(field=0,bytes=[99]))",
            false,
        );
        test_parse_query_to_logical_ast_helper(
            "(+title:a +title:b) title:c",
            "(+(+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98])) +Term(field=0,bytes=[99]))",
            true,
        );
    }

    #[test]
    pub fn test_parse_query_to_ast_single_term() {
        test_parse_query_to_logical_ast_helper(
@@ -801,11 +822,13 @@ mod test {
            Term(field=1,bytes=[116, 105, 116, 105])))",
            false,
        );
        assert_eq!(
            parse_query_to_logical_ast("-title:toto", false)
                .err()
                .unwrap(),
            QueryParserError::AllButQueryForbidden
    }

    #[test]
    fn test_single_negative_term() {
        assert_matches!(
            parse_query_to_logical_ast("-title:toto", false),
            Err(QueryParserError::AllButQueryForbidden)
        );
    }

@@ -965,6 +988,18 @@ mod test {
        assert!(query_parser.parse_query("with_stop_words:the").is_ok());
    }

    #[test]
    pub fn test_parse_query_single_negative_term_through_error() {
        assert_matches!(
            parse_query_to_logical_ast("-title:toto", true),
            Err(QueryParserError::AllButQueryForbidden)
        );
        assert_matches!(
            parse_query_to_logical_ast("-title:toto", false),
            Err(QueryParserError::AllButQueryForbidden)
        );
    }

    #[test]
    pub fn test_parse_query_to_ast_conjunction() {
        test_parse_query_to_logical_ast_helper(
@@ -984,12 +1019,6 @@ mod test {
            Term(field=1,bytes=[116, 105, 116, 105])))",
            true,
        );
        assert_eq!(
            parse_query_to_logical_ast("-title:toto", true)
                .err()
                .unwrap(),
            QueryParserError::AllButQueryForbidden
        );
        test_parse_query_to_logical_ast_helper(
            "title:a b",
            "(+Term(field=0,bytes=[97]) \
@@ -1013,4 +1042,26 @@ mod test {
            false
        );
    }

    #[test]
    fn test_and_default_regardless_of_default_conjunctive() {
        for &default_conjunction in &[false, true] {
            test_parse_query_to_logical_ast_helper(
                "title:a AND title:b",
                "(+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98]))",
                default_conjunction,
            );
        }
    }

    #[test]
    fn test_or_default_conjunctive() {
        for &default_conjunction in &[false, true] {
            test_parse_query_to_logical_ast_helper(
                "title:a OR title:b",
                "(Term(field=0,bytes=[97]) Term(field=0,bytes=[98]))",
                default_conjunction,
            );
        }
    }
}

src/query/term_query/block_max_term_scorer.rs (new file, 98 lines added)
@@ -0,0 +1,98 @@
use crate::docset::{DocSet, SkipResult};
use crate::query::{Explanation, Scorer};
use crate::DocId;
use crate::Score;

use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::postings::{BlockMaxPostings, BlockMaxSegmentPostings};
use crate::query::bm25::BM25Weight;
use crate::query::BlockMaxScorer;

pub struct BlockMaxTermScorer {
    postings: BlockMaxSegmentPostings,
    fieldnorm_reader: FieldNormReader,
    similarity_weight: BM25Weight,
}

impl BlockMaxTermScorer {
    pub fn new(
        postings: BlockMaxSegmentPostings,
        fieldnorm_reader: FieldNormReader,
        similarity_weight: BM25Weight,
    ) -> Self {
        Self {
            postings,
            fieldnorm_reader,
            similarity_weight,
        }
    }
}

impl BlockMaxTermScorer {
    fn _score(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
        self.similarity_weight.score(fieldnorm_id, term_freq)
    }

    pub fn term_freq(&self) -> u32 {
        self.postings.term_freq()
    }

    pub fn fieldnorm_id(&self) -> u8 {
        self.fieldnorm_reader.fieldnorm_id(self.doc())
    }

    pub fn explain(&self) -> Explanation {
        let fieldnorm_id = self.fieldnorm_id();
        let term_freq = self.term_freq();
        self.similarity_weight.explain(fieldnorm_id, term_freq)
    }
}

impl DocSet for BlockMaxTermScorer {
    fn advance(&mut self) -> bool {
        self.postings.advance()
    }

    fn skip_next(&mut self, target: DocId) -> SkipResult {
        self.postings.skip_next(target)
    }

    fn doc(&self) -> DocId {
        self.postings.doc()
    }

    fn size_hint(&self) -> u32 {
        self.postings.size_hint()
    }
}

impl Scorer for BlockMaxTermScorer {
    fn score(&mut self) -> Score {
        self._score(
            self.fieldnorm_reader.fieldnorm_id(self.doc()),
            self.postings.term_freq(),
        )
    }
}

impl BlockMaxScorer for BlockMaxTermScorer {
    fn block_max_score(&mut self) -> Score {
        self._score(
            self.fieldnorm_reader
                .fieldnorm_id(self.postings.block_max_doc()),
            self.postings.term_freq(),
        )
    }

    fn block_max_doc(&mut self) -> DocId {
        self.postings.block_max_doc()
    }

    fn max_score(&self) -> Score {
        self._score(
            self.fieldnorm_reader.fieldnorm_id(self.postings.max_doc()),
            self.postings.max_term_freq(),
        )
    }
}
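
The scorer above exposes three bounds: the exact score of the current document, an upper bound for the current block (`block_max_score`), and a global upper bound (`max_score`). A block-max-WAND driver skips a whole block whenever its bound cannot beat the current top-k threshold. The following stand-alone sketch (a toy stand-in, not tantivy's API) illustrates that pruning decision:

/// Toy stand-in for a posting list that can report, for the block containing
/// its current document, an upper bound on the scores inside that block.
struct FakeBlockMaxList {
    postings: Vec<(u32, f32)>, // (doc id, score), grouped in blocks of `block_size`
    block_size: usize,
    cursor: usize,
}

impl FakeBlockMaxList {
    fn doc(&self) -> u32 {
        self.postings[self.cursor].0
    }

    /// Upper bound on the score of any document in the current block.
    fn block_max_score(&self) -> f32 {
        let block_start = (self.cursor / self.block_size) * self.block_size;
        let block_end = (block_start + self.block_size).min(self.postings.len());
        self.postings[block_start..block_end]
            .iter()
            .map(|&(_, score)| score)
            .fold(0.0f32, f32::max)
    }
}

fn main() {
    let list = FakeBlockMaxList {
        postings: vec![(1, 0.4), (5, 0.9), (42, 2.5), (80, 1.0)],
        block_size: 2,
        cursor: 0,
    };
    // Score of the current worst hit in the top-k results.
    let threshold = 1.0f32;
    // The whole first block {1, 5} is bounded by 0.9 <= threshold, so a
    // block-max-WAND driver may skip past it without scoring either document.
    assert!(list.block_max_score() <= threshold);
    println!("doc {} sits in a block that can be skipped", list.doc());
}
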
@@ -1,7 +1,9 @@
mod block_max_term_scorer;
mod term_query;
mod term_scorer;
mod term_weight;

pub use self::block_max_term_scorer::BlockMaxTermScorer;
pub use self::term_query::TermQuery;
pub use self::term_scorer::TermScorer;
pub use self::term_weight::TermWeight;

@@ -1,7 +1,7 @@
use crate::common::TinySet;
use crate::docset::{DocSet, SkipResult};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::Scorer;
use crate::query::{Scorer, BlockMaxScorer};
use crate::DocId;
use crate::Score;
use std::cmp::Ordering;
@@ -9,6 +9,99 @@ use std::cmp::Ordering;
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;


#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct Pivot {
    position: usize,
    first_occurrence: usize,
    doc: DocId,
}

/// Find the position in the sorted list of posting lists of the **pivot**.
///
/// docsets need to be advanced, and are required to be sorted by the doc they point to.
///
/// The pivot is then defined as the lowest DocId that has a chance of matching our condition.
fn find_pivot_position<'a, TScorer>(
    mut docsets: impl Iterator<Item = &'a TScorer>,
    lower_bound_score: Score,
) -> Option<Pivot>
where TScorer: BlockMaxScorer
{
    let mut position = 0;
    let mut upper_bound = Score::default();
    while let Some(docset) = docsets.next() {
        upper_bound += docset.max_score();
        if lower_bound_score < upper_bound {
            let pivot_doc = docset.doc();
            let first_occurrence = position;
            while let Some(docset) = docsets.next() {
                if docset.doc() != pivot_doc {
                    break;
                } else {
                    position += 1;
                }
            }
            return Some(Pivot {
                position,
                doc: pivot_doc,
                first_occurrence,
            });
        }
        position += 1;
    }
    None
}

/// Sifts down the first element of the slice.
///
/// `docsets[1..]` are assumed sorted.
/// This function swaps `docsets[0]` with its right
/// neighbor successively -bubble sort style- until it reaches the first
/// position such that `docsets` is sorted.
fn sift_down<TScorer>(docsets: &mut [TScorer])
where
    TScorer: BlockMaxScorer + Scorer,
{
    for idx in 1..docsets.len() {
        if docsets[idx].doc() >= docsets[idx - 1].doc() {
            return;
        }
        docsets.swap(idx, idx - 1);
    }
}


/// Given an iterator over all ordered lists up to the pivot (inclusive) and the following list (if
/// exists), it returns the next document ID that can be possibly relevant, based on the block max
/// scores.
fn find_next_relevant_doc<TScorer>(
    docsets_up_to_pivot: &mut [TScorer],
    pivot_docset: &mut TScorer,
    docset_after_pivot: Option<&mut TScorer>,
) -> DocId
where
    TScorer: BlockMaxScorer + Scorer,
{
    let mut next_doc = 1 + docsets_up_to_pivot
        .iter_mut()
        .map(|docset| docset.block_max_doc())
        .chain(std::iter::once(pivot_docset.block_max_doc()))
        .min()
        .unwrap();
    if let Some(docset) = docset_after_pivot {
        let doc = docset.doc();
        if doc < next_doc {
            next_doc = doc;
        }
    }
    if next_doc <= pivot_docset.doc() {
        pivot_docset.doc() + 1
    } else {
        next_doc
    }
}

// `drain_filter` is not stable yet.
// This function is similar except that it does is not unstable, and
// it does not keep the original vector ordering.
@@ -39,6 +132,7 @@ pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
    score: Score,
}


impl<TScorer, TScoreCombiner> From<Vec<TScorer>> for Union<TScorer, TScoreCombiner>
where
    TScoreCombiner: ScoreCombiner,
@@ -126,6 +220,76 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
        }
        false
    }


}


impl<TScorer: BlockMaxScorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
    fn advance_with_pivot(&mut self, pivot: Pivot, lower_bound_score: Score) -> SkipResult {
        let block_upper_bound: Score = self.docsets[..=pivot.position]
            .iter_mut()
            .map(|docset| docset.block_max_score())
            .sum();
        if block_upper_bound > lower_bound_score {
            if pivot.doc == self.docsets[0].doc() {
                // Since self.docsets is sorted by their current doc, in this branch, all
                // docsets in [0..=pivot] are positioned on pivot.doc.
                //
                // Lets compute the actual score for this doc.
                //
                // NOTE(elshize): One additional check needs to be done to improve performance:
                // update block-wise bound while accumulating score with the actual score,
                // and check each time if still above threshold.
                let mut combiner = TScoreCombiner::default();
                for idx in (0..=pivot.position).rev() {
                    combiner.update(&mut self.docsets[idx]);
                    if !self.docsets[idx].advance() {
                        self.docsets.swap_remove(idx);
                    }
                }
                self.score = combiner.score();
                self.doc = pivot.doc;
                self.docsets.sort_by_key(TScorer::doc);
                SkipResult::Reached
            } else {
                // The substraction does not underflow because otherwise we would go to the other
                // branch.
                //
                // `advanced_idx` is the last idx that is not positionned on the pivot yet.
                let advanced_idx = pivot.first_occurrence - 1;
                if !self.docsets[advanced_idx].advance() {
                    self.docsets.swap_remove(advanced_idx);
                }
                if self.docsets.is_empty() {
                    return SkipResult::End;
                }
                sift_down(&mut self.docsets[advanced_idx..]);
                SkipResult::OverStep
            }
        } else {
            let (up_to_pivot, pivot_and_rest) = self.docsets.split_at_mut(pivot.position as usize);
            let (pivot, after_pivot) = pivot_and_rest.split_first_mut().unwrap();
            let next_doc = find_next_relevant_doc(up_to_pivot, pivot, after_pivot.first_mut());
            // NOTE(elshize): It might be more efficient to advance the list with the higher
            // max score, but let's advance the first one for now for simplicity.
            if self.docsets[0].skip_next(next_doc) == SkipResult::End {
                self.docsets.swap_remove(0);
            }
            if self.docsets.is_empty() {
                return SkipResult::End;
            }
            sift_down(&mut self.docsets[..]);
            SkipResult::OverStep
        }
    }

    /// Find the position in the sorted list of posting lists of the **pivot**.
    fn find_pivot_position(&self, lower_bound_score: Score) -> Option<Pivot> {
        find_pivot_position(
            self.docsets.iter().map(|docset| docset),
            lower_bound_score)
    }
}

impl<TScorer, TScoreCombiner> DocSet for Union<TScorer, TScoreCombiner>

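
The pivot rule used by `find_pivot_position` above is the core of the WAND-style skipping: walk the doc-sorted lists, accumulate each list's maximum score, and stop at the first list where the running upper bound exceeds the current threshold. A self-contained sketch of that rule on plain `(current_doc, max_score)` pairs, mirroring the data of the commented-out `test_find_pivot_position` above:

/// Minimal illustration of the pivot rule: lists are sorted by their current
/// doc; the pivot is the first list at which the running sum of per-list
/// maximum scores exceeds the score threshold.
fn pivot_position(lists: &[(u32, f32)], threshold: f32) -> Option<usize> {
    let mut upper_bound = 0.0f32;
    for (position, &(_doc, max_score)) in lists.iter().enumerate() {
        upper_bound += max_score;
        if upper_bound > threshold {
            return Some(position);
        }
    }
    None
}

fn main() {
    // (current doc, per-list max score), sorted by current doc.
    let lists: [(u32, f32); 5] = [(0, 2.0), (1, 3.0), (2, 4.0), (3, 5.0), (3, 6.0)];
    assert_eq!(pivot_position(&lists, 2.0), Some(1)); // 2 + 3 > 2
    assert_eq!(pivot_position(&lists, 9.0), Some(3)); // 2 + 3 + 4 + 5 > 9
    assert_eq!(pivot_position(&lists, 20.0), None); // 2 + 3 + 4 + 5 + 6 = 20 is not enough
}
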
@@ -1,7 +1,26 @@
use super::Scorer;
use crate::core::SegmentReader;
use crate::query::Explanation;
use crate::DocId;
use crate::{DocId, Score};

pub trait PruningScorer {

    fn doc(&self) -> DocId;

    fn score(&self) -> Score;

    /// Advance to the next document that has a score strictly greater than
    /// `lower_bound_score`.
    fn advance_with_pruning(&mut self, score_lower_bound: f32) -> bool;
    fn advance(&mut self) -> bool {
        self.advance_with_pruning(std::f32::NEG_INFINITY)
    }
}

pub enum PruningScorerIfPossible {
    Pruning(Box<dyn PruningScorer>),
    NonPruning(Box<dyn Scorer>)
}

/// A Weight is the specialization of a Query
/// for a given set of segments.
@@ -15,6 +34,11 @@ pub trait Weight: Send + Sync + 'static {
    /// See [`Query`](./trait.Query.html).
    fn scorer(&self, reader: &SegmentReader, boost: f32) -> crate::Result<Box<dyn Scorer>>;

    fn pruning_scorer(&self, reader: &SegmentReader, boost: f32) -> crate::Result<PruningScorerIfPossible> {
        let scorer = self.scorer(reader, boost)?;
        Ok(PruningScorerIfPossible::NonPruning(Box::new(scorer)))
    }

    /// Returns an `Explanation` for the given document.
    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation>;

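
`pruning_scorer` defaults to wrapping the ordinary scorer, so only weights that actually support score-based pruning need to override it. The sketch below shows how a caller might branch on `PruningScorerIfPossible`; it is only an assumption about intended usage based on the trait definitions above, and the lower bound fed to `advance_with_pruning` (the best score seen so far) is a simplification of what a real top-k collector would pass (its k-th best score).

use tantivy::query::{PruningScorerIfPossible, Weight};
use tantivy::{DocId, Score, SegmentReader};

/// Collect (doc, score) pairs from whichever scorer flavor the weight offers.
fn drive_scorer(
    weight: &dyn Weight,
    segment_reader: &SegmentReader,
) -> tantivy::Result<Vec<(DocId, Score)>> {
    let mut hits = Vec::new();
    match weight.pruning_scorer(segment_reader, 1.0)? {
        PruningScorerIfPossible::Pruning(mut scorer) => {
            // A pruning scorer may jump straight past documents that cannot
            // beat the lower bound it is given.
            let mut lower_bound = std::f32::NEG_INFINITY;
            while scorer.advance_with_pruning(lower_bound) {
                hits.push((scorer.doc(), scorer.score()));
                lower_bound = lower_bound.max(scorer.score());
            }
        }
        PruningScorerIfPossible::NonPruning(mut scorer) => {
            // Fallback: exhaustively visit every matching document.
            while scorer.advance() {
                hits.push((scorer.doc(), scorer.score()));
            }
        }
    }
    Ok(hits)
}
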
@@ -9,6 +9,7 @@ use crate::directory::META_LOCK;
use crate::Index;
use crate::Searcher;
use crate::SegmentReader;
use std::convert::TryInto;
use std::sync::Arc;

/// Defines when a new version of the index should be reloaded.
@@ -60,7 +61,6 @@ impl IndexReaderBuilder {
    /// Building the reader is a non-trivial operation that requires
    /// to open different segment readers. It may take hundreds of milliseconds
    /// of time and it may return an error.
    /// TODO(pmasurel) Use the `TryInto` trait once it is available in stable.
    pub fn try_into(self) -> crate::Result<IndexReader> {
        let inner_reader = InnerIndexReader {
            index: self.index,
@@ -113,6 +113,14 @@ impl IndexReaderBuilder {
    }
}

impl TryInto<IndexReader> for IndexReaderBuilder {
    type Error = crate::TantivyError;

    fn try_into(self) -> crate::Result<IndexReader> {
        IndexReaderBuilder::try_into(self)
    }
}

struct InnerIndexReader {
    num_searchers: usize,
    searcher_pool: Pool<Searcher>,

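
With the `TryInto<IndexReader>` impl in place, the builder can be converted through the standard trait as well as through the inherent `try_into` method. A minimal sketch (the index here is a throwaway in-RAM index):

use std::convert::TryInto;
use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, IndexReader};

fn open_reader() -> tantivy::Result<IndexReader> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // Inherent method, unchanged:
    let _reader = index.reader_builder().try_into()?;
    // Standard-library trait, enabled by the new impl above:
    let reader: IndexReader = TryInto::<IndexReader>::try_into(index.reader_builder())?;
    Ok(reader)
}
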
@@ -3,8 +3,8 @@ use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use itertools::Itertools;
use std::io::{self, Read, Write};
use std::mem;

/// Tantivy's Document is the object that can
/// be indexed and then searched for.
@@ -16,7 +16,7 @@ use std::io::{self, Read, Write};

/// Documents are really just a list of couple `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)]
pub struct Document {
    field_values: Vec<FieldValue>,
}
@@ -131,12 +131,34 @@ impl Document {
    pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&FieldValue>)> {
        let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
        field_values.sort_by_key(|field_value| field_value.field());
        field_values
            .into_iter()
            .group_by(|field_value| field_value.field())
            .into_iter()
            .map(|(key, group)| (key, group.collect()))
            .collect::<Vec<(Field, Vec<&FieldValue>)>>()

        let mut grouped_field_values = vec![];

        let mut current_field;
        let mut current_group;

        let mut field_values_it = field_values.into_iter();
        if let Some(field_value) = field_values_it.next() {
            current_field = field_value.field();
            current_group = vec![field_value]
        } else {
            return grouped_field_values;
        }

        for field_value in field_values_it {
            if field_value.field() == current_field {
                current_group.push(field_value);
            } else {
                grouped_field_values.push((
                    current_field,
                    mem::replace(&mut current_group, vec![field_value]),
                ));
                current_field = field_value.field();
            }
        }

        grouped_field_values.push((current_field, current_group));
        grouped_field_values
    }

    /// Returns all of the `FieldValue`s associated the given field

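
The rewrite above drops the `itertools` `group_by` call but keeps the observable contract: values are grouped per field and sorted by field. A small sketch of that contract (field names are arbitrary):

use tantivy::schema::{Document, Schema, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let body = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();

    // One field may appear several times in a document.
    let mut doc = Document::default();
    doc.add_text(title, "first title");
    doc.add_text(body, "some body");
    doc.add_text(title, "second title");

    // Expected: one entry per field, with both "title" values grouped together.
    let grouped = doc.get_sorted_field_values();
    assert_eq!(grouped.len(), 2);
    assert_eq!(grouped[0].0, title);
    assert_eq!(grouped[0].1.len(), 2);
    assert_eq!(grouped[1].0, body);
}
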
@@ -5,7 +5,9 @@ use std::io::Write;

/// `Field` is represented by an unsigned 32-bit integer type
/// The schema holds the mapping between field names and `Field` objects.
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
#[derive(
    Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, serde::Serialize, serde::Deserialize,
)]
pub struct Field(u32);

impl Field {

@@ -1,12 +1,10 @@
use crate::common::BinarySerializable;
use crate::schema::Field;
use crate::schema::Value;
use std::io;
use std::io::Read;
use std::io::Write;
use std::io::{self, Read, Write};

/// `FieldValue` holds together a `Field` and its `Value`.
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, Serialize, Deserialize)]
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, serde::Serialize, serde::Deserialize)]
pub struct FieldValue {
    field: Field,
    value: Value,

@@ -1,3 +1,5 @@
use serde::{Deserialize, Serialize};

/// `IndexRecordOption` describes an amount information associated
/// to a given indexed field.
///

@@ -1,4 +1,5 @@
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
use serde::{Deserialize, Serialize};
use std::ops::BitOr;

/// Express whether a field is single-value or multi-valued.

@@ -1,4 +1,5 @@
use crate::schema::Value;
use serde::Serialize;
use std::collections::BTreeMap;

/// Internal representation of a document used for JSON

@@ -1,6 +1,7 @@
use crate::schema::flags::SchemaFlagList;
use crate::schema::flags::StoredFlag;
use crate::schema::IndexRecordOption;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::ops::BitOr;

@@ -155,30 +156,17 @@ mod tests {

    #[test]
    fn test_field_options() {
        {
            let field_options = STORED | TEXT;
            assert!(field_options.is_stored());
            assert!(field_options.get_indexing_options().is_some());
        }
        {
            let mut schema_builder = Schema::builder();
            schema_builder.add_text_field("body", TEXT);
            let schema = schema_builder.build();
            let field = schema.get_field("body").unwrap();
            let field_entry = schema.get_field_entry(field);
            match field_entry.field_type() {
                &FieldType::Str(ref text_options) => {
                    assert!(text_options.get_indexing_options().is_some());
                    assert_eq!(
                        text_options.get_indexing_options().unwrap().tokenizer(),
                        "default"
                    );
                }
                _ => {
                    panic!("");
                }
            }
        }
        let field_options = STORED | TEXT;
        assert!(field_options.is_stored());
        assert!(field_options.get_indexing_options().is_some());
        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("body", TEXT);
        let schema = schema_builder.build();
        let field = schema.get_field("body").unwrap();
        let field_entry = schema.get_field_entry(field);
        assert!(matches!(field_entry.field_type(),
            &FieldType::Str(ref text_options)
            if text_options.get_indexing_options().unwrap().tokenizer() == "default"));
    }

    #[test]

@@ -11,6 +11,7 @@ under-count actual resultant space usage by up to 4095 bytes per file.

use crate::schema::Field;
use crate::SegmentComponent;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Indicates space usage in bytes

@@ -1,5 +1,3 @@
use snap;

use std::io::{self, Read, Write};

/// Name of the compression scheme used in the doc store.

@@ -434,6 +434,7 @@ mod tests {

    #[test]
    fn test_automaton_search() {
        use crate::query::DFAWrapper;
        use levenshtein_automata::LevenshteinAutomatonBuilder;

        const COUNTRIES: [&'static str; 7] = [
@@ -463,7 +464,7 @@ mod tests {

        // We can now build an entire dfa.
        let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
        let automaton = lev_automaton_builder.build_dfa("Spaen");
        let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen"));

        let mut range = term_dict.search(automaton).into_stream();


@@ -7,7 +7,6 @@ use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use once_cell::sync::Lazy;
use std::io::{self, Write};
use tantivy_fst;
use tantivy_fst::raw::Fst;
use tantivy_fst::Automaton;


@@ -1,6 +1,7 @@
use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};

/// Available stemmer languages.
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]

@@ -1,4 +1,5 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;

/// Struct representing pre-tokenized text

@@ -1,4 +1,5 @@
use crate::tokenizer::TokenStreamChain;
use serde::{Deserialize, Serialize};
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
use std::borrow::{Borrow, BorrowMut};