Compare commits

..

2 Commits

Author              SHA1        Message                         Date
Halvor Fladsrud Bø  22a702c17f  Removed temporary comment       2020-02-04 17:07:06 +01:00
Halvor Fladsrud Bø  14f0c6d01a  Changed watcher to use polling  2020-02-04 17:06:29 +01:00
137 changed files with 3770 additions and 6398 deletions

.gitignore vendored
View File

@@ -1,5 +1,4 @@
 tantivy.iml
-proptest-regressions
 *.swp
 target
 target/debug
@@ -12,4 +11,3 @@ cpp/simdcomp/bitpackingbenchmark
 *.bk
 .idea
 trace.dat
-cargo-timing*

View File

@@ -1,51 +1,9 @@
-Tantivy 0.14.0
-=========================
-- Removed the dependency on atomicwrites #833. Implemented by @pmasurel upon suggestion and research from @asafigan.
-- Migrated the tantivy error type from the now deprecated `failure` crate to `thiserror` #760. (@hirevo)
-- Switched to structured logging (via the `slog` crate). (@pmasurel)
-Tantivy 0.13.1
-===================
-Made `Query` and `Collector` `Send + Sync`.
-Updated misc dependency versions.
-Tantivy 0.13.0
-======================
-Tantivy 0.13 introduces a change in the index format that will require
-you to reindex your index (BlockWAND information is added in the skiplist).
-The index size increase is minor, as this information is only added for
-full blocks.
-If you have a massive index for which reindexing is not an option, please contact me
-so that we can discuss possible solutions.
-- Bugfix in `FuzzyTermQuery`, which did not match terms by prefix when it should have (@Peachball)
-- Relaxed the constraints on the custom/tweak score functions. At the segment level, they can be `mut`, and they are not required to be `Sync + Send`.
-- `MMapDirectory::open` does not return a `Result` anymore.
-- Change in the DocSet and Scorer API. (@fulmicoton)
-  A freshly created DocSet points directly to its first doc. A sentinel value called TERMINATED marks the end of a DocSet.
-  `.advance()` returns the new DocId. `Scorer::skip(target)` has been replaced by `Scorer::seek(target)`, which returns the resulting DocId.
-  As a result, iterating through a DocSet now looks as follows:
-  ```rust
-  let mut doc = docset.doc();
-  while doc != TERMINATED {
-      // ...
-      doc = docset.advance();
-  }
-  ```
-  This change made it possible to greatly simplify a lot of the DocSet code.
-- Misc internal optimizations and introduction of the `Scorer::for_each_pruning` function. (@fulmicoton)
-- Added an offset option to the Top(.*)Collectors. (@robyoung)
-- Added Block WAND. Performance of top-K queries over term unions should be greatly improved. (@fulmicoton, and special thanks
-  to the PISA team for answering all my questions!)
 Tantivy 0.12.0
 ======================
 - Removing static dispatch in tokenizers for simplicity. (#762)
 - Added backward iteration for `TermDictionary` stream. (@halvorboe)
 - Fixed a performance issue when searching for the posting lists of a missing term (@audunhalland)
 - Added a configurable maximum number of docs (10M by default) for a segment to be considered for merge (@hntd187, landed by @halvorboe #713)
-- Important bugfix #777, which caused tantivy to retain memory mappings. (diagnosed by @poljar)
-- Added support for field boosting. (#547, @fulmicoton)
 ## How to update?
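The offset option mentioned in the 0.14.0 notes pairs with the `TopDocs` collector. A minimal sketch, assuming the 0.14.0-dev API on the left-hand side of this compare (the underlying `and_offset` method shows up in the top_collector changes at the bottom of this page); the schema and the indexed documents are illustrative only:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(30_000_000)?;
    for i in 0..30 {
        index_writer.add_document(doc!(title => format!("hello {}", i)));
    }
    index_writer.commit()?;
    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("hello")?;
    // Skip the first 10 hits and keep the next 10, like OFFSET in SQL.
    let hits = searcher.search(&query, &TopDocs::with_limit(10).and_offset(10))?;
    assert_eq!(hits.len(), 10);
    Ok(())
}
```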

View File

@@ -1,11 +1,11 @@
 [package]
 name = "tantivy"
-version = "0.14.0-dev"
+version = "0.12.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
 description = """Search engine library"""
-documentation = "https://docs.rs/tantivy/"
+documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
 homepage = "https://github.com/tantivy-search/tantivy"
 repository = "https://github.com/tantivy-search/tantivy"
 readme = "README.md"
@@ -13,42 +13,44 @@ keywords = ["search", "information", "retrieval"]
 edition = "2018"
 [dependencies]
-base64 = "0.12"
+base64 = "0.11.0"
-byteorder = "1"
+byteorder = "1.0"
-crc32fast = "1"
+crc32fast = "1.2.0"
-once_cell = "1"
+once_cell = "1.0"
-regex ={version = "1", default-features = false, features = ["std"]}
+regex ={version = "1.3.0", default-features = false, features = ["std"]}
-tantivy-fst = "0.3"
+tantivy-fst = "0.2.1"
 memmap = {version = "0.7", optional=true}
-lz4 = {version="1", optional=true}
+lz4 = {version="1.20", optional=true}
-snap = "1"
+snap = {version="0.2"}
-tempfile = {version="3", optional=true}
-slog = "2.5"
-slog-stdlog = "4"
-serde = {version="1", features=["derive"]}
-serde_json = "1"
-num_cpus = "1"
+atomicwrites = {version="0.2.2", optional=true}
+tempfile = "3.0"
+log = "0.4"
+serde = "1.0"
+serde_derive = "1.0"
+serde_json = "1.0"
+num_cpus = "1.2"
 fs2={version="0.4", optional=true}
-levenshtein_automata = "0.2"
+itertools = "0.8"
+levenshtein_automata = {version="0.1", features=["fst_automaton"]}
 notify = {version="4", optional=true}
 uuid = { version = "0.8", features = ["v4", "serde"] }
 crossbeam = "0.7"
 futures = {version = "0.3", features=["thread-pool"] }
 owning_ref = "0.4"
-tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
-stable_deref_trait = "1"
-rust-stemmers = "1"
-downcast-rs = "1"
+stable_deref_trait = "1.0.0"
+rust-stemmers = "1.2"
+downcast-rs = { version="1.0" }
+tantivy-query-grammar = { version="0.11", path="./query-grammar" }
 bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
 census = "0.4"
-fnv = "1"
+fnv = "1.0.6"
 owned-read = "0.4"
-thiserror = "1.0"
+failure = "0.1"
-htmlescape = "0.3"
+htmlescape = "0.3.1"
-fail = "0.4"
+fail = "0.3"
 murmurhash32 = "0.2"
 chrono = "0.4"
-smallvec = "1"
+smallvec = "1.0"
 rayon = "1"
 [target.'cfg(windows)'.dependencies]
@@ -58,10 +60,9 @@ winapi = "0.3"
 rand = "0.7"
 maplit = "1"
 matches = "0.1.8"
-proptest = "0.10"
 [dev-dependencies.fail]
-version = "0.4"
+version = "0.3"
 features = ["failpoints"]
 [profile.release]
@@ -75,7 +76,7 @@ overflow-checks = true
 [features]
 default = ["mmap"]
-mmap = ["fs2", "tempfile", "memmap", "notify"]
+mmap = ["atomicwrites", "fs2", "memmap", "notify"]
 lz4-compression = ["lz4"]
 failpoints = ["fail/failpoints"]
 unstable = [] # useful for benches.

View File

@@ -31,15 +31,16 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
 # Benchmark
-Tantivy is typically faster than Lucene, but the results depend on
-the nature of the queries in your workload.
 The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
 performance for different types of queries / collections.
+Your mileage WILL vary depending on the nature of queries and their load.
 # Features
 - Full-text search
-- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy) and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
+- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter))
 - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
 - Tiny startup time (<10ms), perfect for command line tools
 - BM25 scoring (the same as Lucene)
@@ -58,17 +59,18 @@ Your mileage WILL vary depending on the nature of queries and their load.
 - Configurable indexing (optional term frequency and position indexing)
 - Cheesy logo with a horse
-## Non-features
+# Non-features
 - Distributed search is out of the scope of Tantivy. That being said, Tantivy is a
   library upon which one could build a distributed search engine. Serializable/mergeable collector state, for instance,
   is within the scope of Tantivy.
-# Supported OS and compiler
+# Getting started
 Tantivy works on stable Rust (>= 1.27) and supports Linux, MacOS, and Windows.
-# Getting started
 - [Tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
 - [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli) - `tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
   index documents, and search via the CLI or a small server with a REST API.

View File

@@ -18,5 +18,5 @@ install:
 build: false
 test_script:
-- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features mmap
+- REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap
 - REM SET RUST_BACKTRACE=1 & cargo build --examples

View File

@@ -112,6 +112,18 @@ fn main() -> tantivy::Result<()> {
     limbs and branches that arch over the pool"
     ));
+    index_writer.add_document(doc!(
+    title => "Of Mice and Men",
+    body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
+    bank and runs deep and green. The water is warm too, for it has slipped twinkling \
+    over the yellow sands in the sunlight before reaching the narrow pool. On one \
+    side of the river the golden foothill slopes curve up to the strong and rocky \
+    Gabilan Mountains, but on the valley side the water is lined with trees—willows \
+    fresh and green with every spring, carrying in their lower leaf junctures the \
+    debris of the winters flooding; and sycamores with mottled, white, recumbent \
+    limbs and branches that arch over the pool"
+    ));
     // A multivalued field just needs to be repeated.
     index_writer.add_document(doc!(
     title => "Frankenstein",

View File

@@ -14,7 +14,7 @@ use tantivy::fastfield::FastFieldReader;
 use tantivy::query::QueryParser;
 use tantivy::schema::Field;
 use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
-use tantivy::{doc, Index, Score, SegmentReader, TantivyError};
+use tantivy::{doc, Index, SegmentReader, TantivyError};
 #[derive(Default)]
 struct Stats {
@@ -114,7 +114,7 @@ struct StatsSegmentCollector {
 impl SegmentCollector for StatsSegmentCollector {
     type Fruit = Option<Stats>;
-    fn collect(&mut self, doc: u32, _score: Score) {
+    fn collect(&mut self, doc: u32, _score: f32) {
         let value = self.fast_field_reader.get(doc) as f64;
         self.stats.count += 1;
         self.stats.sum += value;

View File

@@ -1,98 +0,0 @@
-use std::collections::HashSet;
-use tantivy::collector::TopDocs;
-use tantivy::doc;
-use tantivy::query::BooleanQuery;
-use tantivy::schema::*;
-use tantivy::{DocId, Index, Score, SegmentReader};
-fn main() -> tantivy::Result<()> {
-    let mut schema_builder = Schema::builder();
-    let title = schema_builder.add_text_field("title", STORED);
-    let ingredient = schema_builder.add_facet_field("ingredient");
-    let schema = schema_builder.build();
-    let index = Index::create_in_ram(schema.clone());
-    let mut index_writer = index.writer(30_000_000)?;
-    index_writer.add_document(doc!(
-        title => "Fried egg",
-        ingredient => Facet::from("/ingredient/egg"),
-        ingredient => Facet::from("/ingredient/oil"),
-    ));
-    index_writer.add_document(doc!(
-        title => "Scrambled egg",
-        ingredient => Facet::from("/ingredient/egg"),
-        ingredient => Facet::from("/ingredient/butter"),
-        ingredient => Facet::from("/ingredient/milk"),
-        ingredient => Facet::from("/ingredient/salt"),
-    ));
-    index_writer.add_document(doc!(
-        title => "Egg rolls",
-        ingredient => Facet::from("/ingredient/egg"),
-        ingredient => Facet::from("/ingredient/garlic"),
-        ingredient => Facet::from("/ingredient/salt"),
-        ingredient => Facet::from("/ingredient/oil"),
-        ingredient => Facet::from("/ingredient/tortilla-wrap"),
-        ingredient => Facet::from("/ingredient/mushroom"),
-    ));
-    index_writer.commit()?;
-    let reader = index.reader()?;
-    let searcher = reader.searcher();
-    {
-        let facets = vec![
-            Facet::from("/ingredient/egg"),
-            Facet::from("/ingredient/oil"),
-            Facet::from("/ingredient/garlic"),
-            Facet::from("/ingredient/mushroom"),
-        ];
-        let query = BooleanQuery::new_multiterms_query(
-            facets
-                .iter()
-                .map(|key| Term::from_facet(ingredient, &key))
-                .collect(),
-        );
-        let top_docs_by_custom_score =
-            TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
-                let mut ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
-                let facet_dict = ingredient_reader.facet_dict();
-                let query_ords: HashSet<u64> = facets
-                    .iter()
-                    .filter_map(|key| facet_dict.term_ord(key.encoded_str()))
-                    .collect();
-                let mut facet_ords_buffer: Vec<u64> = Vec::with_capacity(20);
-                move |doc: DocId, original_score: Score| {
-                    ingredient_reader.facet_ords(doc, &mut facet_ords_buffer);
-                    let missing_ingredients = facet_ords_buffer
-                        .iter()
-                        .filter(|ord| !query_ords.contains(ord))
-                        .count();
-                    let tweak = 1.0 / 4_f32.powi(missing_ingredients as i32);
-                    original_score * tweak
-                }
-            });
-        let top_docs = searcher.search(&query, &top_docs_by_custom_score)?;
-        let titles: Vec<String> = top_docs
-            .iter()
-            .map(|(_, doc_id)| {
-                searcher
-                    .doc(*doc_id)
-                    .unwrap()
-                    .get_first(title)
-                    .unwrap()
-                    .text()
-                    .unwrap()
-                    .to_owned()
-            })
-            .collect();
-        assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);
-    }
-    Ok(())
-}

View File

@@ -10,7 +10,7 @@
 // ---
 // Importing tantivy...
 use tantivy::schema::*;
-use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
+use tantivy::{doc, DocId, DocSet, Index, Postings};
 fn main() -> tantivy::Result<()> {
     // We first create a schema for the sake of the
@@ -62,11 +62,12 @@ fn main() -> tantivy::Result<()> {
     {
         // this buffer will be used to request for positions
         let mut positions: Vec<u32> = Vec::with_capacity(100);
-        let mut doc_id = segment_postings.doc();
-        while doc_id != TERMINATED {
+        while segment_postings.advance() {
+            // the number of times the term appears in the document.
+            let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once.
             // This MAY contain deleted documents as well.
             if segment_reader.is_deleted(doc_id) {
-                doc_id = segment_postings.advance();
                 continue;
             }
@@ -85,7 +86,6 @@ fn main() -> tantivy::Result<()> {
             // Doc 2: TermFreq 1: [0]
             // ```
             println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
-            doc_id = segment_postings.advance();
         }
     }
 }
@@ -117,16 +117,11 @@ fn main() -> tantivy::Result<()> {
     if let Some(mut block_segment_postings) =
         inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
     {
-        loop {
-            let docs = block_segment_postings.docs();
-            if docs.is_empty() {
-                break;
-            }
+        while block_segment_postings.advance() {
             // Once again these docs MAY contain deleted documents as well.
             let docs = block_segment_postings.docs();
             // Prints `Docs [0, 2].`
             println!("Docs {:?}", docs);
-            block_segment_postings.advance();
         }
     }
 }
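Beyond `advance`, the 0.14-dev `DocSet` on the left-hand side of this hunk also exposes `seek` for targeted lookups, as the changelog earlier in this compare describes. A small sketch under that assumption; the helper function is illustrative and not part of the example file:

```rust
use tantivy::{DocSet, TERMINATED};

// Position the DocSet on the first doc >= target.
// `seek` returns TERMINATED once the posting list is exhausted.
fn first_doc_at_or_after<D: DocSet>(docset: &mut D, target: u32) -> Option<u32> {
    let doc = docset.seek(target);
    if doc == TERMINATED {
        None
    } else {
        Some(doc)
    }
}
```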

View File

@@ -9,10 +9,11 @@
 // - import tokenized text straight from json,
 // - perform a search on documents with pre-tokenized text
+use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};
 use tantivy::collector::{Count, TopDocs};
 use tantivy::query::TermQuery;
 use tantivy::schema::*;
-use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};
 use tantivy::{doc, Index, ReloadPolicy};
 use tempfile::TempDir;

View File

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-query-grammar"
-version = "0.14.0-dev"
+version = "0.11.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -13,4 +13,4 @@ keywords = ["search", "information", "retrieval"]
 edition = "2018"
 [dependencies]
-combine = {version="4", default-features=false, features=[] }
+combine = ">=3.6.0,<4.0.0"

View File

@@ -1,3 +1,5 @@
+#![recursion_limit = "100"]
 mod occur;
 mod query_grammar;
 mod user_input_ast;

View File

@@ -31,12 +31,22 @@ impl Occur {
     /// Compose two occur values.
     pub fn compose(left: Occur, right: Occur) -> Occur {
-        match (left, right) {
-            (Occur::Should, _) => right,
-            (Occur::Must, Occur::MustNot) => Occur::MustNot,
-            (Occur::Must, _) => Occur::Must,
-            (Occur::MustNot, Occur::MustNot) => Occur::Must,
-            (Occur::MustNot, _) => Occur::MustNot,
+        match left {
+            Occur::Should => right,
+            Occur::Must => {
+                if right == Occur::MustNot {
+                    Occur::MustNot
+                } else {
+                    Occur::Must
+                }
+            }
+            Occur::MustNot => {
+                if right == Occur::MustNot {
+                    Occur::Must
+                } else {
+                    Occur::MustNot
+                }
+            }
         }
     }
 }
@@ -46,27 +56,3 @@ impl fmt::Display for Occur {
         f.write_char(self.to_char())
     }
 }
-#[cfg(test)]
-mod test {
-    use crate::Occur;
-    #[test]
-    fn test_occur_compose() {
-        assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should);
-        assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
-        assert_eq!(
-            Occur::compose(Occur::Should, Occur::MustNot),
-            Occur::MustNot
-        );
-        assert_eq!(Occur::compose(Occur::Must, Occur::Should), Occur::Must);
-        assert_eq!(Occur::compose(Occur::Must, Occur::Must), Occur::Must);
-        assert_eq!(Occur::compose(Occur::Must, Occur::MustNot), Occur::MustNot);
-        assert_eq!(
-            Occur::compose(Occur::MustNot, Occur::Should),
-            Occur::MustNot
-        );
-        assert_eq!(Occur::compose(Occur::MustNot, Occur::Must), Occur::MustNot);
-        assert_eq!(Occur::compose(Occur::MustNot, Occur::MustNot), Occur::Must);
-    }
-}
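Both versions of `compose` implement the same rule: `Should` is the neutral element and `MustNot` behaves like a negative sign, so two `MustNot` values cancel to `Must`. A hedged check of that reading, assuming `Occur` is re-exported at the crate root as the deleted test's `use crate::Occur` suggests:

```rust
use tantivy_query_grammar::Occur;

fn main() {
    // Should is neutral: composing with it returns the other operand.
    assert_eq!(Occur::compose(Occur::Should, Occur::MustNot), Occur::MustNot);
    // A single MustNot negates a Must...
    assert_eq!(Occur::compose(Occur::Must, Occur::MustNot), Occur::MustNot);
    // ...and two MustNot values cancel out, like multiplying two minus signs.
    assert_eq!(Occur::compose(Occur::MustNot, Occur::MustNot), Occur::Must);
}
```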

View File

@@ -1,211 +1,171 @@
-use super::user_input_ast::{UserInputAST, UserInputBound, UserInputLeaf, UserInputLiteral};
+use super::user_input_ast::*;
 use crate::Occur;
-use combine::error::StringStreamError;
-use combine::parser::char::{char, digit, letter, space, spaces, string};
-use combine::parser::Parser;
-use combine::{
-    attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
-};
+use combine::char::*;
+use combine::error::StreamError;
+use combine::stream::StreamErrorFor;
+use combine::*;
-fn field<'a>() -> impl Parser<&'a str, Output = String> {
-    (
-        (letter().or(char('_'))),
-        many(satisfy(|c: char| {
-            c.is_alphanumeric() || c == '_' || c == '-'
-        })),
-    )
-        .skip(char(':'))
-        .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
-}
-fn word<'a>() -> impl Parser<&'a str, Output = String> {
-    (
-        satisfy(|c: char| {
-            !c.is_whitespace()
-                && !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
-        }),
-        many(satisfy(|c: char| {
-            !c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
-        })),
-    )
-        .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
-        .and_then(|s: String| match s.as_str() {
-            "OR" | "AND" | "NOT" => Err(StringStreamError::UnexpectedParse),
-            _ => Ok(s),
-        })
-}
-fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
-    let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
-    phrase.or(word())
-}
-fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
-    let term_val_with_field = negative_number().or(term_val());
-    (field(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
-        field_name: Some(field_name),
-        phrase,
-    })
-}
-fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
-    let term_default_field = term_val().map(|phrase| UserInputLiteral {
-        field_name: None,
-        phrase,
-    });
-    attempt(term_query())
-        .or(term_default_field)
-        .map(UserInputLeaf::from)
-}
-fn negative_number<'a>() -> impl Parser<&'a str, Output = String> {
-    (
-        char('-'),
-        many1(digit()),
-        optional((char('.'), many1(digit()))),
-    )
-        .map(|(s1, s2, s3): (char, String, Option<(char, String)>)| {
-            if let Some(('.', s3)) = s3 {
-                format!("{}{}.{}", s1, s2, s3)
-            } else {
-                format!("{}{}", s1, s2)
-            }
-        })
-}
-fn spaces1<'a>() -> impl Parser<&'a str, Output = ()> {
-    skip_many1(space())
-}
-/// Function that parses a range out of a Stream
-/// Supports ranges like:
-/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
-/// [a TO *], [a TO c], [abc TO bcd}
-fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
-    let range_term_val = || {
-        word()
-            .or(negative_number())
-            .or(char('*').with(value("*".to_string())))
-    };
-    // check for unbounded range in the form of <5, <=10, >5, >=5
-    let elastic_unbounded_range = (
-        choice([
-            attempt(string(">=")),
-            attempt(string("<=")),
-            attempt(string("<")),
-            attempt(string(">")),
-        ])
-        .skip(spaces()),
-        range_term_val(),
-    )
-        .map(
-            |(comparison_sign, bound): (&str, String)| match comparison_sign {
-                ">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
-                "<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)),
-                "<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)),
-                ">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded),
-                // default case
-                _ => (UserInputBound::Unbounded, UserInputBound::Unbounded),
-            },
-        );
-    let lower_bound = (one_of("{[".chars()), range_term_val()).map(
-        |(boundary_char, lower_bound): (char, String)| {
-            if lower_bound == "*" {
-                UserInputBound::Unbounded
-            } else if boundary_char == '{' {
-                UserInputBound::Exclusive(lower_bound)
-            } else {
-                UserInputBound::Inclusive(lower_bound)
-            }
-        },
-    );
-    let upper_bound = (range_term_val(), one_of("}]".chars())).map(
-        |(higher_bound, boundary_char): (String, char)| {
-            if higher_bound == "*" {
-                UserInputBound::Unbounded
-            } else if boundary_char == '}' {
-                UserInputBound::Exclusive(higher_bound)
-            } else {
-                UserInputBound::Inclusive(higher_bound)
-            }
-        },
-    );
-    // return only lower and upper
-    let lower_to_upper = (
-        lower_bound.skip((spaces(), string("TO"), spaces())),
-        upper_bound,
-    );
-    (
-        optional(field()).skip(spaces()),
-        // try elastic first, if it matches, the range is unbounded
-        attempt(elastic_unbounded_range).or(lower_to_upper),
-    )
-        .map(|(field, (lower, upper))|
-            // Construct the leaf from extracted field (optional)
-            // and bounds
-            UserInputLeaf::Range {
-                field,
-                lower,
-                upper
-            })
-}
+parser! {
+    fn field[I]()(I) -> String
+    where [I: Stream<Item = char>] {
+        (
+            letter(),
+            many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
+        ).skip(char(':')).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
+    }
+}
+parser! {
+    fn word[I]()(I) -> String
+    where [I: Stream<Item = char>] {
+        (
+            satisfy(|c: char| !c.is_whitespace() && !['-', '`', ':', '{', '}', '"', '[', ']', '(',')'].contains(&c) ),
+            many(satisfy(|c: char| !c.is_whitespace() && ![':', '{', '}', '"', '[', ']', '(',')'].contains(&c)))
+        )
+            .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
+            .and_then(|s: String|
+                match s.as_str() {
+                    "OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
+                    "AND" => Err(StreamErrorFor::<I>::unexpected_static_message("AND")),
+                    "NOT" => Err(StreamErrorFor::<I>::unexpected_static_message("NOT")),
+                    _ => Ok(s)
+                })
+    }
+}
+parser! {
+    fn literal[I]()(I) -> UserInputLeaf
+    where [I: Stream<Item = char>]
+    {
+        let term_val = || {
+            let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
+            phrase.or(word())
+        };
+        let term_val_with_field = negative_number().or(term_val());
+        let term_query =
+            (field(), term_val_with_field)
+                .map(|(field_name, phrase)| UserInputLiteral {
+                    field_name: Some(field_name),
+                    phrase,
+                });
+        let term_default_field = term_val().map(|phrase| UserInputLiteral {
+            field_name: None,
+            phrase,
+        });
+        attempt(term_query)
+            .or(term_default_field)
+            .map(UserInputLeaf::from)
+    }
+}
+parser! {
+    fn negative_number[I]()(I) -> String
+    where [I: Stream<Item = char>]
+    {
+        (char('-'), many1(satisfy(char::is_numeric)),
+         optional((char('.'), many1(satisfy(char::is_numeric)))))
+            .map(|(s1, s2, s3): (char, String, Option<(char, String)>)| {
+                if let Some(('.', s3)) = s3 {
+                    format!("{}{}.{}", s1, s2, s3)
+                } else {
+                    format!("{}{}", s1, s2)
+                }
+            })
+    }
+}
+parser! {
+    fn spaces1[I]()(I) -> ()
+    where [I: Stream<Item = char>] {
+        skip_many1(space())
+    }
+}
+parser! {
+    /// Function that parses a range out of a Stream
+    /// Supports ranges like:
+    /// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
+    /// [a TO *], [a TO c], [abc TO bcd}
+    fn range[I]()(I) -> UserInputLeaf
+    where [I: Stream<Item = char>] {
+        let range_term_val = || {
+            word().or(negative_number()).or(char('*').with(value("*".to_string())))
+        };
+        // check for unbounded range in the form of <5, <=10, >5, >=5
+        let elastic_unbounded_range = (choice([attempt(string(">=")),
+            attempt(string("<=")),
+            attempt(string("<")),
+            attempt(string(">"))])
+            .skip(spaces()),
+            range_term_val()).
+            map(|(comparison_sign, bound): (&str, String)|
+                match comparison_sign {
+                    ">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
+                    "<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)),
+                    "<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)),
+                    ">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded),
+                    // default case
+                    _ => (UserInputBound::Unbounded, UserInputBound::Unbounded)
+                });
+        let lower_bound = (one_of("{[".chars()), range_term_val())
+            .map(|(boundary_char, lower_bound): (char, String)|
+                if lower_bound == "*" {
+                    UserInputBound::Unbounded
+                } else if boundary_char == '{' {
+                    UserInputBound::Exclusive(lower_bound)
+                } else {
+                    UserInputBound::Inclusive(lower_bound)
+                });
+        let upper_bound = (range_term_val(), one_of("}]".chars()))
+            .map(|(higher_bound, boundary_char): (String, char)|
+                if higher_bound == "*" {
+                    UserInputBound::Unbounded
+                } else if boundary_char == '}' {
+                    UserInputBound::Exclusive(higher_bound)
+                } else {
+                    UserInputBound::Inclusive(higher_bound)
+                });
+        // return only lower and upper
+        let lower_to_upper = (lower_bound.
+            skip((spaces(),
+                string("TO"),
+                spaces())),
+            upper_bound);
+        (optional(field()).skip(spaces()),
+            // try elastic first, if it matches, the range is unbounded
+            attempt(elastic_unbounded_range).or(lower_to_upper))
+            .map(|(field, (lower, upper))|
+                // Construct the leaf from extracted field (optional)
+                // and bounds
+                UserInputLeaf::Range {
+                    field,
+                    lower,
+                    upper
+                })
+    }
+}
 fn negate(expr: UserInputAST) -> UserInputAST {
     expr.unary(Occur::MustNot)
 }
-fn leaf<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
-    parser(|input| {
-        char('(')
-            .with(ast())
-            .skip(char(')'))
-            .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All)))
-            .or(attempt(
-                string("NOT").skip(spaces1()).with(leaf()).map(negate),
-            ))
-            .or(attempt(range().map(UserInputAST::from)))
-            .or(literal().map(UserInputAST::from))
-            .parse_stream(input)
-            .into_result()
-    })
-}
-fn occur_symbol<'a>() -> impl Parser<&'a str, Output = Occur> {
-    char('-')
-        .map(|_| Occur::MustNot)
-        .or(char('+').map(|_| Occur::Must))
-}
-fn occur_leaf<'a>() -> impl Parser<&'a str, Output = (Option<Occur>, UserInputAST)> {
-    (optional(occur_symbol()), boosted_leaf())
-}
-fn positive_float_number<'a>() -> impl Parser<&'a str, Output = f64> {
-    (many1(digit()), optional((char('.'), many1(digit())))).map(
-        |(int_part, decimal_part_opt): (String, Option<(char, String)>)| {
-            let mut float_str = int_part;
-            if let Some((chr, decimal_str)) = decimal_part_opt {
-                float_str.push(chr);
-                float_str.push_str(&decimal_str);
-            }
-            float_str.parse::<f64>().unwrap()
-        },
-    )
-}
-fn boost<'a>() -> impl Parser<&'a str, Output = f64> {
-    (char('^'), positive_float_number()).map(|(_, boost)| boost)
-}
-fn boosted_leaf<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
-    (leaf(), optional(boost())).map(|(leaf, boost_opt)| match boost_opt {
-        Some(boost) if (boost - 1.0).abs() > std::f64::EPSILON => {
-            UserInputAST::Boost(Box::new(leaf), boost)
-        }
-        _ => leaf,
-    })
-}
+fn must(expr: UserInputAST) -> UserInputAST {
+    expr.unary(Occur::Must)
+}
+parser! {
+    fn leaf[I]()(I) -> UserInputAST
+    where [I: Stream<Item = char>] {
+        char('-').with(leaf()).map(negate)
+            .or(char('+').with(leaf()).map(must))
+            .or(char('(').with(ast()).skip(char(')')))
+            .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All)))
+            .or(attempt(string("NOT").skip(spaces1()).with(leaf()).map(negate)))
+            .or(attempt(range().map(UserInputAST::from)))
+            .or(literal().map(UserInputAST::from))
+    }
+}
 #[derive(Clone, Copy)]
@@ -214,10 +174,13 @@ enum BinaryOperand {
     And,
 }
-fn binary_operand<'a>() -> impl Parser<&'a str, Output = BinaryOperand> {
-    string("AND")
-        .with(value(BinaryOperand::And))
-        .or(string("OR").with(value(BinaryOperand::Or)))
+parser! {
+    fn binary_operand[I]()(I) -> BinaryOperand
+    where [I: Stream<Item = char>]
+    {
+        string("AND").with(value(BinaryOperand::And))
+            .or(string("OR").with(value(BinaryOperand::Or)))
+    }
 }
 fn aggregate_binary_expressions(
@@ -245,84 +208,37 @@ fn aggregate_binary_expressions(
     }
 }
-fn operand_leaf<'a>() -> impl Parser<&'a str, Output = (BinaryOperand, UserInputAST)> {
-    (
-        binary_operand().skip(spaces()),
-        boosted_leaf().skip(spaces()),
-    )
-}
-pub fn ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
-    let boolean_expr = (boosted_leaf().skip(spaces()), many1(operand_leaf()))
-        .map(|(left, right)| aggregate_binary_expressions(left, right));
-    let whitespace_separated_leaves = many1(occur_leaf().skip(spaces().silent())).map(
-        |subqueries: Vec<(Option<Occur>, UserInputAST)>| {
-            if subqueries.len() == 1 {
-                let (occur_opt, ast) = subqueries.into_iter().next().unwrap();
-                match occur_opt.unwrap_or(Occur::Should) {
-                    Occur::Must | Occur::Should => ast,
-                    Occur::MustNot => UserInputAST::Clause(vec![(Some(Occur::MustNot), ast)]),
-                }
-            } else {
-                UserInputAST::Clause(subqueries.into_iter().collect())
-            }
-        },
-    );
-    let expr = attempt(boolean_expr).or(whitespace_separated_leaves);
-    spaces().with(expr).skip(spaces())
-}
-pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
-    spaces()
-        .with(optional(ast()).skip(eof()))
-        .map(|opt_ast| opt_ast.unwrap_or_else(UserInputAST::empty_query))
-}
+parser! {
+    pub fn ast[I]()(I) -> UserInputAST
+    where [I: Stream<Item = char>]
+    {
+        let operand_leaf = (binary_operand().skip(spaces()), leaf().skip(spaces()));
+        let boolean_expr = (leaf().skip(spaces().silent()), many1(operand_leaf)).map(
+            |(left, right)| aggregate_binary_expressions(left, right));
+        let whitespace_separated_leaves = many1(leaf().skip(spaces().silent()))
+            .map(|subqueries: Vec<UserInputAST>|
+                if subqueries.len() == 1 {
+                    subqueries.into_iter().next().unwrap()
+                } else {
+                    UserInputAST::Clause(subqueries.into_iter().collect())
+                });
+        let expr = attempt(boolean_expr).or(whitespace_separated_leaves);
+        spaces().with(expr).skip(spaces())
+    }
+}
+parser! {
+    pub fn parse_to_ast[I]()(I) -> UserInputAST
+    where [I: Stream<Item = char>]
+    {
+        spaces().with(optional(ast()).skip(eof())).map(|opt_ast| opt_ast.unwrap_or_else(UserInputAST::empty_query))
+    }
+}
 #[cfg(test)]
 mod test {
-    type TestParseResult = Result<(), StringStreamError>;
     use super::*;
-    use combine::parser::Parser;
-    pub fn nearly_equals(a: f64, b: f64) -> bool {
-        (a - b).abs() < 0.0005 * (a + b).abs()
-    }
-    fn assert_nearly_equals(expected: f64, val: f64) {
-        assert!(
-            nearly_equals(val, expected),
-            "Got {}, expected {}.",
-            val,
-            expected
-        );
-    }
-    #[test]
-    fn test_occur_symbol() -> TestParseResult {
-        assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, ""));
-        assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, ""));
-        Ok(())
-    }
-    #[test]
-    fn test_positive_float_number() {
-        fn valid_parse(float_str: &str, expected_val: f64, expected_remaining: &str) {
-            let (val, remaining) = positive_float_number().parse(float_str).unwrap();
-            assert_eq!(remaining, expected_remaining);
-            assert_nearly_equals(val, expected_val);
-        }
-        fn error_parse(float_str: &str) {
-            assert!(positive_float_number().parse(float_str).is_err());
-        }
-        valid_parse("1.0", 1.0, "");
-        valid_parse("1", 1.0, "");
-        valid_parse("0.234234 aaa", 0.234234f64, " aaa");
-        error_parse(".3332");
-        error_parse("1.");
-        error_parse("-1.");
-    }
     fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
         let query = parse_to_ast().parse(query).unwrap().0;
@@ -353,24 +269,15 @@ mod test {
             "Err(UnexpectedParse)"
         );
         test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
-        test_parse_query_to_ast_helper("NOT a", "(-\"a\")");
-    }
-    #[test]
-    fn test_boosting() {
-        assert!(parse_to_ast().parse("a^2^3").is_err());
-        assert!(parse_to_ast().parse("a^2^").is_err());
-        test_parse_query_to_ast_helper("a^3", "(\"a\")^3");
-        test_parse_query_to_ast_helper("a^3 b^2", "(*(\"a\")^3 *(\"b\")^2)");
-        test_parse_query_to_ast_helper("a^1", "\"a\"");
+        test_parse_query_to_ast_helper("NOT a", "-(\"a\")");
     }
     #[test]
     fn test_parse_query_to_ast_binary_op() {
-        test_parse_query_to_ast_helper("a AND b", "(+\"a\" +\"b\")");
-        test_parse_query_to_ast_helper("a OR b", "(?\"a\" ?\"b\")");
-        test_parse_query_to_ast_helper("a OR b AND c", "(?\"a\" ?(+\"b\" +\"c\"))");
-        test_parse_query_to_ast_helper("a AND b AND c", "(+\"a\" +\"b\" +\"c\")");
+        test_parse_query_to_ast_helper("a AND b", "(+(\"a\") +(\"b\"))");
+        test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))");
+        test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))");
+        test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))");
         assert_eq!(
             format!("{:?}", parse_to_ast().parse("a OR b aaa")),
             "Err(UnexpectedParse)"
@@ -408,32 +315,6 @@ mod test {
         test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
     }
-    #[test]
-    fn test_occur_leaf() {
-        let ((occur, ast), _) = super::occur_leaf().parse("+abc").unwrap();
-        assert_eq!(occur, Some(Occur::Must));
-        assert_eq!(format!("{:?}", ast), "\"abc\"");
-    }
-    #[test]
-    fn test_field_name() -> TestParseResult {
-        assert_eq!(
-            super::field().parse("my-field-name:a")?,
-            ("my-field-name".to_string(), "a")
-        );
-        assert_eq!(
-            super::field().parse("my_field_name:a")?,
-            ("my_field_name".to_string(), "a")
-        );
-        assert!(super::field().parse(":a").is_err());
-        assert!(super::field().parse("-my_field:a").is_err());
-        assert_eq!(
-            super::field().parse("_my_field:a")?,
-            ("_my_field".to_string(), "a")
-        );
-        Ok(())
-    }
     #[test]
     fn test_range_parser() {
         // testing the range() parser separately
@@ -462,67 +343,32 @@ mod test {
     fn test_parse_query_to_triming_spaces() {
         test_parse_query_to_ast_helper(" abc", "\"abc\"");
         test_parse_query_to_ast_helper("abc ", "\"abc\"");
-        test_parse_query_to_ast_helper("( a OR abc)", "(?\"a\" ?\"abc\")");
-        test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
-        test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
-        test_parse_query_to_ast_helper("a OR abc ", "(?\"a\" ?\"abc\")");
-        test_parse_query_to_ast_helper("(a OR abc )", "(?\"a\" ?\"abc\")");
-        test_parse_query_to_ast_helper("(a OR abc) ", "(?\"a\" ?\"abc\")");
+        test_parse_query_to_ast_helper("( a OR abc)", "(?(\"a\") ?(\"abc\"))");
+        test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))");
+        test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))");
+        test_parse_query_to_ast_helper("a OR abc ", "(?(\"a\") ?(\"abc\"))");
+        test_parse_query_to_ast_helper("(a OR abc )", "(?(\"a\") ?(\"abc\"))");
+        test_parse_query_to_ast_helper("(a OR abc) ", "(?(\"a\") ?(\"abc\"))");
     }
     #[test]
-    fn test_parse_query_single_term() {
+    fn test_parse_query_to_ast() {
         test_parse_query_to_ast_helper("abc", "\"abc\"");
-    }
-    #[test]
-    fn test_parse_query_default_clause() {
-        test_parse_query_to_ast_helper("a b", "(*\"a\" *\"b\")");
-    }
-    #[test]
-    fn test_parse_query_must_default_clause() {
-        test_parse_query_to_ast_helper("+(a b)", "(*\"a\" *\"b\")");
-    }
-    #[test]
-    fn test_parse_query_must_single_term() {
-        test_parse_query_to_ast_helper("+d", "\"d\"");
-    }
-    #[test]
-    fn test_single_term_with_field() {
+        test_parse_query_to_ast_helper("a b", "(\"a\" \"b\")");
+        test_parse_query_to_ast_helper("+(a b)", "+((\"a\" \"b\"))");
+        test_parse_query_to_ast_helper("+d", "+(\"d\")");
+        test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))");
+        test_parse_query_to_ast_helper("(+a +b) d", "((+(\"a\") +(\"b\")) \"d\")");
+        test_parse_query_to_ast_helper("(+a)", "+(\"a\")");
+        test_parse_query_to_ast_helper("(+a +b)", "(+(\"a\") +(\"b\"))");
         test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
-    }
-    #[test]
-    fn test_single_term_with_float() {
         test_parse_query_to_ast_helper("abc:1.1", "abc:\"1.1\"");
-    }
-    #[test]
-    fn test_must_clause() {
-        test_parse_query_to_ast_helper("(+a +b)", "(+\"a\" +\"b\")");
-    }
-    #[test]
-    fn test_parse_test_query_plus_a_b_plus_d() {
-        test_parse_query_to_ast_helper("+(a b) +d", "(+(*\"a\" *\"b\") +\"d\")");
-    }
-    #[test]
-    fn test_parse_test_query_other() {
-        test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
-        test_parse_query_to_ast_helper("+abc:toto", "abc:\"toto\"");
-        test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+abc:\"toto\" -\"titi\")");
-        test_parse_query_to_ast_helper("-abc:toto", "(-abc:\"toto\")");
-        test_parse_query_to_ast_helper("abc:a b", "(*abc:\"a\" *\"b\")");
+        test_parse_query_to_ast_helper("+abc:toto", "+(abc:\"toto\")");
+        test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+(abc:\"toto\") -(\"titi\"))");
+        test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
+        test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
         test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
         test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
-    }
-    #[test]
-    fn test_parse_query_with_range() {
         test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
         test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
         test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");

View File

@@ -85,14 +85,14 @@ impl UserInputBound {
 }
 pub enum UserInputAST {
-    Clause(Vec<(Option<Occur>, UserInputAST)>),
+    Clause(Vec<UserInputAST>),
+    Unary(Occur, Box<UserInputAST>),
     Leaf(Box<UserInputLeaf>),
-    Boost(Box<UserInputAST>, f64),
 }
 impl UserInputAST {
     pub fn unary(self, occur: Occur) -> UserInputAST {
-        UserInputAST::Clause(vec![(Some(occur), self)])
+        UserInputAST::Unary(occur, Box::new(self))
     }
     fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
@@ -103,7 +103,7 @@ impl UserInputAST {
         } else {
             UserInputAST::Clause(
                 asts.into_iter()
-                    .map(|ast: UserInputAST| (Some(occur), ast))
+                    .map(|ast: UserInputAST| ast.unary(occur))
                     .collect::<Vec<_>>(),
             )
         }
@@ -134,38 +134,26 @@ impl From<UserInputLeaf> for UserInputAST {
     }
 }
-fn print_occur_ast(
-    occur_opt: Option<Occur>,
-    ast: &UserInputAST,
-    formatter: &mut fmt::Formatter,
-) -> fmt::Result {
-    if let Some(occur) = occur_opt {
-        write!(formatter, "{}{:?}", occur, ast)?;
-    } else {
-        write!(formatter, "*{:?}", ast)?;
-    }
-    Ok(())
-}
 impl fmt::Debug for UserInputAST {
-    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
         match *self {
             UserInputAST::Clause(ref subqueries) => {
                 if subqueries.is_empty() {
                     write!(formatter, "<emptyclause>")?;
                 } else {
                     write!(formatter, "(")?;
-                    print_occur_ast(subqueries[0].0, &subqueries[0].1, formatter)?;
+                    write!(formatter, "{:?}", &subqueries[0])?;
                     for subquery in &subqueries[1..] {
-                        write!(formatter, " ")?;
-                        print_occur_ast(subquery.0, &subquery.1, formatter)?;
+                        write!(formatter, " {:?}", subquery)?;
                     }
                     write!(formatter, ")")?;
                 }
                 Ok(())
             }
+            UserInputAST::Unary(ref occur, ref subquery) => {
+                write!(formatter, "{}({:?})", occur, subquery)
+            }
             UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
-            UserInputAST::Boost(ref leaf, boost) => write!(formatter, "({:?})^{}", leaf, boost),
         }
     }
 }

View File

@@ -96,18 +96,18 @@ mod tests {
     }
     {
         let mut count_collector = SegmentCountCollector::default();
-        count_collector.collect(0u32, 1.0);
+        count_collector.collect(0u32, 1f32);
         assert_eq!(count_collector.harvest(), 1);
     }
     {
         let mut count_collector = SegmentCountCollector::default();
-        count_collector.collect(0u32, 1.0);
+        count_collector.collect(0u32, 1f32);
         assert_eq!(count_collector.harvest(), 1);
     }
     {
         let mut count_collector = SegmentCountCollector::default();
-        count_collector.collect(0u32, 1.0);
-        count_collector.collect(1u32, 1.0);
+        count_collector.collect(0u32, 1f32);
+        count_collector.collect(1u32, 1f32);
         assert_eq!(count_collector.harvest(), 2);
     }
 }

View File

@@ -11,13 +11,13 @@ impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
 where
     TScore: Clone + PartialOrd,
 {
-    pub(crate) fn new(
+    pub fn new(
         custom_scorer: TCustomScorer,
-        collector: TopCollector<TScore>,
+        limit: usize,
     ) -> CustomScoreTopCollector<TCustomScorer, TScore> {
         CustomScoreTopCollector {
             custom_scorer,
-            collector,
+            collector: TopCollector::with_limit(limit),
         }
     }
 }
@@ -28,7 +28,7 @@ where
 /// It is the segment local version of the [`CustomScorer`](./trait.CustomScorer.html).
 pub trait CustomSegmentScorer<TScore>: 'static {
     /// Computes the score of a specific `doc`.
-    fn score(&mut self, doc: DocId) -> TScore;
+    fn score(&self, doc: DocId) -> TScore;
 }
 /// `CustomScorer` makes it possible to define any kind of score.
@@ -46,7 +46,7 @@ pub trait CustomScorer<TScore>: Sync {
 impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
 where
-    TCustomScorer: CustomScorer<TScore> + Send + Sync,
+    TCustomScorer: CustomScorer<TScore>,
     TScore: 'static + PartialOrd + Clone + Send + Sync,
 {
     type Fruit = Vec<(TScore, DocAddress)>;
@@ -117,9 +117,9 @@ where
 impl<F, TScore> CustomSegmentScorer<TScore> for F
 where
-    F: 'static + FnMut(DocId) -> TScore,
+    F: 'static + Sync + Send + Fn(DocId) -> TScore,
 {
-    fn score(&mut self, doc: DocId) -> TScore {
+    fn score(&self, doc: DocId) -> TScore {
         (self)(doc)
     }
 }
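On the left of this hunk the per-segment scorer bound is relaxed to `FnMut`, so the closure handed to `TopDocs::custom_score` may carry mutable state. A minimal sketch against the 0.14-dev API; the `popularity` field and its values are illustrative, not from the repository:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, FAST};
use tantivy::{doc, DocId, Index, SegmentReader};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let popularity = schema_builder.add_u64_field("popularity", FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(30_000_000)?;
    index_writer.add_document(doc!(popularity => 10u64));
    index_writer.add_document(doc!(popularity => 3u64));
    index_writer.commit()?;
    let searcher = index.reader()?.searcher();
    // The outer closure runs once per segment; the inner closure is the
    // CustomSegmentScorer and may now capture mutable state.
    let collector =
        TopDocs::with_limit(2).custom_score(move |segment_reader: &SegmentReader| {
            let popularity_reader = segment_reader.fast_fields().u64(popularity).unwrap();
            move |doc: DocId| popularity_reader.get(doc)
        });
    let hits = searcher.search(&AllQuery, &collector)?;
    assert_eq!(hits[0].0, 10u64);
    Ok(())
}
```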

View File

@@ -1,5 +1,6 @@
 use crate::collector::Collector;
 use crate::collector::SegmentCollector;
+use crate::docset::SkipResult;
 use crate::fastfield::FacetReader;
 use crate::schema::Facet;
 use crate::schema::Field;
@@ -187,11 +188,6 @@ pub struct FacetSegmentCollector {
     collapse_facet_ords: Vec<u64>,
 }
-enum SkipResult {
-    Found,
-    NotFound,
-}
 fn skip<'a, I: Iterator<Item = &'a Facet>>(
     target: &[u8],
     collapse_it: &mut Peekable<I>,
@@ -201,14 +197,14 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
         Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
             Ordering::Less => {}
             Ordering::Greater => {
-                return SkipResult::NotFound;
+                return SkipResult::OverStep;
             }
             Ordering::Equal => {
-                return SkipResult::Found;
+                return SkipResult::Reached;
             }
         },
         None => {
-            return SkipResult::NotFound;
+            return SkipResult::End;
         }
     }
     collapse_it.next();
@@ -285,7 +281,7 @@ impl Collector for FacetCollector {
         // is positioned on a term that has not been processed yet.
         let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
         match skip_result {
-            SkipResult::Found => {
+            SkipResult::Reached => {
                 // we reached a facet we decided to collapse.
                 let collapse_depth = facet_depth(facet_streamer.key());
                 let mut collapsed_id = 0;
@@ -305,7 +301,7 @@ impl Collector for FacetCollector {
                 }
                 break;
             }
-            SkipResult::NotFound => {
+            SkipResult::End | SkipResult::OverStep => {
                 collapse_mapping.push(0);
                 if !facet_streamer.advance() {
                     break;
@@ -472,7 +468,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         let num_facets: usize = 3 * 4 * 5;
         let facets: Vec<Facet> = (0..num_facets)
             .map(|mut n| {
@@ -531,7 +527,7 @@ mod tests {
         let facet_field = schema_builder.add_facet_field("facets");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/subjects/A/a"),
             facet_field => Facet::from_text(&"/subjects/B/a"),
@@ -550,12 +546,12 @@ mod tests {
     }
     #[test]
-    fn test_doc_search_by_facet() -> crate::Result<()> {
+    fn test_doc_search_by_facet() {
         let mut schema_builder = Schema::builder();
         let facet_field = schema_builder.add_facet_field("facet");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests()?;
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/A/A"),
         ));
@@ -568,8 +564,8 @@ mod tests {
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/D/C/A"),
         ));
-        index_writer.commit()?;
-        let reader = index.reader()?;
+        index_writer.commit().unwrap();
+        let reader = index.reader().unwrap();
         let searcher = reader.searcher();
         assert_eq!(searcher.num_docs(), 4);
@@ -586,17 +582,17 @@ mod tests {
         assert_eq!(count_facet("/A/C"), 1);
         assert_eq!(count_facet("/A/C/A"), 1);
         assert_eq!(count_facet("/C/A"), 0);
-        let query_parser = QueryParser::for_index(&index, vec![]);
-        {
-            let query = query_parser.parse_query("facet:/A/B")?;
-            assert_eq!(1, searcher.search(&query, &Count).unwrap());
-        }
-        {
-            let query = query_parser.parse_query("facet:/A")?;
-            assert_eq!(3, searcher.search(&query, &Count)?);
-        }
-        Ok(())
+        {
+            let query_parser = QueryParser::for_index(&index, vec![]);
+            {
+                let query = query_parser.parse_query("facet:/A/B").unwrap();
+                assert_eq!(1, searcher.search(&query, &Count).unwrap());
+            }
+            {
+                let query = query_parser.parse_query("facet:/A").unwrap();
+                assert_eq!(3, searcher.search(&query, &Count).unwrap());
+            }
+        }
     }
@@ -631,7 +627,7 @@ mod tests {
             .collect();
         docs[..].shuffle(&mut thread_rng());
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         for doc in docs {
             index_writer.add_document(doc);
         }
@@ -684,7 +680,7 @@ mod bench {
         // 40425 docs
         docs[..].shuffle(&mut thread_rng());
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         for doc in docs {
             index_writer.add_document(doc);
         }

View File

@@ -89,7 +89,7 @@ mod tests {
     let index = Index::create_in_ram(schema.clone());
     {
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         {
             for i in 0u64..10u64 {
                 index_writer.add_document(doc!(

View File

@@ -109,7 +109,6 @@ pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
 mod facet_collector;
 pub use self::facet_collector::FacetCollector;
-use crate::query::Weight;
 /// `Fruit` is the type for the result of our collection.
 /// e.g. `usize` for the `Count` collector.
@@ -133,7 +132,7 @@ impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
 /// The collection logic itself is in the `SegmentCollector`.
 ///
 /// Segments are not guaranteed to be visited in any specific order.
-pub trait Collector: Sync + Send {
+pub trait Collector: Sync {
     /// `Fruit` is the type for the result of our collection.
     /// e.g. `usize` for the `Count` collector.
     type Fruit: Fruit;
@@ -155,29 +154,6 @@ pub trait Collector: Sync + Send {
     /// Combines the fruit associated with the collection of each segment
     /// into one fruit.
     fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> crate::Result<Self::Fruit>;
-    /// Created a segment collector and
-    fn collect_segment(
-        &self,
-        weight: &dyn Weight,
-        segment_ord: u32,
-        reader: &SegmentReader,
-    ) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
-        let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;
-        if let Some(delete_bitset) = reader.delete_bitset() {
-            weight.for_each(reader, &mut |doc, score| {
-                if delete_bitset.is_alive(doc) {
-                    segment_collector.collect(doc, score);
-                }
-            })?;
-        } else {
-            weight.for_each(reader, &mut |doc, score| {
-                segment_collector.collect(doc, score);
-            })?;
-        }
-        Ok(segment_collector.harvest())
-    }
 }
 /// The `SegmentCollector` is the trait in charge of defining the
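The removed `collect_segment` method above is the driver that feeds a `SegmentCollector` with each matching document; implementing the two traits it connects looks roughly like the following count collector, a hedged sketch against the 0.14-dev trait shown on the left (where `Score` is an alias for `f32`):

```rust
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::{DocId, Score, SegmentReader};

/// Counts matching documents; one SimpleCountSegment is created per segment.
struct SimpleCount;
struct SimpleCountSegment(usize);

impl SegmentCollector for SimpleCountSegment {
    type Fruit = usize;
    fn collect(&mut self, _doc: DocId, _score: Score) {
        self.0 += 1;
    }
    fn harvest(self) -> usize {
        self.0
    }
}

impl Collector for SimpleCount {
    type Fruit = usize;
    type Child = SimpleCountSegment;
    fn for_segment(&self, _ord: u32, _reader: &SegmentReader) -> tantivy::Result<SimpleCountSegment> {
        Ok(SimpleCountSegment(0))
    }
    fn requires_scoring(&self) -> bool {
        false
    }
    fn merge_fruits(&self, counts: Vec<usize>) -> tantivy::Result<usize> {
        Ok(counts.into_iter().sum())
    }
}
```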

View File

@@ -55,7 +55,7 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
 impl SegmentCollector for Box<dyn BoxableSegmentCollector> {
     type Fruit = Box<dyn Fruit>;
-    fn collect(&mut self, doc: u32, score: Score) {
+    fn collect(&mut self, doc: u32, score: f32) {
         self.as_mut().collect(doc, score);
     }
@@ -65,7 +65,7 @@ impl SegmentCollector for Box<dyn BoxableSegmentCollector> {
 }
 pub trait BoxableSegmentCollector {
-    fn collect(&mut self, doc: u32, score: Score);
+    fn collect(&mut self, doc: u32, score: f32);
     fn harvest_from_box(self: Box<Self>) -> Box<dyn Fruit>;
 }
@@ -74,7 +74,7 @@ pub struct SegmentCollectorWrapper<TSegmentCollector: SegmentCollector>(TSegment
 impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
     for SegmentCollectorWrapper<TSegmentCollector>
 {
-    fn collect(&mut self, doc: u32, score: Score) {
+    fn collect(&mut self, doc: u32, score: f32) {
         self.0.collect(doc, score);
     }
@@ -259,7 +259,7 @@ mod tests {
     let index = Index::create_in_ram(schema);
     {
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(text=>"abc"));
         index_writer.add_document(doc!(text=>"abc abc abc"));
         index_writer.add_document(doc!(text=>"abc abc"));
View File
@@ -206,7 +206,7 @@ impl Collector for BytesFastFieldTestCollector {
impl SegmentCollector for BytesFastFieldSegmentCollector { impl SegmentCollector for BytesFastFieldSegmentCollector {
type Fruit = Vec<u8>; type Fruit = Vec<u8>;
fn collect(&mut self, doc: u32, _score: Score) { fn collect(&mut self, doc: u32, _score: f32) {
let data = self.reader.get_bytes(doc); let data = self.reader.get_bytes(doc);
self.vals.extend(data); self.vals.extend(data);
} }
View File
@@ -18,9 +18,9 @@ use std::collections::BinaryHeap;
/// Two elements are equal if their feature is equal, regardless of whether `doc` /// Two elements are equal if their feature is equal, regardless of whether `doc`
/// is equal. This should be perfectly fine for this usage, but let's make sure this /// is equal. This should be perfectly fine for this usage, but let's make sure this
/// struct is never public. /// struct is never public.
pub(crate) struct ComparableDoc<T, D> { struct ComparableDoc<T, D> {
pub feature: T, feature: T,
pub doc: D, doc: D,
} }
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> { impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
@@ -56,8 +56,7 @@ impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {} impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
pub(crate) struct TopCollector<T> { pub(crate) struct TopCollector<T> {
pub limit: usize, limit: usize,
pub offset: usize,
_marker: PhantomData<T>, _marker: PhantomData<T>,
} }
@@ -73,20 +72,14 @@ where
if limit < 1 { if limit < 1 {
panic!("Limit must be strictly greater than 0."); panic!("Limit must be strictly greater than 0.");
} }
Self { TopCollector {
limit, limit,
offset: 0,
_marker: PhantomData, _marker: PhantomData,
} }
} }
/// Skip the first "offset" documents when collecting. pub fn limit(&self) -> usize {
/// self.limit
/// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in
/// Lucene's TopDocsCollector.
pub fn and_offset(mut self, offset: usize) -> TopCollector<T> {
self.offset = offset;
self
} }
pub fn merge_fruits( pub fn merge_fruits(
@@ -99,7 +92,7 @@ where
let mut top_collector = BinaryHeap::new(); let mut top_collector = BinaryHeap::new();
for child_fruit in children { for child_fruit in children {
for (feature, doc) in child_fruit { for (feature, doc) in child_fruit {
if top_collector.len() < (self.limit + self.offset) { if top_collector.len() < self.limit {
top_collector.push(ComparableDoc { feature, doc }); top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() { } else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature { if head.feature < feature {
@@ -111,7 +104,6 @@ where
Ok(top_collector Ok(top_collector
.into_sorted_vec() .into_sorted_vec()
.into_iter() .into_iter()
.skip(self.offset)
.map(|cdoc| (cdoc.feature, cdoc.doc)) .map(|cdoc| (cdoc.feature, cdoc.doc))
.collect()) .collect())
} }
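The loop above is a classic bounded-heap top-K merge: keep the best `limit + offset` hits in a min-heap, then sort and drop the first `offset`. A self-contained sketch of the same idea, with integer scores since `f32` is not `Ord` (names are ours):

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Keep the best `limit + offset` (score, doc) pairs across all children,
/// then skip the first `offset` of the merged, best-first result.
fn merge_top_k(children: Vec<Vec<(u32, u32)>>, limit: usize, offset: usize) -> Vec<(u32, u32)> {
    // Min-heap via `Reverse`: the weakest retained hit sits on top.
    let mut heap: BinaryHeap<Reverse<(u32, u32)>> = BinaryHeap::new();
    for child in children {
        for hit in child {
            if heap.len() < limit + offset {
                heap.push(Reverse(hit));
            } else if let Some(mut head) = heap.peek_mut() {
                // Replace the weakest retained hit if the new one beats it.
                if head.0 < hit {
                    *head = Reverse(hit);
                }
            }
        }
    }
    // `into_sorted_vec` over `Reverse` values yields best-first order.
    heap.into_sorted_vec()
        .into_iter()
        .map(|Reverse(hit)| hit)
        .skip(offset)
        .collect()
}
```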
@@ -121,23 +113,7 @@ where
segment_id: SegmentLocalId, segment_id: SegmentLocalId,
_: &SegmentReader, _: &SegmentReader,
) -> crate::Result<TopSegmentCollector<F>> { ) -> crate::Result<TopSegmentCollector<F>> {
Ok(TopSegmentCollector::new( Ok(TopSegmentCollector::new(segment_id, self.limit))
segment_id,
self.limit + self.offset,
))
}
/// Create a new TopCollector with the same limit and offset.
///
/// Ideally we would use Into but the blanket implementation seems to cause the Scorer traits
/// to fail.
#[doc(hidden)]
pub(crate) fn into_tscore<TScore: PartialOrd + Clone>(self) -> TopCollector<TScore> {
TopCollector {
limit: self.limit,
offset: self.offset,
_marker: PhantomData,
}
} }
} }
@@ -211,7 +187,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{TopCollector, TopSegmentCollector}; use super::TopSegmentCollector;
use crate::DocAddress; use crate::DocAddress;
#[test] #[test]
@@ -272,48 +248,6 @@ mod tests {
top_collector_limit_3.harvest()[..2].to_vec(), top_collector_limit_3.harvest()[..2].to_vec(),
); );
} }
#[test]
fn test_top_collector_with_limit_and_offset() {
let collector = TopCollector::with_limit(2).and_offset(1);
let results = collector
.merge_fruits(vec![vec![
(0.9, DocAddress(0, 1)),
(0.8, DocAddress(0, 2)),
(0.7, DocAddress(0, 3)),
(0.6, DocAddress(0, 4)),
(0.5, DocAddress(0, 5)),
]])
.unwrap();
assert_eq!(
results,
vec![(0.8, DocAddress(0, 2)), (0.7, DocAddress(0, 3)),]
);
}
#[test]
fn test_top_collector_with_limit_larger_than_set_and_offset() {
let collector = TopCollector::with_limit(2).and_offset(1);
let results = collector
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
.unwrap();
assert_eq!(results, vec![(0.8, DocAddress(0, 2)),]);
}
#[test]
fn test_top_collector_with_limit_and_offset_larger_than_set() {
let collector = TopCollector::with_limit(2).and_offset(20);
let results = collector
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
.unwrap();
assert_eq!(results, vec![]);
}
} }
#[cfg(all(test, feature = "unstable"))] #[cfg(all(test, feature = "unstable"))]
View File
@@ -1,20 +1,18 @@
use super::Collector; use super::Collector;
use crate::collector::custom_score_top_collector::CustomScoreTopCollector; use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
use crate::collector::top_collector::TopCollector;
use crate::collector::top_collector::TopSegmentCollector; use crate::collector::top_collector::TopSegmentCollector;
use crate::collector::top_collector::{ComparableDoc, TopCollector};
use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector; use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{ use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector, CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
}; };
use crate::fastfield::FastFieldReader; use crate::fastfield::FastFieldReader;
use crate::query::Weight;
use crate::schema::Field; use crate::schema::Field;
use crate::DocAddress; use crate::DocAddress;
use crate::DocId; use crate::DocId;
use crate::Score; use crate::Score;
use crate::SegmentLocalId; use crate::SegmentLocalId;
use crate::SegmentReader; use crate::SegmentReader;
use std::collections::BinaryHeap;
use std::fmt; use std::fmt;
/// The `TopDocs` collector keeps track of the top `K` documents /// The `TopDocs` collector keeps track of the top `K` documents
@@ -38,7 +36,7 @@ use std::fmt;
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// ///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); /// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind")); /// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib")); /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow")); /// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -52,18 +50,14 @@ use std::fmt;
/// let query = query_parser.parse_query("diary").unwrap(); /// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap(); /// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
/// ///
/// assert_eq!(top_docs[0].1, DocAddress(0, 1)); /// assert_eq!(&top_docs[0], &(0.7261542, DocAddress(0, 1)));
/// assert_eq!(top_docs[1].1, DocAddress(0, 3)); /// assert_eq!(&top_docs[1], &(0.6099695, DocAddress(0, 3)));
/// ``` /// ```
pub struct TopDocs(TopCollector<Score>); pub struct TopDocs(TopCollector<Score>);
impl fmt::Debug for TopDocs { impl fmt::Debug for TopDocs {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!( write!(f, "TopDocs({})", self.0.limit())
f,
"TopDocs(limit={}, offset={})",
self.0.limit, self.0.offset
)
} }
} }
@@ -72,7 +66,7 @@ struct ScorerByFastFieldReader {
} }
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader { impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 { fn score(&self, doc: DocId) -> u64 {
self.ff_reader.get_u64(u64::from(doc)) self.ff_reader.get_u64(u64::from(doc))
} }
} }
@@ -90,8 +84,7 @@ impl CustomScorer<u64> for ScorerByField {
.u64(self.field) .u64(self.field)
.ok_or_else(|| { .ok_or_else(|| {
crate::TantivyError::SchemaError(format!( crate::TantivyError::SchemaError(format!(
"Field requested ({:?}) is not a i64/u64 fast field.", "Field requested is not a i64/u64 fast field."
self.field
)) ))
})?; })?;
Ok(ScorerByFastFieldReader { ff_reader }) Ok(ScorerByFastFieldReader { ff_reader })
@@ -107,45 +100,6 @@ impl TopDocs {
TopDocs(TopCollector::with_limit(limit)) TopDocs(TopCollector::with_limit(limit))
} }
/// Skip the first "offset" documents when collecting.
///
/// This is equivalent to `OFFSET` in MySQL or PostgreSQL and `start` in
/// Lucene's TopDocsCollector.
///
/// ```rust
/// use tantivy::collector::TopDocs;
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index};
///
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
/// assert!(index_writer.commit().is_ok());
///
/// let reader = index.reader().unwrap();
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
///
/// assert_eq!(top_docs.len(), 2);
/// assert_eq!(top_docs[0].1, DocAddress(0, 4));
/// assert_eq!(top_docs[1].1, DocAddress(0, 3));
/// ```
pub fn and_offset(self, offset: usize) -> TopDocs {
TopDocs(self.0.and_offset(offset))
}
/// Set top-K to rank documents by a given fast field. /// Set top-K to rank documents by a given fast field.
/// ///
/// ```rust /// ```rust
@@ -163,7 +117,7 @@ impl TopDocs {
/// # let schema = schema_builder.build(); /// # let schema = schema_builder.build();
/// # /// #
/// # let index = Index::create_in_ram(schema); /// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64)); /// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64)); /// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64)); /// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
@@ -264,7 +218,7 @@ impl TopDocs {
/// fn create_index() -> tantivy::Result<Index> { /// fn create_index() -> tantivy::Result<Index> {
/// let schema = create_schema(); /// let schema = create_schema();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap(); /// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap(); /// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64)); /// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
@@ -303,7 +257,7 @@ impl TopDocs {
/// let popularity: u64 = popularity_reader.get(doc); /// let popularity: u64 = popularity_reader.get(doc);
/// // Well.. For the sake of the example we use a simple logarithm /// // Well.. For the sake of the example we use a simple logarithm
/// // function. /// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2(); /// let popularity_boost_score = ((2u64 + popularity) as f32).log2();
/// popularity_boost_score * original_score /// popularity_boost_score * original_score
/// } /// }
/// }); /// });
@@ -324,9 +278,9 @@ impl TopDocs {
where where
TScore: 'static + Send + Sync + Clone + PartialOrd, TScore: 'static + Send + Sync + Clone + PartialOrd,
TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static, TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker> + Send + Sync, TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
{ {
TweakedScoreTopCollector::new(score_tweaker, self.0.into_tscore()) TweakedScoreTopCollector::new(score_tweaker, self.0.limit())
} }
/// Ranks the documents using a custom score. /// Ranks the documents using a custom score.
@@ -371,7 +325,7 @@ impl TopDocs {
/// # fn main() -> tantivy::Result<()> { /// # fn main() -> tantivy::Result<()> {
/// # let schema = create_schema(); /// # let schema = create_schema();
/// # let index = Index::create_in_ram(schema); /// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// # let product_name = index.schema().get_field("product_name").unwrap(); /// # let product_name = index.schema().get_field("product_name").unwrap();
/// # /// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap(); /// let popularity: Field = index.schema().get_field("popularity").unwrap();
@@ -438,9 +392,9 @@ impl TopDocs {
where where
TScore: 'static + Send + Sync + Clone + PartialOrd, TScore: 'static + Send + Sync + Clone + PartialOrd,
TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static, TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer> + Send + Sync, TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
{ {
CustomScoreTopCollector::new(custom_score, self.0.into_tscore()) CustomScoreTopCollector::new(custom_score, self.0.limit())
} }
} }
@@ -468,64 +422,6 @@ impl Collector for TopDocs {
) -> crate::Result<Self::Fruit> { ) -> crate::Result<Self::Fruit> {
self.0.merge_fruits(child_fruits) self.0.merge_fruits(child_fruits)
} }
fn collect_segment(
&self,
weight: &dyn Weight,
segment_ord: u32,
reader: &SegmentReader,
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let heap_len = self.0.limit + self.0.offset;
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
if let Some(delete_bitset) = reader.delete_bitset() {
let mut threshold = Score::MIN;
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
if delete_bitset.is_deleted(doc) {
return threshold;
}
let heap_item = ComparableDoc {
feature: score,
doc,
};
if heap.len() < heap_len {
heap.push(heap_item);
if heap.len() == heap_len {
threshold = heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
}
return threshold;
}
*heap.peek_mut().unwrap() = heap_item;
threshold = heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
threshold
})?;
} else {
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
let heap_item = ComparableDoc {
feature: score,
doc,
};
if heap.len() < heap_len {
heap.push(heap_item);
// TODO the threshold is suboptimal for heap.len == heap_len
if heap.len() == heap_len {
return heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
} else {
return Score::MIN;
}
}
*heap.peek_mut().unwrap() = heap_item;
heap.peek().map(|el| el.feature).unwrap_or(Score::MIN)
})?;
}
let fruit = heap
.into_sorted_vec()
.into_iter()
.map(|cid| (cid.feature, DocAddress(segment_ord, cid.doc)))
.collect();
Ok(fruit)
}
} }
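The specialization removed above leans on a contract that is easy to miss: the callback handed to `for_each_pruning` returns an updated score threshold, and the scorer may then skip any document that cannot beat it, which is what lets Block-WAND skip whole blocks. A minimal sketch of that contract, with a slice standing in for a real scorer (names are ours):

```rust
/// Sketch of the `for_each_pruning` contract assumed above: documents whose
/// score cannot exceed the current threshold may be skipped entirely, and
/// the callback hands back the new threshold after each surviving hit.
fn for_each_pruning_sketch(
    mut threshold: f32,
    matches: &[(u32, f32)], // (doc, score): stand-in for a real scorer
    callback: &mut dyn FnMut(u32, f32) -> f32,
) {
    for &(doc, score) in matches {
        if score > threshold {
            threshold = callback(doc, score);
        }
    }
}
```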
/// Segment Collector associated to `TopDocs`. /// Segment Collector associated to `TopDocs`.
@@ -535,7 +431,7 @@ impl SegmentCollector for TopScoreSegmentCollector {
type Fruit = Vec<(Score, DocAddress)>; type Fruit = Vec<(Score, DocAddress)>;
fn collect(&mut self, doc: DocId, score: Score) { fn collect(&mut self, doc: DocId, score: Score) {
self.0.collect(doc, score); self.0.collect(doc, score)
} }
fn harvest(self) -> Vec<(Score, DocAddress)> { fn harvest(self) -> Vec<(Score, DocAddress)> {
@@ -549,10 +445,11 @@ mod tests {
use crate::collector::Collector; use crate::collector::Collector;
use crate::query::{AllQuery, Query, QueryParser}; use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT}; use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::DocAddress;
use crate::Index; use crate::Index;
use crate::IndexWriter; use crate::IndexWriter;
use crate::Score; use crate::Score;
use crate::{DocAddress, DocId, SegmentReader}; use itertools::Itertools;
fn make_index() -> Index { fn make_index() -> Index {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -561,7 +458,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Hello happy tax payer.")); index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer")); index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.add_document(doc!(text_field=>"I like Droopy")); index_writer.add_document(doc!(text_field=>"I like Droopy"));
@@ -570,13 +467,6 @@ mod tests {
index index
} }
fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) {
for (result, expected) in results.iter().zip(expected.iter()) {
assert_eq!(result.1, expected.1);
crate::assert_nearly_equals!(result.0, expected.0);
}
}
#[test] #[test]
fn test_top_collector_not_at_capacity() { fn test_top_collector_not_at_capacity() {
let index = make_index(); let index = make_index();
@@ -589,31 +479,16 @@ mod tests {
.searcher() .searcher()
.search(&text_query, &TopDocs::with_limit(4)) .search(&text_query, &TopDocs::with_limit(4))
.unwrap(); .unwrap();
assert_results_equals( assert_eq!(
&score_docs, score_docs,
&[ vec![
(0.81221175, DocAddress(0u32, 1)), (0.81221175, DocAddress(0u32, 1)),
(0.5376842, DocAddress(0u32, 2)), (0.5376842, DocAddress(0u32, 2)),
(0.48527452, DocAddress(0, 0)), (0.48527452, DocAddress(0, 0))
], ]
); );
} }
#[test]
fn test_top_collector_not_at_capacity_with_offset() {
let index = make_index();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4).and_offset(2))
.unwrap();
assert_results_equals(&score_docs[..], &[(0.48527452, DocAddress(0, 0))]);
}
#[test] #[test]
fn test_top_collector_at_capacity() { fn test_top_collector_at_capacity() {
let index = make_index(); let index = make_index();
@@ -626,33 +501,12 @@ mod tests {
.searcher() .searcher()
.search(&text_query, &TopDocs::with_limit(2)) .search(&text_query, &TopDocs::with_limit(2))
.unwrap(); .unwrap();
assert_results_equals( assert_eq!(
&score_docs, score_docs,
&[ vec![
(0.81221175, DocAddress(0u32, 1)), (0.81221175, DocAddress(0u32, 1)),
(0.5376842, DocAddress(0u32, 2)), (0.5376842, DocAddress(0u32, 2)),
], ]
);
}
#[test]
fn test_top_collector_at_capacity_with_offset() {
let index = make_index();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(2).and_offset(1))
.unwrap();
assert_results_equals(
&score_docs[..],
&[
(0.5376842, DocAddress(0u32, 2)),
(0.48527452, DocAddress(0, 0)),
],
); );
} }
@@ -669,8 +523,8 @@ mod tests {
// precondition for the test to be meaningful: we did get documents // precondition for the test to be meaningful: we did get documents
// with the same score // with the same score
assert!(page_1.iter().all(|result| result.0 == page_1[0].0)); assert!(page_1.iter().map(|result| result.0).all_equal());
assert!(page_2.iter().all(|result| result.0 == page_2[0].0)); assert!(page_2.iter().map(|result| result.0).all_equal());
// sanity check since we're relying on make_index() // sanity check since we're relying on make_index()
assert_eq!(page_1.len(), 2); assert_eq!(page_1.len(), 2);
@@ -713,8 +567,8 @@ mod tests {
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size); let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap(); let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
assert_eq!( assert_eq!(
&top_docs[..], top_docs,
&[ vec![
(64, DocAddress(0, 1)), (64, DocAddress(0, 1)),
(16, DocAddress(0, 2)), (16, DocAddress(0, 2)),
(12, DocAddress(0, 0)) (12, DocAddress(0, 0))
@@ -760,59 +614,12 @@ mod tests {
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size); let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let err = top_collector.for_segment(0, segment); let err = top_collector.for_segment(0, segment);
if let Err(crate::TantivyError::SchemaError(msg)) = err { if let Err(crate::TantivyError::SchemaError(msg)) = err {
assert_eq!( assert_eq!(msg, "Field requested is not a i64/u64 fast field.");
msg,
"Field requested (Field(1)) is not a i64/u64 fast field."
);
} else { } else {
assert!(false); assert!(false);
} }
} }
#[test]
fn test_tweak_score_top_collector_with_offset() {
let index = make_index();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
);
let score_docs: Vec<(u32, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &collector)
.unwrap();
assert_eq!(
score_docs,
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
);
}
#[test]
fn test_custom_score_top_collector_with_offset() {
let index = make_index();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let collector = TopDocs::with_limit(2)
.and_offset(1)
.custom_score(move |_segment_reader: &SegmentReader| move |doc: DocId| doc);
let score_docs: Vec<(u32, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &collector)
.unwrap();
assert_eq!(
score_docs,
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
);
}
fn index( fn index(
query: &str, query: &str,
query_field: Field, query_field: Field,
@@ -821,7 +628,7 @@ mod tests {
) -> (Index, Box<dyn Query>) { ) -> (Index, Box<dyn Query>) {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
doc_adder(&mut index_writer); doc_adder(&mut index_writer);
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let query_parser = QueryParser::for_index(&index, vec![query_field]); let query_parser = QueryParser::for_index(&index, vec![query_field]);
View File
@@ -14,11 +14,11 @@ where
{ {
pub fn new( pub fn new(
score_tweaker: TScoreTweaker, score_tweaker: TScoreTweaker,
collector: TopCollector<TScore>, limit: usize,
) -> TweakedScoreTopCollector<TScoreTweaker, TScore> { ) -> TweakedScoreTopCollector<TScoreTweaker, TScore> {
TweakedScoreTopCollector { TweakedScoreTopCollector {
score_tweaker, score_tweaker,
collector, collector: TopCollector::with_limit(limit),
} }
} }
} }
@@ -29,7 +29,7 @@ where
/// It is the segment local version of the [`ScoreTweaker`](./trait.ScoreTweaker.html). /// It is the segment local version of the [`ScoreTweaker`](./trait.ScoreTweaker.html).
pub trait ScoreSegmentTweaker<TScore>: 'static { pub trait ScoreSegmentTweaker<TScore>: 'static {
/// Tweak the given `score` for the document `doc`. /// Tweak the given `score` for the document `doc`.
fn score(&mut self, doc: DocId, score: Score) -> TScore; fn score(&self, doc: DocId, score: Score) -> TScore;
} }
/// `ScoreTweaker` makes it possible to tweak the score /// `ScoreTweaker` makes it possible to tweak the score
@@ -49,7 +49,7 @@ pub trait ScoreTweaker<TScore>: Sync {
impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore> impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore>
where where
TScoreTweaker: ScoreTweaker<TScore> + Send + Sync, TScoreTweaker: ScoreTweaker<TScore>,
TScore: 'static + PartialOrd + Clone + Send + Sync, TScore: 'static + PartialOrd + Clone + Send + Sync,
{ {
type Fruit = Vec<(TScore, DocAddress)>; type Fruit = Vec<(TScore, DocAddress)>;
@@ -121,9 +121,9 @@ where
impl<F, TScore> ScoreSegmentTweaker<TScore> for F impl<F, TScore> ScoreSegmentTweaker<TScore> for F
where where
F: 'static + FnMut(DocId, Score) -> TScore, F: 'static + Sync + Send + Fn(DocId, Score) -> TScore,
{ {
fn score(&mut self, doc: DocId, score: Score) -> TScore { fn score(&self, doc: DocId, score: Score) -> TScore {
(self)(doc, score) (self)(doc, score)
} }
} }
View File
@@ -33,10 +33,6 @@ impl TinySet {
TinySet(0u64) TinySet(0u64)
} }
pub fn clear(&mut self) {
self.0 = 0u64;
}
/// Returns the complement of the set in `[0, 64[`. /// Returns the complement of the set in `[0, 64[`.
fn complement(self) -> TinySet { fn complement(self) -> TinySet {
TinySet(!self.0) TinySet(!self.0)
@@ -47,11 +43,6 @@ impl TinySet {
!self.intersect(TinySet::singleton(el)).is_empty() !self.intersect(TinySet::singleton(el)).is_empty()
} }
/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 {
self.0.count_ones()
}
/// Returns the intersection of `self` and `other` /// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet { pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0) TinySet(self.0 & other.0)
@@ -118,12 +109,22 @@ impl TinySet {
pub fn range_greater_or_equal(from_included: u32) -> TinySet { pub fn range_greater_or_equal(from_included: u32) -> TinySet {
TinySet::range_lower(from_included).complement() TinySet::range_lower(from_included).complement()
} }
pub fn clear(&mut self) {
self.0 = 0u64;
}
pub fn len(self) -> u32 {
self.0.count_ones()
}
} }
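Only fragments of `TinySet` are visible in this hunk, so here is a self-contained sketch of the idea: a set over `[0, 64[` packed into a single `u64`, where the relocated `clear` and `len` amount to storing zero and a popcount (the bodies follow the lines shown above):

```rust
/// A set over `[0, 64[` packed into one machine word.
#[derive(Clone, Copy)]
struct TinySet(u64);

impl TinySet {
    fn empty() -> TinySet { TinySet(0u64) }
    fn singleton(el: u32) -> TinySet { TinySet(1u64 << el) }
    fn intersect(self, other: TinySet) -> TinySet { TinySet(self.0 & other.0) }
    fn is_empty(self) -> bool { self.0 == 0u64 }
    fn contains(self, el: u32) -> bool { !self.intersect(TinySet::singleton(el)).is_empty() }
    fn clear(&mut self) { self.0 = 0u64; }
    fn len(self) -> u32 { self.0.count_ones() } // popcount
}
```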
#[derive(Clone)] #[derive(Clone)]
pub struct BitSet { pub struct BitSet {
tinysets: Box<[TinySet]>, tinysets: Box<[TinySet]>,
len: usize, len: usize, //< Technically it should be u32, but we
// count multiple inserts.
// `usize` guards us from overflow.
max_value: u32, max_value: u32,
} }
@@ -203,7 +204,7 @@ mod tests {
use super::BitSet; use super::BitSet;
use super::TinySet; use super::TinySet;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::DocSet;
use crate::query::BitSetDocSet; use crate::query::BitSetDocSet;
use crate::tests; use crate::tests;
use crate::tests::generate_nonunique_unsorted; use crate::tests::generate_nonunique_unsorted;
@@ -277,13 +278,11 @@ mod tests {
} }
assert_eq!(btreeset.len(), bitset.len()); assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset); let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() { for el in btreeset.into_iter() {
assert!(remaining); bitset_docset.advance();
assert_eq!(bitset_docset.doc(), el); assert_eq!(bitset_docset.doc(), el);
remaining = bitset_docset.advance() != TERMINATED;
} }
assert!(!remaining); assert!(!bitset_docset.advance());
} }
#[test] #[test]
View File
@@ -10,9 +10,7 @@ pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter; pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize}; pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::{ pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
};
pub use byteorder::LittleEndian as Endianness; pub use byteorder::LittleEndian as Endianness;
/// Segment's max doc must be `< MAX_DOC_LIMIT`. /// Segment's max doc must be `< MAX_DOC_LIMIT`.
@@ -20,19 +18,6 @@ pub use byteorder::LittleEndian as Endianness;
/// We do not allow segments with more than `2^31 - 1` documents. /// We do not allow segments with more than `2^31 - 1` documents.
pub const MAX_DOC_LIMIT: u32 = 1 << 31; pub const MAX_DOC_LIMIT: u32 = 1 << 31;
pub fn minmax<I, T>(mut vals: I) -> Option<(T, T)>
where
I: Iterator<Item = T>,
T: Copy + Ord,
{
if let Some(first_el) = vals.next() {
return Some(vals.fold((first_el, first_el), |(min_val, max_val), el| {
(min_val.min(el), max_val.max(el))
}));
}
None
}
/// Computes the number of bits that will be used for bitpacking. /// Computes the number of bits that will be used for bitpacking.
/// ///
/// In general the target is the minimum number of bits /// In general the target is the minimum number of bits
@@ -149,7 +134,6 @@ pub fn u64_to_f64(val: u64) -> f64 {
#[cfg(test)] #[cfg(test)]
pub(crate) mod test { pub(crate) mod test {
pub use super::minmax;
pub use super::serialize::test::fixed_size_test; pub use super::serialize::test::fixed_size_test;
use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use std::f64; use std::f64;
@@ -215,21 +199,4 @@ pub(crate) mod test {
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0); assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0); assert!((super::MAX_DOC_LIMIT as i32) < 0);
} }
#[test]
fn test_minmax_empty() {
let vals: Vec<u32> = vec![];
assert_eq!(minmax(vals.into_iter()), None);
}
#[test]
fn test_minmax_one() {
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
}
#[test]
fn test_minmax_two() {
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
}
} }
View File
@@ -89,19 +89,6 @@ impl FixedSize for u64 {
const SIZE_IN_BYTES: usize = 8; const SIZE_IN_BYTES: usize = 8;
} }
impl BinarySerializable for f32 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_f32::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
reader.read_f32::<Endianness>()
}
}
impl FixedSize for f32 {
const SIZE_IN_BYTES: usize = 4;
}
impl BinarySerializable for i64 { impl BinarySerializable for i64 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_i64::<Endianness>(*self) writer.write_i64::<Endianness>(*self)
View File
@@ -5,12 +5,12 @@ use std::io::Read;
use std::io::Write; use std::io::Write;
/// Wrapper over a `u64` that serializes as a variable int. /// Wrapper over a `u64` that serializes as a variable int.
#[derive(Clone, Copy, Debug, Eq, PartialEq)] #[derive(Debug, Eq, PartialEq)]
pub struct VInt(pub u64); pub struct VInt(pub u64);
const STOP_BIT: u8 = 128; const STOP_BIT: u8 = 128;
pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] { pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
const START_2: u64 = 1 << 7; const START_2: u64 = 1 << 7;
const START_3: u64 = 1 << 14; const START_3: u64 = 1 << 14;
const START_4: u64 = 1 << 21; const START_4: u64 = 1 << 21;
@@ -29,7 +29,7 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
let val = u64::from(val); let val = u64::from(val);
const STOP_BIT: u64 = 128u64; const STOP_BIT: u64 = 128u64;
let (res, num_bytes) = match val { match val {
0..=STOP_1 => (val | STOP_BIT, 1), 0..=STOP_1 => (val | STOP_BIT, 1),
START_2..=STOP_2 => ( START_2..=STOP_2 => (
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)), (val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
@@ -56,9 +56,7 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
| (STOP_BIT << (8 * 4)), | (STOP_BIT << (8 * 4)),
5, 5,
), ),
}; }
LittleEndian::write_u64(&mut buf[..], res);
&buf[0..num_bytes]
} }
/// Returns the number of bytes covered by a /// Returns the number of bytes covered by a
@@ -87,26 +85,23 @@ fn vint_len(data: &[u8]) -> usize {
/// If the buffer does not start by a valid /// If the buffer does not start by a valid
/// vint payload /// vint payload
pub fn read_u32_vint(data: &mut &[u8]) -> u32 { pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
let (result, vlen) = read_u32_vint_no_advance(*data); let vlen = vint_len(*data);
*data = &data[vlen..];
result
}
pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
let vlen = vint_len(data);
let mut result = 0u32; let mut result = 0u32;
let mut shift = 0u64; let mut shift = 0u64;
for &b in &data[..vlen] { for &b in &data[..vlen] {
result |= u32::from(b & 127u8) << shift; result |= u32::from(b & 127u8) << shift;
shift += 7; shift += 7;
} }
(result, vlen) *data = &data[vlen..];
result
} }
/// Write a `u32` as a vint payload. /// Write a `u32` as a vint payload.
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> { pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
let mut buf = [0u8; 8]; let (val, num_bytes) = serialize_vint_u32(val);
let data = serialize_vint_u32(val, &mut buf); let mut buffer = [0u8; 8];
writer.write_all(&data) LittleEndian::write_u64(&mut buffer, val);
writer.write_all(&buffer[..num_bytes])
} }
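Both versions implement the same format: a varint with 7 payload bits per byte and the high bit (`STOP_BIT`) set on the final byte. The branchy `serialize_vint_u32` above is an unrolled, masked version of the straightforward loop; for reference, a loop-form sketch (names are ours):

```rust
/// Encode `val` as a vint: 7 bits per byte, stop bit on the last byte.
fn write_vint(mut val: u32, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 127) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte | 128); // STOP_BIT marks the final byte
            return;
        }
        out.push(byte);
    }
}

/// Decode a vint, returning the value and the number of bytes consumed.
fn read_vint(data: &[u8]) -> (u32, usize) {
    let mut result = 0u32;
    for (i, &b) in data.iter().enumerate() {
        result |= u32::from(b & 127) << (7 * i);
        if b & 128 != 0 {
            return (result, i + 1); // stop bit seen: the value is complete
        }
    }
    panic!("invalid vint: no stop bit found");
}
```

A value below 128 thus occupies a single byte with its stop bit set, which is exactly the `0..=STOP_1 => (val | STOP_BIT, 1)` arm above.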
impl VInt { impl VInt {
@@ -177,6 +172,7 @@ mod tests {
use super::serialize_vint_u32; use super::serialize_vint_u32;
use super::VInt; use super::VInt;
use crate::common::BinarySerializable; use crate::common::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian};
fn aux_test_vint(val: u64) { fn aux_test_vint(val: u64) {
let mut v = [14u8; 10]; let mut v = [14u8; 10];
@@ -212,10 +208,12 @@ mod tests {
fn aux_test_serialize_vint_u32(val: u32) { fn aux_test_serialize_vint_u32(val: u32) {
let mut buffer = [0u8; 10]; let mut buffer = [0u8; 10];
let mut buffer2 = [0u8; 8]; let mut buffer2 = [0u8; 10];
let len_vint = VInt(val as u64).serialize_into(&mut buffer); let len_vint = VInt(val as u64).serialize_into(&mut buffer);
let res2 = serialize_vint_u32(val, &mut buffer2); let (vint, len) = serialize_vint_u32(val);
assert_eq!(&buffer[..len_vint], res2, "array wrong for {}", val); assert_eq!(len, len_vint, "len wrong for val {}", val);
LittleEndian::write_u64(&mut buffer2, vint);
assert_eq!(&buffer[..len], &buffer2[..len], "array wrong for {}", val);
} }
#[test] #[test]
View File
@@ -1,6 +1,5 @@
use crossbeam::channel; use crossbeam::channel;
use rayon::{ThreadPool, ThreadPoolBuilder}; use rayon::{ThreadPool, ThreadPoolBuilder};
use slog::{error, Logger};
/// Search executor, whether search requests are single-threaded or multithreaded. /// Search executor, whether search requests are single-threaded or multithreaded.
/// ///
@@ -44,7 +43,6 @@ impl Executor {
&self, &self,
f: F, f: F,
args: AIterator, args: AIterator,
logger: Logger,
) -> crate::Result<Vec<R>> { ) -> crate::Result<Vec<R>> {
match self { match self {
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(), Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
@@ -59,7 +57,7 @@ impl Executor {
let (idx, arg) = arg_with_idx; let (idx, arg) = arg_with_idx;
let fruit = f(arg); let fruit = f(arg);
if let Err(err) = fruit_sender.send((idx, fruit)) { if let Err(err) = fruit_sender.send((idx, fruit)) {
error!(logger, "Failed to send search task. It probably means all search threads have panicked. {:?}", err); error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
} }
}); });
} }
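Apart from the logging, both versions of `map` share the same fan-out/fan-in shape: spawn one task per input on the pool, send `(index, result)` pairs over a channel, and reassemble in input order. A condensed sketch of that shape, using the same `rayon` and `crossbeam` primitives but with our own names and without the `crate::Result` plumbing:

```rust
use crossbeam::channel;
use rayon::ThreadPool;

/// Fan out `f` over indexed inputs, then reassemble results in input order.
fn ordered_map(pool: &ThreadPool, inputs: Vec<usize>, f: fn(usize) -> usize) -> Vec<usize> {
    let n = inputs.len();
    let (tx, rx) = channel::unbounded();
    pool.scope(|scope| {
        for (idx, input) in inputs.into_iter().enumerate() {
            let tx = tx.clone();
            // Each task tags its result with the input index.
            scope.spawn(move |_| {
                let _ = tx.send((idx, f(input)));
            });
        }
    });
    drop(tx); // close the channel so the drain loop below terminates
    let mut slots: Vec<Option<usize>> = vec![None; n];
    for (idx, value) in rx {
        slots[idx] = Some(value);
    }
    slots.into_iter().map(Option::unwrap).collect()
}
```

With `f` set to double its input, this reproduces what `test_map_multithread` further down checks.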
@@ -89,21 +87,17 @@ impl Executor {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use slog::{o, Discard, Logger};
use super::Executor; use super::Executor;
#[test] #[test]
#[should_panic(expected = "panic should propagate")] #[should_panic(expected = "panic should propagate")]
fn test_panic_propagates_single_thread() { fn test_panic_propagates_single_thread() {
let logger = Logger::root(Discard, o!());
let _result: Vec<usize> = Executor::single_thread() let _result: Vec<usize> = Executor::single_thread()
.map( .map(
|_| { |_| {
panic!("panic should propagate"); panic!("panic should propagate");
}, },
vec![0].into_iter(), vec![0].into_iter(),
logger,
) )
.unwrap(); .unwrap();
} }
@@ -111,7 +105,6 @@ mod tests {
#[test] #[test]
#[should_panic] //< unfortunately the panic message is not propagated #[should_panic] //< unfortunately the panic message is not propagated
fn test_panic_propagates_multi_thread() { fn test_panic_propagates_multi_thread() {
let logger = Logger::root(Discard, o!());
let _result: Vec<usize> = Executor::multi_thread(1, "search-test") let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
.unwrap() .unwrap()
.map( .map(
@@ -119,16 +112,14 @@ mod tests {
panic!("panic should propagate"); panic!("panic should propagate");
}, },
vec![0].into_iter(), vec![0].into_iter(),
logger,
) )
.unwrap(); .unwrap();
} }
#[test] #[test]
fn test_map_singlethread() { fn test_map_singlethread() {
let logger = Logger::root(Discard, o!());
let result: Vec<usize> = Executor::single_thread() let result: Vec<usize> = Executor::single_thread()
.map(|i| Ok(i * 2), 0..1_000, logger) .map(|i| Ok(i * 2), 0..1_000)
.unwrap(); .unwrap();
assert_eq!(result.len(), 1_000); assert_eq!(result.len(), 1_000);
for i in 0..1_000 { for i in 0..1_000 {
@@ -138,10 +129,9 @@ mod tests {
#[test] #[test]
fn test_map_multithread() { fn test_map_multithread() {
let logger = Logger::root(Discard, o!());
let result: Vec<usize> = Executor::multi_thread(3, "search-test") let result: Vec<usize> = Executor::multi_thread(3, "search-test")
.unwrap() .unwrap()
.map(|i| Ok(i * 2), 0..10, logger) .map(|i| Ok(i * 2), 0..10)
.unwrap(); .unwrap();
assert_eq!(result.len(), 10); assert_eq!(result.len(), 10);
for i in 0..10 { for i in 0..10 {
View File
@@ -1,3 +1,4 @@
use super::segment::create_segment;
use super::segment::Segment; use super::segment::Segment;
use crate::core::Executor; use crate::core::Executor;
use crate::core::IndexMeta; use crate::core::IndexMeta;
@@ -21,14 +22,12 @@ use crate::schema::FieldType;
use crate::schema::Schema; use crate::schema::Schema;
use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter; use crate::IndexWriter;
use slog::Logger; use num_cpus;
use std::borrow::BorrowMut; use std::borrow::BorrowMut;
use std::collections::HashSet; use std::collections::HashSet;
use std::fmt; use std::fmt;
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
use std::path::Path; use std::path::{Path, PathBuf};
use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
fn load_metas( fn load_metas(
@@ -58,14 +57,7 @@ pub struct Index {
} }
impl Index { impl Index {
/// Examines the directory to see if it contains an index.
pub(crate) fn logger(&self) -> &Logger {
self.directory.logger()
}
/// Examines the directory to see if it contains an index.
///
/// Effectively, it only checks for the presence of the `meta.json` file.
pub fn exists<Dir: Directory>(dir: &Dir) -> bool { pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
dir.exists(&META_FILEPATH) dir.exists(&META_FILEPATH)
} }
@@ -148,18 +140,16 @@ impl Index {
Index::create(mmap_directory, schema) Index::create(mmap_directory, schema)
} }
/// Creates a new index given an implementation of the trait `Directory`. /// Creates a new index given an implementation of the trait `Directory`
///
/// If a directory previously existed, it will be erased.
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> { pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
let directory = ManagedDirectory::wrap(dir)?; let directory = ManagedDirectory::wrap(dir)?;
Index::new_from_directory(directory, schema) Index::from_directory(directory, schema)
} }
/// Create a new index from a directory. /// Create a new index from a directory.
/// ///
/// This will overwrite existing meta.json /// This will overwrite existing meta.json
fn new_from_directory(mut directory: ManagedDirectory, schema: Schema) -> crate::Result<Index> { fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> crate::Result<Index> {
save_new_metas(schema.clone(), directory.borrow_mut())?; save_new_metas(schema.clone(), directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema); let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas, SegmentMetaInventory::default()) Index::create_from_metas(directory, &metas, SegmentMetaInventory::default())
@@ -250,8 +240,6 @@ impl Index {
/// Open the index using the provided directory /// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> crate::Result<Index> { pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
let logger: &Logger = directory.logger();
slog::info!(logger, "index-open"; "directory" => format!("{:?}", directory));
let directory = ManagedDirectory::wrap(directory)?; let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default(); let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?; let metas = load_metas(&directory, &inventory)?;
@@ -295,7 +283,7 @@ impl Index {
TantivyError::LockFailure( TantivyError::LockFailure(
err, err,
Some( Some(
"Failed to acquire index lock. If you are using \ "Failed to acquire index lock. If you are using\
a regular directory, this means there is already an \ a regular directory, this means there is already an \
`IndexWriter` working on this `Directory`, in this process \ `IndexWriter` working on this `Directory`, in this process \
or in a different process." or in a different process."
@@ -312,15 +300,6 @@ impl Index {
) )
} }
/// Helper to create an index writer for tests.
///
/// That index writer has a single thread and a heap of 10 MB.
/// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)]
pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
self.writer_with_num_threads(1, 10_000_000)
}
/// Creates a multithreaded writer /// Creates a multithreaded writer
/// ///
/// Tantivy will automatically define the number of threads to use. /// Tantivy will automatically define the number of threads to use.
@@ -358,7 +337,7 @@ impl Index {
#[doc(hidden)] #[doc(hidden)]
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment { pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
Segment::for_index(self.clone(), segment_meta) create_segment(self.clone(), segment_meta)
} }
/// Creates a new segment. /// Creates a new segment.
@@ -523,7 +502,7 @@ mod tests {
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let mut index = Index::create_from_tempdir(schema).unwrap(); let mut index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_for_tests().unwrap(); let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap(); writer.commit().unwrap();
let reader = index let reader = index
.reader_builder() .reader_builder()
@@ -560,33 +539,23 @@ mod tests {
test_index_on_commit_reload_policy_aux(field, &write_index, &reader); test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
} }
} }
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) { fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
let mut reader_index = reader.index(); let mut reader_index = reader.index();
let (sender, receiver) = crossbeam::channel::unbounded(); let (sender, receiver) = crossbeam::channel::unbounded();
let _watch_handle = reader_index.directory_mut().watch(Box::new(move || { let _watch_handle = reader_index.directory_mut().watch(Box::new(move || {
let _ = sender.send(()); let _ = sender.send(());
})); }));
let mut writer = index.writer_for_tests().unwrap(); let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64)); writer.add_document(doc!(field=>1u64));
writer.commit().unwrap(); writer.commit().unwrap();
// We need a loop here because it is possible for notify to send more than assert!(receiver.recv().is_ok());
// one modify event. It was observed on CI on MacOS. assert_eq!(reader.searcher().num_docs(), 1);
loop {
assert!(receiver.recv().is_ok());
if reader.searcher().num_docs() == 1 {
break;
}
}
writer.add_document(doc!(field=>2u64)); writer.add_document(doc!(field=>2u64));
writer.commit().unwrap(); writer.commit().unwrap();
// ... Same as above assert!(receiver.recv().is_ok());
loop { assert_eq!(reader.searcher().num_docs(), 2);
assert!(receiver.recv().is_ok());
if reader.searcher().num_docs() == 2 {
break;
}
}
} }
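The loops introduced on the left-hand side exist because a polling (or `notify`-based) watcher may deliver several events per commit, so the test has to re-check its condition after every event instead of assuming one event per commit. The pattern factors into a small helper; a sketch (names are ours):

```rust
use crossbeam::channel::Receiver;

/// Block until `reached()` holds, consuming one watch event per check.
/// Spurious extra events (observed with `notify` on macOS CI) are simply
/// absorbed by the next iteration.
fn wait_until(watch_events: &Receiver<()>, mut reached: impl FnMut() -> bool) {
    loop {
        watch_events
            .recv()
            .expect("watch channel closed unexpectedly");
        if reached() {
            return;
        }
    }
}
```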
// This test will not pass on windows, because windows // This test will not pass on windows, because windows
View File
@@ -3,7 +3,8 @@ use crate::core::SegmentId;
use crate::schema::Schema; use crate::schema::Schema;
use crate::Opstamp; use crate::Opstamp;
use census::{Inventory, TrackedObject}; use census::{Inventory, TrackedObject};
use serde::{Deserialize, Serialize}; use serde;
use serde_json;
use std::collections::HashSet; use std::collections::HashSet;
use std::fmt; use std::fmt;
use std::path::PathBuf; use std::path::PathBuf;
@@ -213,7 +214,7 @@ pub struct IndexMeta {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
/// Payload associated to the last commit. /// Payload associated to the last commit.
/// ///
/// Upon commit, clients can optionally add a small `String` payload to their commit /// Upon commit, clients can optionally add a small `String` payload to their commit
/// to help identify this commit. /// to help identify this commit.
/// This payload is entirely unused by tantivy. /// This payload is entirely unused by tantivy.
pub payload: Option<String>, pub payload: Option<String>,
View File
@@ -3,9 +3,11 @@ use crate::directory::ReadOnlySource;
use crate::positions::PositionReader; use crate::positions::PositionReader;
use crate::postings::TermInfo; use crate::postings::TermInfo;
use crate::postings::{BlockSegmentPostings, SegmentPostings}; use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::FieldType;
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::schema::Term; use crate::schema::Term;
use crate::termdict::TermDictionary; use crate::termdict::TermDictionary;
use owned_read::OwnedRead;
/// The inverted index reader is in charge of accessing /// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field. /// the inverted index associated to a specific field.
@@ -53,7 +55,10 @@ impl InvertedIndexReader {
/// Creates an empty `InvertedIndexReader` object, which /// Creates an empty `InvertedIndexReader` object, which
/// contains no terms at all. /// contains no terms at all.
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader { pub fn empty(field_type: &FieldType) -> InvertedIndexReader {
let record_option = field_type
.get_index_record_option()
.unwrap_or(IndexRecordOption::Basic);
InvertedIndexReader { InvertedIndexReader {
termdict: TermDictionary::empty(), termdict: TermDictionary::empty(),
postings_source: ReadOnlySource::empty(), postings_source: ReadOnlySource::empty(),
@@ -92,7 +97,8 @@ impl InvertedIndexReader {
let offset = term_info.postings_offset as usize; let offset = term_info.postings_offset as usize;
let end_source = self.postings_source.len(); let end_source = self.postings_source.len();
let postings_slice = self.postings_source.slice(offset, end_source); let postings_slice = self.postings_source.slice(offset, end_source);
block_postings.reset(term_info.doc_freq, postings_slice); let postings_reader = OwnedRead::new(postings_slice);
block_postings.reset(term_info.doc_freq, postings_reader);
} }
/// Returns a block postings given a `Term`. /// Returns a block postings given a `Term`.
@@ -121,7 +127,7 @@ impl InvertedIndexReader {
let postings_data = self.postings_source.slice_from(offset); let postings_data = self.postings_source.slice_from(offset);
BlockSegmentPostings::from_data( BlockSegmentPostings::from_data(
term_info.doc_freq, term_info.doc_freq,
postings_data, OwnedRead::new(postings_data),
self.record_option, self.record_option,
requested_option, requested_option,
) )
View File
@@ -1,8 +1,11 @@
use crate::collector::Collector; use crate::collector::Collector;
use crate::collector::SegmentCollector;
use crate::core::Executor; use crate::core::Executor;
use crate::core::InvertedIndexReader; use crate::core::InvertedIndexReader;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::query::Query; use crate::query::Query;
use crate::query::Scorer;
use crate::query::Weight;
use crate::schema::Document; use crate::schema::Document;
use crate::schema::Schema; use crate::schema::Schema;
use crate::schema::{Field, Term}; use crate::schema::{Field, Term};
@@ -14,6 +17,26 @@ use crate::Index;
use std::fmt; use std::fmt;
use std::sync::Arc; use std::sync::Arc;
fn collect_segment<C: Collector>(
collector: &C,
weight: &dyn Weight,
segment_ord: u32,
segment_reader: &SegmentReader,
) -> crate::Result<C::Fruit> {
let mut scorer = weight.scorer(segment_reader)?;
let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
if let Some(delete_bitset) = segment_reader.delete_bitset() {
scorer.for_each(&mut |doc, score| {
if delete_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
});
} else {
scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
}
Ok(segment_collector.harvest())
}
/// Holds a list of `SegmentReader`s ready for search. /// Holds a list of `SegmentReader`s ready for search.
/// ///
/// It guarantees that the `Segment` will not be removed before /// It guarantees that the `Segment` will not be removed before
@@ -140,10 +163,14 @@ impl Searcher {
let segment_readers = self.segment_readers(); let segment_readers = self.segment_readers();
let fruits = executor.map( let fruits = executor.map(
|(segment_ord, segment_reader)| { |(segment_ord, segment_reader)| {
collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader) collect_segment(
collector,
weight.as_ref(),
segment_ord as u32,
segment_reader,
)
}, },
segment_readers.iter().enumerate(), segment_readers.iter().enumerate(),
self.index.logger().clone(),
)?; )?;
collector.merge_fruits(fruits) collector.merge_fruits(fruits)
} }
View File
@@ -24,12 +24,15 @@ impl fmt::Debug for Segment {
} }
} }
impl Segment { /// Creates a new segment given an `Index` and a `SegmentId`
/// Creates a new segment given an `Index` and a `SegmentId` ///
pub(crate) fn for_index(index: Index, meta: SegmentMeta) -> Segment { /// The function is here to make it private outside `tantivy`.
Segment { index, meta } /// #[doc(hidden)]
} pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
Segment { index, meta }
}
impl Segment {
/// Returns the index the segment belongs to. /// Returns the index the segment belongs to.
pub fn index(&self) -> &Index { pub fn index(&self) -> &Index {
&self.index &self.index
View File
@@ -4,7 +4,6 @@ use uuid::Uuid;
#[cfg(test)] #[cfg(test)]
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::error::Error; use std::error::Error;
use std::str::FromStr; use std::str::FromStr;
#[cfg(test)] #[cfg(test)]
@@ -21,12 +20,6 @@ use std::sync::atomic;
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(Uuid); pub struct SegmentId(Uuid);
impl ToString for SegmentId {
fn to_string(&self) -> String {
self.short_uuid_string()
}
}
#[cfg(test)] #[cfg(test)]
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default()); static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default());
View File
@@ -8,16 +8,15 @@ use crate::directory::ReadOnlySource;
use crate::fastfield::DeleteBitSet; use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader; use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders; use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::fieldnorm::FieldNormReader;
use crate::schema::Field;
use crate::schema::FieldType; use crate::schema::FieldType;
use crate::schema::Schema; use crate::schema::Schema;
use crate::schema::{Field, IndexRecordOption};
use crate::space_usage::SegmentSpaceUsage; use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader; use crate::store::StoreReader;
use crate::termdict::TermDictionary; use crate::termdict::TermDictionary;
use crate::DocId; use crate::DocId;
use fail::fail_point; use fail::fail_point;
use slog::{warn, Logger};
use std::collections::HashMap; use std::collections::HashMap;
use std::fmt; use std::fmt;
use std::sync::Arc; use std::sync::Arc;
@@ -49,12 +48,11 @@ pub struct SegmentReader {
positions_composite: CompositeFile, positions_composite: CompositeFile,
positions_idx_composite: CompositeFile, positions_idx_composite: CompositeFile,
fast_fields_readers: Arc<FastFieldReaders>, fast_fields_readers: Arc<FastFieldReaders>,
fieldnorm_readers: FieldNormReaders, fieldnorms_composite: CompositeFile,
store_source: ReadOnlySource, store_source: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>, delete_bitset_opt: Option<DeleteBitSet>,
schema: Schema, schema: Schema,
logger: Logger,
} }
impl SegmentReader { impl SegmentReader {
@@ -127,15 +125,17 @@ impl SegmentReader {
/// ///
/// They are simply stored as a fast field, serialized in /// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment. /// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> { pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
self.fieldnorm_readers.get_field(field).ok_or_else(|| { if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
FieldNormReader::open(fieldnorm_source)
} else {
let field_name = self.schema.get_field_name(field); let field_name = self.schema.get_field_name(field);
let err_msg = format!( let err_msg = format!(
"Field norm not found for field {:?}. Was it marked as indexed during indexing?", "Field norm not found for field {:?}. Was it marked as indexed during indexing?",
field_name field_name
); );
crate::TantivyError::SchemaError(err_msg) panic!(err_msg);
}) }
} }
/// Accessor to the segment's `StoreReader`. /// Accessor to the segment's `StoreReader`.
@@ -178,8 +178,8 @@ impl SegmentReader {
let fast_field_readers = let fast_field_readers =
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?); Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?; let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
let delete_bitset_opt = if segment.meta().has_deletes() { let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?; let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -195,14 +195,13 @@ impl SegmentReader {
termdict_composite, termdict_composite,
postings_composite, postings_composite,
fast_fields_readers: fast_field_readers, fast_fields_readers: fast_field_readers,
fieldnorm_readers, fieldnorms_composite,
segment_id: segment.id(), segment_id: segment.id(),
store_source, store_source,
delete_bitset_opt, delete_bitset_opt,
positions_composite, positions_composite,
positions_idx_composite, positions_idx_composite,
schema, schema,
logger: segment.index().logger().clone(),
}) })
} }
@@ -213,11 +212,6 @@ impl SegmentReader {
/// The field reader is in charge of iterating through the /// The field reader is in charge of iterating through the
/// term dictionary associated to a specific field, /// term dictionary associated to a specific field,
/// and opening the posting list associated to any term. /// and opening the posting list associated to any term.
///
/// If the field is not marked as indexed, a warning is logged and an empty `InvertedIndexReader`
/// is returned.
/// Similarly, if the field is marked as indexed but no term has been indexed for the given
/// index, an empty `InvertedIndexReader` is returned (but no warning is logged).
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> { pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
if let Some(inv_idx_reader) = self if let Some(inv_idx_reader) = self
.inv_idx_reader_cache .inv_idx_reader_cache
@@ -232,25 +226,21 @@ impl SegmentReader {
let record_option_opt = field_type.get_index_record_option(); let record_option_opt = field_type.get_index_record_option();
if record_option_opt.is_none() { if record_option_opt.is_none() {
warn!( panic!("Field {:?} does not seem indexed.", field_entry.name());
self.logger,
"Field {:?} does not seem indexed.",
field_entry.name()
);
} }
let record_option = record_option_opt.unwrap();
let postings_source_opt = self.postings_composite.open_read(field); let postings_source_opt = self.postings_composite.open_read(field);
if postings_source_opt.is_none() || record_option_opt.is_none() { if postings_source_opt.is_none() {
// no documents in the segment contained this field. // no documents in the segment contained this field.
// As a result, no data is associated to the inverted index. // As a result, no data is associated to the inverted index.
// //
// Returns an empty inverted index. // Returns an empty inverted index.
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic); return Arc::new(InvertedIndexReader::empty(field_type));
return Arc::new(InvertedIndexReader::empty(record_option));
} }
let record_option = record_option_opt.unwrap();
let postings_source = postings_source_opt.unwrap(); let postings_source = postings_source_opt.unwrap();
let termdict_source = self.termdict_composite.open_read(field).expect( let termdict_source = self.termdict_composite.open_read(field).expect(
@@ -305,8 +295,8 @@ impl SegmentReader {
} }
/// Returns an iterator that will iterate over the alive document ids /// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a { pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc)) SegmentReaderAliveDocsIterator::new(&self)
} }
/// Summarize total space usage of this segment. /// Summarize total space usage of this segment.
@@ -318,7 +308,7 @@ impl SegmentReader {
self.positions_composite.space_usage(), self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(), self.positions_idx_composite.space_usage(),
self.fast_fields_readers.space_usage(), self.fast_fields_readers.space_usage(),
self.fieldnorm_readers.space_usage(), self.fieldnorms_composite.space_usage(),
self.get_store_reader().space_usage(), self.get_store_reader().space_usage(),
self.delete_bitset_opt self.delete_bitset_opt
.as_ref() .as_ref()
@@ -334,6 +324,52 @@ impl fmt::Debug for SegmentReader {
} }
} }
/// Implements the iterator trait to allow easy iteration
/// over non-deleted ("alive") DocIds in a SegmentReader
pub struct SegmentReaderAliveDocsIterator<'a> {
reader: &'a SegmentReader,
max_doc: DocId,
current: DocId,
}
impl<'a> SegmentReaderAliveDocsIterator<'a> {
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
SegmentReaderAliveDocsIterator {
reader,
max_doc: reader.max_doc(),
current: 0,
}
}
}
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
type Item = DocId;
fn next(&mut self) -> Option<Self::Item> {
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
if self.current >= self.max_doc {
return None;
}
// find the next alive doc id
while self.reader.is_deleted(self.current) {
self.current += 1;
if self.current >= self.max_doc {
return None;
}
}
// capture the current alive DocId
let result = Some(self.current);
// advance past the doc we just captured
self.current += 1;
result
}
}
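
Both columns implement the same contract; a minimal stand-alone sketch of the filtering shown on the left, with a plain slice standing in for tantivy's `DeleteBitSet`:

```rust
// Iterate over non-deleted ("alive") doc ids in 0..max_doc, skipping any
// id flagged in the delete set. `deleted` stands in for DeleteBitSet.
fn alive_doc_ids(deleted: &[bool]) -> impl Iterator<Item = u32> + '_ {
    (0u32..deleted.len() as u32).filter(move |&doc| !deleted[doc as usize])
}

fn main() {
    let deleted = [false, true, false, true, false];
    let alive: Vec<u32> = alive_doc_ids(&deleted).collect();
    assert_eq!(alive, vec![0, 2, 4]);
}
```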
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use crate::core::Index; use crate::core::Index;
@@ -349,7 +385,7 @@ mod test {
let name = schema.get_field("name").unwrap(); let name = schema.get_field("name").unwrap();
{ {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(name => "tantivy")); index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse")); index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey")); index_writer.add_document(doc!(name => "jockey"));


@@ -1,5 +1,3 @@
use slog::{error, Logger};
use crate::directory::directory_lock::Lock; use crate::directory::directory_lock::Lock;
use crate::directory::error::LockError; use crate::directory::error::LockError;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError}; use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
@@ -66,10 +64,7 @@ impl<T: Send + Sync + 'static> From<Box<T>> for DirectoryLock {
impl Drop for DirectoryLockGuard { impl Drop for DirectoryLockGuard {
fn drop(&mut self) { fn drop(&mut self) {
if let Err(e) = self.directory.delete(&*self.path) { if let Err(e) = self.directory.delete(&*self.path) {
error!( error!("Failed to remove the lock file. {:?}", e);
self.directory.logger(),
"Failed to remove the lock file. {:?}", e
);
} }
} }
} }
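
The `Drop` impl above is a standard RAII lock-file guard; a minimal free-standing sketch of the pattern, using plain `std::fs` in place of the `Directory` abstraction:

```rust
// RAII guard: the lock file is removed when the guard goes out of scope,
// even on early returns or during unwinding after a panic.
use std::fs;
use std::path::PathBuf;

struct LockFileGuard {
    path: PathBuf,
}

impl Drop for LockFileGuard {
    fn drop(&mut self) {
        if let Err(e) = fs::remove_file(&self.path) {
            eprintln!("Failed to remove the lock file. {:?}", e);
        }
    }
}

fn main() -> std::io::Result<()> {
    let path = PathBuf::from("example.lock");
    fs::write(&path, b"")?;
    let _guard = LockFileGuard { path };
    // ... critical section ...
    Ok(()) // `_guard` is dropped here; the lock file is deleted.
}
```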
@@ -85,7 +80,7 @@ fn try_acquire_lock(
) -> Result<DirectoryLock, TryAcquireLockError> { ) -> Result<DirectoryLock, TryAcquireLockError> {
let mut write = directory.open_write(filepath).map_err(|e| match e { let mut write = directory.open_write(filepath).map_err(|e| match e {
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists, OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
OpenWriteError::IOError { io_error, .. } => TryAcquireLockError::IOError(io_error), OpenWriteError::IOError(io_error) => TryAcquireLockError::IOError(io_error.into()),
})?; })?;
write.flush().map_err(TryAcquireLockError::IOError)?; write.flush().map_err(TryAcquireLockError::IOError)?;
Ok(DirectoryLock::from(Box::new(DirectoryLockGuard { Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
@@ -214,9 +209,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the /// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
/// `OnCommit` `ReloadPolicy` to work properly. /// `OnCommit` `ReloadPolicy` to work properly.
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>; fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>;
/// Returns the `slog::Logger` configured for the `Directory`.
fn logger(&self) -> &Logger;
} }
/// DirectoryClone /// DirectoryClone


@@ -1,60 +1,160 @@
use crate::Version; use crate::Version;
use std::error::Error as StdError;
use std::fmt; use std::fmt;
use std::io; use std::io;
use std::path::PathBuf; use std::path::PathBuf;
/// Error while trying to acquire a directory lock. /// Error while trying to acquire a directory lock.
#[derive(Debug, Error)] #[derive(Debug, Fail)]
pub enum LockError { pub enum LockError {
/// Failed to acquire a lock as it is already held by another /// Failed to acquire a lock as it is already held by another
/// client. /// client.
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period. /// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call. /// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
#[error("Could not acquire lock as it is already held, possibly by a different process.")] #[fail(
display = "Could not acquire lock as it is already held, possibly by a different process."
)]
LockBusy, LockBusy,
/// Trying to acquire a lock failed with an `IOError` /// Trying to acquire a lock failed with an `IOError`
#[error("Failed to acquire the lock due to an io:Error.")] #[fail(display = "Failed to acquire the lock due to an io:Error.")]
IOError(io::Error), IOError(io::Error),
} }
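
For orientation: `#[derive(Error)]` with `#[error(...)]` comes from the `thiserror` crate, while `#[derive(Fail)]` with `#[fail(display = ...)]` comes from the now-deprecated `failure` crate. A minimal sketch of the thiserror flavor, which derives the `Display` and `std::error::Error` impls that are written by hand elsewhere in this hunk:

```rust
// Minimal thiserror sketch (assumes `thiserror` as a dependency).
use std::io;
use thiserror::Error;

#[derive(Debug, Error)]
pub enum LockError {
    /// The lock is already held, possibly by a different process.
    #[error("Could not acquire lock as it is already held, possibly by a different process.")]
    LockBusy,
    /// Acquiring the lock failed with an io::Error.
    #[error("Failed to acquire the lock due to an io::Error.")]
    IOError(io::Error),
}

fn main() {
    let err = LockError::IOError(io::Error::new(io::ErrorKind::Other, "disk unavailable"));
    println!("{}", err); // uses the derived Display impl
}
```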
/// General IO error with an optional path to the offending file.
#[derive(Debug)]
pub struct IOError {
path: Option<PathBuf>,
err: io::Error,
}
impl Into<io::Error> for IOError {
fn into(self) -> io::Error {
self.err
}
}
impl fmt::Display for IOError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.path {
Some(ref path) => write!(f, "io error occurred on path '{:?}': '{}'", path, self.err),
None => write!(f, "io error occurred: '{}'", self.err),
}
}
}
impl StdError for IOError {
fn description(&self) -> &str {
"io error occurred"
}
fn cause(&self) -> Option<&dyn StdError> {
Some(&self.err)
}
}
impl IOError {
pub(crate) fn with_path(path: PathBuf, err: io::Error) -> Self {
IOError {
path: Some(path),
err,
}
}
}
impl From<io::Error> for IOError {
fn from(err: io::Error) -> IOError {
IOError { path: None, err }
}
}
/// Error that may occur when opening a directory /// Error that may occur when opening a directory
#[derive(Debug, Error)] #[derive(Debug)]
pub enum OpenDirectoryError { pub enum OpenDirectoryError {
/// The underlying directory does not exist. /// The underlying directory does not exist.
#[error("Directory does not exist: '{0}'.")]
DoesNotExist(PathBuf), DoesNotExist(PathBuf),
/// The path exists but is not a directory. /// The path exists but is not a directory.
#[error("Path exists but is not a directory: '{0}'.")]
NotADirectory(PathBuf), NotADirectory(PathBuf),
/// Failed to create a temp directory.
#[error("Failed to create a temporary directory: '{0}'.")]
FailedToCreateTempDir(io::Error),
/// IoError /// IoError
#[error("IOError '{io_error:?}' while create directory in: '{directory_path:?}'.")] IoError(io::Error),
IoError { }
/// underlying io Error.
io_error: io::Error, impl From<io::Error> for OpenDirectoryError {
/// directory we tried to open. fn from(io_err: io::Error) -> Self {
directory_path: PathBuf, OpenDirectoryError::IoError(io_err)
}, }
}
impl fmt::Display for OpenDirectoryError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
OpenDirectoryError::DoesNotExist(ref path) => {
write!(f, "the underlying directory '{:?}' does not exist", path)
}
OpenDirectoryError::NotADirectory(ref path) => {
write!(f, "the path '{:?}' exists but is not a directory", path)
}
OpenDirectoryError::IoError(ref err) => write!(
f,
"IOError while trying to open/create the directory. {:?}",
err
),
}
}
}
impl StdError for OpenDirectoryError {
fn description(&self) -> &str {
"error occurred while opening a directory"
}
fn cause(&self) -> Option<&dyn StdError> {
None
}
} }
/// Error that may occur when starting to write in a file /// Error that may occur when starting to write in a file
#[derive(Debug, Error)] #[derive(Debug)]
pub enum OpenWriteError { pub enum OpenWriteError {
/// Our directory is WORM; writing an existing file is forbidden. /// Our directory is WORM; writing an existing file is forbidden.
/// Check out the `Directory` documentation. /// Check out the `Directory` documentation.
#[error("File already exists: '{0}'")]
FileAlreadyExists(PathBuf), FileAlreadyExists(PathBuf),
/// Any kind of IO error that happens when /// Any kind of IO error that happens when
/// writing in the underlying IO device. /// writing in the underlying IO device.
#[error("IOError '{io_error:?}' while opening file for write: '{filepath}'.")] IOError(IOError),
IOError { }
/// The underlying `io::Error`.
io_error: io::Error, impl From<IOError> for OpenWriteError {
/// File path of the file that tantivy failed to open for write. fn from(err: IOError) -> OpenWriteError {
filepath: PathBuf, OpenWriteError::IOError(err)
}, }
}
impl fmt::Display for OpenWriteError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
OpenWriteError::FileAlreadyExists(ref path) => {
write!(f, "the file '{:?}' already exists", path)
}
OpenWriteError::IOError(ref err) => write!(
f,
"an io error occurred while opening a file for writing: '{}'",
err
),
}
}
}
impl StdError for OpenWriteError {
fn description(&self) -> &str {
"error occurred while opening a file for writing"
}
fn cause(&self) -> Option<&dyn StdError> {
match *self {
OpenWriteError::FileAlreadyExists(_) => None,
OpenWriteError::IOError(ref err) => Some(err),
}
}
} }
/// Type of index incompatibility between the library and the index found on disk /// Type of index incompatibility between the library and the index found on disk
@@ -117,41 +217,55 @@ impl fmt::Debug for Incompatibility {
} }
/// Error that may occur when accessing a file for reading /// Error that may occur when accessing a file for reading
#[derive(Debug, Error)] #[derive(Debug)]
pub enum OpenReadError { pub enum OpenReadError {
/// The file does not exist. /// The file does not exist.
#[error("File does not exist: {0:?}")]
FileDoesNotExist(PathBuf),
/// Any kind of io::Error.
#[error(
"IOError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
)]
IOError {
/// The underlying `io::Error`.
io_error: io::Error,
/// File path of the file that tantivy failed to open for read.
filepath: PathBuf,
},
/// This library does not support the index version found in file footer.
#[error("Index version unsupported: {0:?}")]
IncompatibleIndex(Incompatibility),
}
/// Error that may occur when trying to delete a file
#[derive(Debug, Error)]
pub enum DeleteError {
/// The file does not exist.
#[error("File does not exist: '{0}'.")]
FileDoesNotExist(PathBuf), FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when /// Any kind of IO error that happens when
/// interacting with the underlying IO device. /// interacting with the underlying IO device.
#[error("The following IO error happened while deleting file '{filepath}': '{io_error:?}'.")] IOError(IOError),
IOError { /// This library doesn't support the index version found on disk
/// The underlying `io::Error`. IncompatibleIndex(Incompatibility),
io_error: io::Error, }
/// File path of the file that tantivy failed to delete.
filepath: PathBuf, impl From<IOError> for OpenReadError {
}, fn from(err: IOError) -> OpenReadError {
OpenReadError::IOError(err)
}
}
impl fmt::Display for OpenReadError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
OpenReadError::FileDoesNotExist(ref path) => {
write!(f, "the file '{:?}' does not exist", path)
}
OpenReadError::IOError(ref err) => write!(
f,
"an io error occurred while opening a file for reading: '{}'",
err
),
OpenReadError::IncompatibleIndex(ref footer) => {
write!(f, "Incompatible index format: {:?}", footer)
}
}
}
}
/// Error that may occur when trying to delete a file
#[derive(Debug)]
pub enum DeleteError {
/// The file does not exist.
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(IOError),
}
impl From<IOError> for DeleteError {
fn from(err: IOError) -> DeleteError {
DeleteError::IOError(err)
}
} }
impl From<Incompatibility> for OpenReadError { impl From<Incompatibility> for OpenReadError {
@@ -159,3 +273,29 @@ impl From<Incompatibility> for OpenReadError {
OpenReadError::IncompatibleIndex(incompatibility) OpenReadError::IncompatibleIndex(incompatibility)
} }
} }
impl fmt::Display for DeleteError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
DeleteError::FileDoesNotExist(ref path) => {
write!(f, "the file '{:?}' does not exist", path)
}
DeleteError::IOError(ref err) => {
write!(f, "an io error occurred while deleting a file: '{}'", err)
}
}
}
}
impl StdError for DeleteError {
fn description(&self) -> &str {
"error occurred while deleting a file"
}
fn cause(&self) -> Option<&dyn StdError> {
match *self {
DeleteError::FileDoesNotExist(_) => None,
DeleteError::IOError(ref err) => Some(err),
}
}
}


@@ -8,8 +8,6 @@ use crc32fast::Hasher;
use std::io; use std::io;
use std::io::Write; use std::io::Write;
const FOOTER_MAX_LEN: usize = 10_000;
type CrcHashU32 = u32; type CrcHashU32 = u32;
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq)]
@@ -94,24 +92,12 @@ impl Footer {
match &self.versioned_footer { match &self.versioned_footer {
VersionedFooter::V1 { VersionedFooter::V1 {
crc32: _crc, crc32: _crc,
store_compression, store_compression: compression,
} => { } => {
if &library_version.store_compression != store_compression { if &library_version.store_compression != compression {
return Err(Incompatibility::CompressionMismatch { return Err(Incompatibility::CompressionMismatch {
library_compression_format: library_version.store_compression.to_string(), library_compression_format: library_version.store_compression.to_string(),
index_compression_format: store_compression.to_string(), index_compression_format: compression.to_string(),
});
}
Ok(())
}
VersionedFooter::V2 {
crc32: _crc,
store_compression,
} => {
if &library_version.store_compression != store_compression {
return Err(Incompatibility::CompressionMismatch {
library_compression_format: library_version.store_compression.to_string(),
index_compression_format: store_compression.to_string(),
}); });
} }
Ok(()) Ok(())
@@ -132,29 +118,24 @@ pub enum VersionedFooter {
crc32: CrcHashU32, crc32: CrcHashU32,
store_compression: String, store_compression: String,
}, },
// Introduction of the Block WAND information.
V2 {
crc32: CrcHashU32,
store_compression: String,
},
} }
impl BinarySerializable for VersionedFooter { impl BinarySerializable for VersionedFooter {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
let mut buf = Vec::new(); let mut buf = Vec::new();
match self { match self {
VersionedFooter::V2 { VersionedFooter::V1 {
crc32, crc32,
store_compression: compression, store_compression: compression,
} => { } => {
// Serializes a valid `VersionedFooter` or panics if the version is unknown // Serializes a valid `VersionedFooter` or panics if the version is unknown
// [ version | crc_hash | compression_mode ] // [ version | crc_hash | compression_mode ]
// [ 0..4 | 4..8 | variable ] // [ 0..4 | 4..8 | variable ]
BinarySerializable::serialize(&2u32, &mut buf)?; BinarySerializable::serialize(&1u32, &mut buf)?;
BinarySerializable::serialize(crc32, &mut buf)?; BinarySerializable::serialize(crc32, &mut buf)?;
BinarySerializable::serialize(compression, &mut buf)?; BinarySerializable::serialize(compression, &mut buf)?;
} }
VersionedFooter::V1 { .. } | VersionedFooter::UnknownVersion => { VersionedFooter::UnknownVersion => {
return Err(io::Error::new( return Err(io::Error::new(
io::ErrorKind::InvalidInput, io::ErrorKind::InvalidInput,
"Cannot serialize an unknown versioned footer ", "Cannot serialize an unknown versioned footer ",
@@ -162,51 +143,32 @@ impl BinarySerializable for VersionedFooter {
} }
} }
BinarySerializable::serialize(&VInt(buf.len() as u64), writer)?; BinarySerializable::serialize(&VInt(buf.len() as u64), writer)?;
assert!(buf.len() <= FOOTER_MAX_LEN);
writer.write_all(&buf[..])?; writer.write_all(&buf[..])?;
Ok(()) Ok(())
} }
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let len = VInt::deserialize(reader)?.0 as usize; let len = VInt::deserialize(reader)?.0 as usize;
if len > FOOTER_MAX_LEN {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
or the index was created with a different & old version of tantivy.",
len
),
));
}
let mut buf = vec![0u8; len]; let mut buf = vec![0u8; len];
reader.read_exact(&mut buf[..])?; reader.read_exact(&mut buf[..])?;
let mut cursor = &buf[..]; let mut cursor = &buf[..];
let version = u32::deserialize(&mut cursor)?; let version = u32::deserialize(&mut cursor)?;
if version != 1 && version != 2 { if version == 1 {
return Ok(VersionedFooter::UnknownVersion); let crc32 = u32::deserialize(&mut cursor)?;
} let compression = String::deserialize(&mut cursor)?;
let crc32 = u32::deserialize(&mut cursor)?; Ok(VersionedFooter::V1 {
let store_compression = String::deserialize(&mut cursor)?;
Ok(if version == 1 {
VersionedFooter::V1 {
crc32, crc32,
store_compression, store_compression: compression,
} })
} else { } else {
assert_eq!(version, 2); Ok(VersionedFooter::UnknownVersion)
VersionedFooter::V2 { }
crc32,
store_compression,
}
})
} }
} }
impl VersionedFooter { impl VersionedFooter {
pub fn crc(&self) -> Option<CrcHashU32> { pub fn crc(&self) -> Option<CrcHashU32> {
match self { match self {
VersionedFooter::V2 { crc32, .. } => Some(*crc32),
VersionedFooter::V1 { crc32, .. } => Some(*crc32), VersionedFooter::V1 { crc32, .. } => Some(*crc32),
VersionedFooter::UnknownVersion { .. } => None, VersionedFooter::UnknownVersion { .. } => None,
} }
@@ -244,7 +206,7 @@ impl<W: TerminatingWrite> Write for FooterProxy<W> {
impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> { impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> { fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
let crc32 = self.hasher.take().unwrap().finalize(); let crc32 = self.hasher.take().unwrap().finalize();
let footer = Footer::new(VersionedFooter::V2 { let footer = Footer::new(VersionedFooter::V1 {
crc32, crc32,
store_compression: crate::store::COMPRESSION.to_string(), store_compression: crate::store::COMPRESSION.to_string(),
}); });
@@ -259,29 +221,28 @@ mod tests {
use super::CrcHashU32; use super::CrcHashU32;
use super::FooterProxy; use super::FooterProxy;
use crate::common::{BinarySerializable, VInt}; use crate::common::BinarySerializable;
use crate::directory::footer::{Footer, VersionedFooter}; use crate::directory::footer::{Footer, VersionedFooter};
use crate::directory::TerminatingWrite; use crate::directory::TerminatingWrite;
use byteorder::{ByteOrder, LittleEndian}; use byteorder::{ByteOrder, LittleEndian};
use regex::Regex; use regex::Regex;
use std::io;
#[test] #[test]
fn test_versioned_footer() { fn test_versioned_footer() {
let mut vec = Vec::new(); let mut vec = Vec::new();
let footer_proxy = FooterProxy::new(&mut vec); let footer_proxy = FooterProxy::new(&mut vec);
assert!(footer_proxy.terminate().is_ok()); assert!(footer_proxy.terminate().is_ok());
if crate::store::COMPRESSION == "lz4" { assert_eq!(vec.len(), 167);
assert_eq!(vec.len(), 158);
} else {
assert_eq!(vec.len(), 167);
}
let footer = Footer::deserialize(&mut &vec[..]).unwrap(); let footer = Footer::deserialize(&mut &vec[..]).unwrap();
assert!(matches!( if let VersionedFooter::V1 {
footer.versioned_footer, crc32: _,
VersionedFooter::V2 { store_compression, .. } store_compression,
if store_compression == crate::store::COMPRESSION } = footer.versioned_footer
)); {
assert_eq!(store_compression, crate::store::COMPRESSION);
} else {
panic!("Versioned footer should be V1.");
}
assert_eq!(&footer.version, crate::version()); assert_eq!(&footer.version, crate::version());
} }
@@ -289,7 +250,7 @@ mod tests {
fn test_serialize_deserialize_footer() { fn test_serialize_deserialize_footer() {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
let crc32 = 123456u32; let crc32 = 123456u32;
let footer: Footer = Footer::new(VersionedFooter::V2 { let footer: Footer = Footer::new(VersionedFooter::V1 {
crc32, crc32,
store_compression: "lz4".to_string(), store_compression: "lz4".to_string(),
}); });
@@ -301,7 +262,7 @@ mod tests {
#[test] #[test]
fn footer_length() { fn footer_length() {
let crc32 = 1111111u32; let crc32 = 1111111u32;
let versioned_footer = VersionedFooter::V2 { let versioned_footer = VersionedFooter::V1 {
crc32, crc32,
store_compression: "lz4".to_string(), store_compression: "lz4".to_string(),
}; };
@@ -322,7 +283,7 @@ mod tests {
// versioned footer length // versioned footer length
12 | 128, 12 | 128,
// index format version // index format version
2, 1,
0, 0,
0, 0,
0, 0,
@@ -341,7 +302,7 @@ mod tests {
let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap(); let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap();
assert!(cursor.is_empty()); assert!(cursor.is_empty());
let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32; let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
let expected_versioned_footer: VersionedFooter = VersionedFooter::V2 { let expected_versioned_footer: VersionedFooter = VersionedFooter::V1 {
crc32: expected_crc, crc32: expected_crc,
store_compression: "lz4".to_string(), store_compression: "lz4".to_string(),
}; };
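
The hunks above pin down the footer payload layout: a VInt length prefix, then `[ version: u32 LE | crc32: u32 LE | compression string ]`. A hedged stand-alone encoder for that payload (tantivy length-prefixes the string with a VInt; a single byte stands in for it here, which only holds for short strings):

```rust
// Stand-alone sketch of the versioned-footer payload:
// [ version: u32 LE | crc32: u32 LE | compression: len-prefixed bytes ].
fn encode_footer_payload(version: u32, crc32: u32, compression: &str) -> Vec<u8> {
    let mut buf = Vec::new();
    buf.extend_from_slice(&version.to_le_bytes());
    buf.extend_from_slice(&crc32.to_le_bytes());
    buf.push(compression.len() as u8); // simplified length prefix
    buf.extend_from_slice(compression.as_bytes());
    buf
}

fn main() {
    let payload = encode_footer_payload(1, 1_111_111, "lz4");
    // 4 (version) + 4 (crc) + 1 (len) + 3 ("lz4") = 12 bytes,
    // matching the `12` in the length byte of the dump above.
    assert_eq!(payload.len(), 12);
}
```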
@@ -375,20 +336,4 @@ mod tests {
let res = footer.is_compatible(); let res = footer.is_compatible();
assert!(res.is_err()); assert!(res.is_err());
} }
#[test]
fn test_deserialize_too_large_footer() {
let mut buf = vec![];
assert!(FooterProxy::new(&mut buf).terminate().is_ok());
let mut long_len_buf = [0u8; 10];
let num_bytes = VInt(super::FOOTER_MAX_LEN as u64 + 1u64).serialize_into(&mut long_len_buf);
buf[0..num_bytes].copy_from_slice(&long_len_buf[..num_bytes]);
let err = Footer::deserialize(&mut &buf[..]).unwrap_err();
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
assert_eq!(
err.to_string(),
"Footer seems invalid as it suggests a footer len of 10001. File is corrupted, \
or the index was created with a different & old version of tantivy."
);
}
} }


@@ -1,5 +1,5 @@
use crate::core::{MANAGED_FILEPATH, META_FILEPATH}; use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError}; use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy}; use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::DirectoryLock; use crate::directory::DirectoryLock;
use crate::directory::GarbageCollectionResult; use crate::directory::GarbageCollectionResult;
@@ -11,9 +11,10 @@ use crate::error::DataCorruption;
use crate::Directory; use crate::Directory;
use crc32fast::Hasher; use crc32fast::Hasher;
use slog::{debug, error, info}; use serde_json;
use std::collections::HashSet; use std::collections::HashSet;
use std::io; use std::io;
use std::io::Write;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::result; use std::result;
use std::sync::RwLockWriteGuard; use std::sync::RwLockWriteGuard;
@@ -56,9 +57,9 @@ fn save_managed_paths(
directory: &mut dyn Directory, directory: &mut dyn Directory,
wlock: &RwLockWriteGuard<'_, MetaInformation>, wlock: &RwLockWriteGuard<'_, MetaInformation>,
) -> io::Result<()> { ) -> io::Result<()> {
let mut managed_json = serde_json::to_string_pretty(&wlock.managed_paths)?; let mut w = serde_json::to_vec(&wlock.managed_paths)?;
managed_json.push_str("\n"); writeln!(&mut w)?;
directory.atomic_write(&MANAGED_FILEPATH, managed_json.as_bytes())?; directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
Ok(()) Ok(())
} }
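
Both sides serialize the managed-path set as JSON followed by a trailing newline; the left pretty-prints, the right writes compact JSON. A small sketch of the pretty-printed variant, assuming `serde_json` and a stand-in `BTreeSet` of paths:

```rust
// Pretty-printed JSON plus a trailing newline, as on the left-hand side.
use std::collections::BTreeSet;
use std::path::PathBuf;

fn managed_paths_json(paths: &BTreeSet<PathBuf>) -> serde_json::Result<String> {
    let mut managed_json = serde_json::to_string_pretty(paths)?;
    managed_json.push('\n');
    Ok(managed_json)
}

fn main() -> serde_json::Result<()> {
    let mut paths = BTreeSet::new();
    paths.insert(PathBuf::from("meta.json"));
    print!("{}", managed_paths_json(&paths)?);
    Ok(())
}
```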
@@ -86,12 +87,7 @@ impl ManagedDirectory {
directory: Box::new(directory), directory: Box::new(directory),
meta_informations: Arc::default(), meta_informations: Arc::default(),
}), }),
Err(OpenReadError::IOError { io_error, filepath }) => { Err(OpenReadError::IOError(e)) => Err(From::from(e)),
Err(crate::TantivyError::OpenReadError(OpenReadError::IOError {
io_error,
filepath,
}))
}
Err(OpenReadError::IncompatibleIndex(incompatibility)) => { Err(OpenReadError::IncompatibleIndex(incompatibility)) => {
// For the moment, this should never happen `meta.json` // For the moment, this should never happen `meta.json`
// do not have any footer and cannot detect incompatibility. // do not have any footer and cannot detect incompatibility.
@@ -118,7 +114,7 @@ impl ManagedDirectory {
&mut self, &mut self,
get_living_files: L, get_living_files: L,
) -> crate::Result<GarbageCollectionResult> { ) -> crate::Result<GarbageCollectionResult> {
info!(self.directory.logger(), "gc"; "stage"=>"start"); info!("Garbage collect");
let mut files_to_delete = vec![]; let mut files_to_delete = vec![];
// It is crucial to get the living files after acquiring the // It is crucial to get the living files after acquiring the
@@ -153,7 +149,7 @@ impl ManagedDirectory {
} }
} }
Err(err) => { Err(err) => {
error!(self.logger(), "Failed to acquire lock for GC"); error!("Failed to acquire lock for GC");
return Err(crate::TantivyError::from(err)); return Err(crate::TantivyError::from(err));
} }
} }
@@ -165,7 +161,7 @@ impl ManagedDirectory {
for file_to_delete in files_to_delete { for file_to_delete in files_to_delete {
match self.delete(&file_to_delete) { match self.delete(&file_to_delete) {
Ok(_) => { Ok(_) => {
debug!(self.logger(), "deleted-success"; "file"=>format!("{:?}", file_to_delete)); info!("Deleted {:?}", file_to_delete);
deleted_files.push(file_to_delete); deleted_files.push(file_to_delete);
} }
Err(file_error) => { Err(file_error) => {
@@ -173,12 +169,12 @@ impl ManagedDirectory {
DeleteError::FileDoesNotExist(_) => { DeleteError::FileDoesNotExist(_) => {
deleted_files.push(file_to_delete.clone()); deleted_files.push(file_to_delete.clone());
} }
DeleteError::IOError { .. } => { DeleteError::IOError(_) => {
failed_to_delete_files.push(file_to_delete.clone()); failed_to_delete_files.push(file_to_delete.clone());
if !cfg!(target_os = "windows") { if !cfg!(target_os = "windows") {
// On windows, delete is expected to fail if the file // On windows, delete is expected to fail if the file
// is mmapped. // is mmapped.
error!(self.logger(), "delete-file-fail"; "path"=>file_to_delete.to_str().unwrap_or("<invalid-utf8>")); error!("Failed to delete {:?}", file_to_delete);
} }
} }
} }
@@ -200,10 +196,6 @@ impl ManagedDirectory {
save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?; save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?;
} }
info!(self.directory.logger(), "gc"; "stage"=>"end",
"num-success-file-deletes"=>deleted_files.len(),
"num-failed-file-deletes"=>failed_to_delete_files.len());
Ok(GarbageCollectionResult { Ok(GarbageCollectionResult {
deleted_files, deleted_files,
failed_to_delete_files, failed_to_delete_files,
@@ -240,11 +232,8 @@ impl ManagedDirectory {
/// Verify checksum of a managed file /// Verify checksum of a managed file
pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> { pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> {
let reader = self.directory.open_read(path)?; let reader = self.directory.open_read(path)?;
let (footer, data) = let (footer, data) = Footer::extract_footer(reader)
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IOError { .map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
io_error,
filepath: path.to_path_buf(),
})?;
let mut hasher = Hasher::new(); let mut hasher = Hasher::new();
hasher.update(data.as_slice()); hasher.update(data.as_slice());
let crc = hasher.finalize(); let crc = hasher.finalize();
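
A tiny sketch of the checksum comparison performed here (assumes the `crc32fast` crate, as in the diff); footer extraction is elided and the stored CRC is passed in directly:

```rust
// Recompute the CRC32 of the file body and compare it with the value
// stored in the footer.
use crc32fast::Hasher;

fn checksum_matches(data: &[u8], stored_crc: u32) -> bool {
    let mut hasher = Hasher::new();
    hasher.update(data);
    hasher.finalize() == stored_crc
}

fn main() {
    let data = b"some file body";
    let crc = { let mut h = Hasher::new(); h.update(data); h.finalize() };
    assert!(checksum_matches(data, crc));
    assert!(!checksum_matches(data, crc.wrapping_add(1)));
}
```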
@@ -257,46 +246,35 @@ impl ManagedDirectory {
/// List files for which checksum does not match content /// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> { pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut managed_paths = self let mut hashset = HashSet::new();
let managed_paths = self
.meta_informations .meta_informations
.read() .read()
.expect("Managed directory rlock poisoned in list damaged.") .expect("Managed directory rlock poisoned in list damaged.")
.managed_paths .managed_paths
.clone(); .clone();
managed_paths.remove(*META_FILEPATH); for path in managed_paths.into_iter() {
let mut damaged_files = HashSet::new();
for path in managed_paths {
if !self.validate_checksum(&path)? { if !self.validate_checksum(&path)? {
damaged_files.insert(path); hashset.insert(path);
} }
} }
Ok(damaged_files) Ok(hashset)
} }
} }
impl Directory for ManagedDirectory { impl Directory for ManagedDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> { fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
slog::debug!(self.logger(), "open-read"; "path" => path.to_str().unwrap_or("<invalid-utf8>"));
let read_only_source = self.directory.open_read(path)?; let read_only_source = self.directory.open_read(path)?;
let (footer, reader) = Footer::extract_footer(read_only_source).map_err(|io_error| { let (footer, reader) = Footer::extract_footer(read_only_source)
OpenReadError::IOError { .map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
io_error,
filepath: path.to_path_buf(),
}
})?;
footer.is_compatible()?; footer.is_compatible()?;
Ok(reader) Ok(reader)
} }
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> { fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
slog::debug!(self.logger(), "open-write"; "path" => path.to_str().unwrap_or("<invalid-utf8>"));
self.register_file_as_managed(path) self.register_file_as_managed(path)
.map_err(|io_error| OpenWriteError::IOError { .map_err(|e| IOError::with_path(path.to_owned(), e))?;
io_error,
filepath: path.to_path_buf(),
})?;
Ok(io::BufWriter::new(Box::new(FooterProxy::new( Ok(io::BufWriter::new(Box::new(FooterProxy::new(
self.directory self.directory
.open_write(path)? .open_write(path)?
@@ -306,11 +284,9 @@ impl Directory for ManagedDirectory {
)))) ))))
} }
fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> { fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
let content_str = std::str::from_utf8(content).unwrap_or("<content-not-utf-8>");
slog::debug!(self.logger(), "Atomic write"; "path" => format!("{:?}", path), "content_length"=>content_str);
self.register_file_as_managed(path)?; self.register_file_as_managed(path)?;
self.directory.atomic_write(path, content) self.directory.atomic_write(path, data)
} }
fn atomic_read(&self, path: &Path) -> result::Result<Vec<u8>, OpenReadError> { fn atomic_read(&self, path: &Path) -> result::Result<Vec<u8>, OpenReadError> {
@@ -332,10 +308,6 @@ impl Directory for ManagedDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
self.directory.watch(watch_callback) self.directory.watch(watch_callback)
} }
fn logger(&self) -> &slog::Logger {
self.directory.logger()
}
} }
impl Clone for ManagedDirectory { impl Clone for ManagedDirectory {


@@ -1,6 +1,15 @@
use fs2;
use notify;
use self::fs2::FileExt;
use self::notify::RawEvent;
use self::notify::RecursiveMode;
use self::notify::Watcher;
use crate::core::META_FILEPATH; use crate::core::META_FILEPATH;
use crate::directory::error::LockError; use crate::directory::error::LockError;
use crate::directory::error::{DeleteError, OpenDirectoryError, OpenReadError, OpenWriteError}; use crate::directory::error::{
DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError,
};
use crate::directory::read_only_source::BoxedData; use crate::directory::read_only_source::BoxedData;
use crate::directory::AntiCallToken; use crate::directory::AntiCallToken;
use crate::directory::Directory; use crate::directory::Directory;
@@ -11,14 +20,8 @@ use crate::directory::WatchCallback;
use crate::directory::WatchCallbackList; use crate::directory::WatchCallbackList;
use crate::directory::WatchHandle; use crate::directory::WatchHandle;
use crate::directory::{TerminatingWrite, WritePtr}; use crate::directory::{TerminatingWrite, WritePtr};
use fs2::FileExt; use atomicwrites;
use memmap::Mmap; use memmap::Mmap;
use notify::RawEvent;
use notify::RecursiveMode;
use notify::Watcher;
use serde::{Deserialize, Serialize};
use slog::{debug, o, Drain, Logger};
use slog_stdlog::StdLog;
use std::collections::HashMap; use std::collections::HashMap;
use std::convert::From; use std::convert::From;
use std::fmt; use std::fmt;
@@ -36,6 +39,11 @@ use std::sync::Weak;
use std::thread; use std::thread;
use tempfile::TempDir; use tempfile::TempDir;
/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Returns None iff the file exists, can be read, but is empty (and hence /// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped) /// cannot be mmapped)
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> { fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
@@ -43,17 +51,13 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
if e.kind() == io::ErrorKind::NotFound { if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned()) OpenReadError::FileDoesNotExist(full_path.to_owned())
} else { } else {
OpenReadError::IOError { OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
io_error: e,
filepath: full_path.to_owned(),
}
} }
})?; })?;
let meta_data = file.metadata().map_err(|e| OpenReadError::IOError { let meta_data = file
io_error: e, .metadata()
filepath: full_path.to_owned(), .map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
})?;
if meta_data.len() == 0 { if meta_data.len() == 0 {
// if the file size is 0, it will not be possible // if the file size is 0, it will not be possible
// to mmap the file, so we return None // to mmap the file, so we return None
@@ -63,10 +67,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
unsafe { unsafe {
memmap::Mmap::map(&file) memmap::Mmap::map(&file)
.map(Some) .map(Some)
.map_err(|e| OpenReadError::IOError { .map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
io_error: e,
filepath: full_path.to_owned(),
})
} }
} }
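
A condensed sketch of the guard above, keeping only the empty-file special case (assumes the `memmap` crate used in the diff):

```rust
// A zero-length file cannot be mmapped, so it is surfaced as Ok(None).
use memmap::Mmap;
use std::fs::File;
use std::io;
use std::path::Path;

fn open_mmap(full_path: &Path) -> io::Result<Option<Mmap>> {
    let file = File::open(full_path)?;
    if file.metadata()?.len() == 0 {
        return Ok(None);
    }
    // Safety: callers must guarantee the file is not truncated or mutated
    // while the mapping is alive.
    unsafe { Mmap::map(&file).map(Some) }
}
```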
@@ -141,15 +142,15 @@ impl MmapCache {
} }
struct WatcherWrapper { struct WatcherWrapper {
_watcher: Mutex<notify::RecommendedWatcher>, _watcher: Mutex<notify::PollWatcher>,
watcher_router: Arc<WatchCallbackList>, watcher_router: Arc<WatchCallbackList>,
} }
impl WatcherWrapper { impl WatcherWrapper {
pub(crate) fn new(path: &Path, logger: Logger) -> Result<Self, OpenDirectoryError> { pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel(); let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
// We need to initialize the // We need to initialize the
let watcher = notify::raw_watcher(tx) let watcher = notify::poll::PollWatcher::with_delay_ms(tx, 1)
.and_then(|mut watcher| { .and_then(|mut watcher| {
watcher.watch(path, RecursiveMode::Recursive)?; watcher.watch(path, RecursiveMode::Recursive)?;
Ok(watcher) Ok(watcher)
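
This is the heart of the "Changed watcher to use polling" commit: `notify::raw_watcher` (OS file-system events) is swapped for a `PollWatcher` that rescans every millisecond. A minimal wiring sketch against the notify 4.x API used in this hunk:

```rust
// Polling watcher wiring (notify 4.x): events arrive on an mpsc channel.
use notify::{PollWatcher, RawEvent, RecursiveMode, Watcher};
use std::path::Path;
use std::sync::mpsc::{channel, Receiver};

fn watch_with_polling(path: &Path) -> notify::Result<(PollWatcher, Receiver<RawEvent>)> {
    let (tx, rx) = channel();
    // Scan for changes every 1 ms instead of relying on OS notifications.
    let mut watcher = PollWatcher::with_delay_ms(tx, 1)?;
    watcher.watch(path, RecursiveMode::Recursive)?;
    // The caller hands `rx` to a thread that routes events to callbacks.
    Ok((watcher, rx))
}
```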
@@ -160,8 +161,7 @@ impl WatcherWrapper {
panic!("Unknown error while starting watching directory {:?}", path); panic!("Unknown error while starting watching directory {:?}", path);
} }
})?; })?;
let watcher_router: Arc<WatchCallbackList> = let watcher_router: Arc<WatchCallbackList> = Default::default();
Arc::new(WatchCallbackList::with_logger(logger));
let watcher_router_clone = watcher_router.clone(); let watcher_router_clone = watcher_router.clone();
thread::Builder::new() thread::Builder::new()
.name("meta-file-watch-thread".to_string()) .name("meta-file-watch-thread".to_string())
@@ -186,10 +186,6 @@ impl WatcherWrapper {
} }
} }
} }
})
.map_err(|io_error| OpenDirectoryError::IoError {
io_error,
directory_path: path.to_path_buf(),
})?; })?;
Ok(WatcherWrapper { Ok(WatcherWrapper {
_watcher: Mutex::new(watcher), _watcher: Mutex::new(watcher),
@@ -224,22 +220,20 @@ struct MmapDirectoryInner {
mmap_cache: RwLock<MmapCache>, mmap_cache: RwLock<MmapCache>,
_temp_directory: Option<TempDir>, _temp_directory: Option<TempDir>,
watcher: RwLock<Option<WatcherWrapper>>, watcher: RwLock<Option<WatcherWrapper>>,
logger: Logger,
} }
impl MmapDirectoryInner { impl MmapDirectoryInner {
fn new( fn new(
root_path: PathBuf, root_path: PathBuf,
temp_directory: Option<TempDir>, temp_directory: Option<TempDir>,
logger: Logger, ) -> Result<MmapDirectoryInner, OpenDirectoryError> {
) -> MmapDirectoryInner { let mmap_directory_inner = MmapDirectoryInner {
MmapDirectoryInner {
root_path, root_path,
mmap_cache: Default::default(), mmap_cache: Default::default(),
_temp_directory: temp_directory, _temp_directory: temp_directory,
watcher: RwLock::new(None), watcher: RwLock::new(None),
logger, };
} Ok(mmap_directory_inner)
} }
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
@@ -250,7 +244,7 @@ impl MmapDirectoryInner {
// The downside is that we might create a watch wrapper that is not useful. // The downside is that we might create a watch wrapper that is not useful.
let need_initialization = self.watcher.read().unwrap().is_none(); let need_initialization = self.watcher.read().unwrap().is_none();
if need_initialization { if need_initialization {
let watch_wrapper = WatcherWrapper::new(&self.root_path, self.logger.clone())?; let watch_wrapper = WatcherWrapper::new(&self.root_path)?;
let mut watch_wlock = self.watcher.write().unwrap(); let mut watch_wlock = self.watcher.write().unwrap();
// the watcher could have been initialized when we released the lock, and // the watcher could have been initialized when we released the lock, and
// we do not want to lose the watched files that were set. // we do not want to lose the watched files that were set.
@@ -273,11 +267,14 @@ impl fmt::Debug for MmapDirectory {
} }
impl MmapDirectory { impl MmapDirectory {
fn new(root_path: PathBuf, temp_directory: Option<TempDir>, logger: Logger) -> MmapDirectory { fn new(
let inner = MmapDirectoryInner::new(root_path, temp_directory, logger); root_path: PathBuf,
MmapDirectory { temp_directory: Option<TempDir>,
) -> Result<MmapDirectory, OpenDirectoryError> {
let inner = MmapDirectoryInner::new(root_path, temp_directory)?;
Ok(MmapDirectory {
inner: Arc::new(inner), inner: Arc::new(inner),
} })
} }
/// Creates a new MmapDirectory in a temporary directory. /// Creates a new MmapDirectory in a temporary directory.
@@ -285,19 +282,16 @@ impl MmapDirectory {
/// This is mostly useful to test the MmapDirectory itself. /// This is mostly useful to test the MmapDirectory itself.
/// For your unit tests, prefer the RAMDirectory. /// For your unit tests, prefer the RAMDirectory.
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> { pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
let tempdir = TempDir::new().map_err(OpenDirectoryError::FailedToCreateTempDir)?; let tempdir = TempDir::new().map_err(OpenDirectoryError::IoError)?;
let logger = Logger::root(StdLog.fuse(), o!()); let tempdir_path = PathBuf::from(tempdir.path());
Ok(MmapDirectory::new(tempdir.path().to_owned(), Some(tempdir), logger)) MmapDirectory::new(tempdir_path, Some(tempdir))
} }
/// Opens a MmapDirectory in a directory. /// Opens a MmapDirectory in a directory.
/// ///
/// Returns an error if the `directory_path` does not /// Returns an error if the `directory_path` does not
/// exist or if it is not a directory. /// exist or if it is not a directory.
pub fn open_with_logger<P: AsRef<Path>>( pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
directory_path: P,
logger: Logger,
) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref(); let directory_path: &Path = directory_path.as_ref();
if !directory_path.exists() { if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from( Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
@@ -308,20 +302,10 @@ impl MmapDirectory {
directory_path, directory_path,
))) )))
} else { } else {
Ok(MmapDirectory::new( Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
PathBuf::from(directory_path),
None,
logger,
))
} }
} }
/// Creates an `MmapDirectory` at the given path.
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let logger = Logger::root(StdLog.fuse(), o!());
Self::open_with_logger(directory_path, logger)
}
/// Joins a relative_path to the directory `root_path` /// Joins a relative_path to the directory `root_path`
/// to create a proper complete `filepath`. /// to create a proper complete `filepath`.
fn resolve_path(&self, relative_path: &Path) -> PathBuf { fn resolve_path(&self, relative_path: &Path) -> PathBuf {
@@ -381,12 +365,11 @@ impl MmapDirectory {
struct ReleaseLockFile { struct ReleaseLockFile {
_file: File, _file: File,
path: PathBuf, path: PathBuf,
logger: Logger,
} }
impl Drop for ReleaseLockFile { impl Drop for ReleaseLockFile {
fn drop(&mut self) { fn drop(&mut self) {
debug!(self.logger, "Releasing lock {:?}", self.path); debug!("Releasing lock {:?}", self.path);
} }
} }
@@ -425,18 +408,16 @@ impl TerminatingWrite for SafeFileWriter {
impl Directory for MmapDirectory { impl Directory for MmapDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> { fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| { let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
let msg = format!( let msg = format!(
"Failed to acquire write lock \ "Failed to acquire write lock \
on mmap cache while reading {:?}", on mmap cache while reading {:?}",
path path
); );
let io_error = io::Error::new(io::ErrorKind::Other, msg); IOError::with_path(path.to_owned(), make_io_err(msg))
OpenReadError::IOError {
io_error,
filepath: path.to_owned(),
}
})?; })?;
Ok(mmap_cache Ok(mmap_cache
.get_mmap(&full_path)? .get_mmap(&full_path)?
@@ -449,18 +430,14 @@ impl Directory for MmapDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
match fs::remove_file(&full_path) { match fs::remove_file(&full_path) {
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError { Ok(_) => self
io_error: e, .sync_directory()
filepath: path.to_path_buf(), .map_err(|e| IOError::with_path(path.to_owned(), e).into()),
}),
Err(e) => { Err(e) => {
if e.kind() == io::ErrorKind::NotFound { if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned())) Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else { } else {
Err(DeleteError::IOError { Err(IOError::with_path(path.to_owned(), e).into())
io_error: e,
filepath: path.to_path_buf(),
})
} }
} }
} }
@@ -472,7 +449,9 @@ impl Directory for MmapDirectory {
} }
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> { fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
debug!("Open Write {:?}", path);
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
let open_res = OpenOptions::new() let open_res = OpenOptions::new()
.write(true) .write(true)
.create_new(true) .create_new(true)
@@ -482,25 +461,18 @@ impl Directory for MmapDirectory {
if err.kind() == io::ErrorKind::AlreadyExists { if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_owned()) OpenWriteError::FileAlreadyExists(path.to_owned())
} else { } else {
OpenWriteError::IOError { IOError::with_path(path.to_owned(), err).into()
io_error: err,
filepath: path.to_owned(),
}
} }
})?; })?;
// making sure the file is created. // making sure the file is created.
file.flush().map_err(|io_error| OpenWriteError::IOError { file.flush()
io_error, .map_err(|e| IOError::with_path(path.to_owned(), e))?;
filepath: path.to_owned(),
})?;
// Apparently, on some filesystems syncing the parent // Apparently, on some filesystems syncing the parent
// directory is required. // directory is required.
self.sync_directory().map_err(|e| OpenWriteError::IOError { self.sync_directory()
io_error: e, .map_err(|e| IOError::with_path(path.to_owned(), e))?;
filepath: path.to_owned(),
})?;
let writer = SafeFileWriter::new(file); let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer))) Ok(BufWriter::new(Box::new(writer)))
@@ -512,31 +484,24 @@ impl Directory for MmapDirectory {
match File::open(&full_path) { match File::open(&full_path) {
Ok(mut file) => { Ok(mut file) => {
file.read_to_end(&mut buffer) file.read_to_end(&mut buffer)
.map_err(|io_error| OpenReadError::IOError { .map_err(|e| IOError::with_path(path.to_owned(), e))?;
io_error,
filepath: path.to_owned(),
})?;
Ok(buffer) Ok(buffer)
} }
Err(io_error) => { Err(e) => {
if io_error.kind() == io::ErrorKind::NotFound { if e.kind() == io::ErrorKind::NotFound {
Err(OpenReadError::FileDoesNotExist(path.to_owned())) Err(OpenReadError::FileDoesNotExist(path.to_owned()))
} else { } else {
Err(OpenReadError::IOError { Err(IOError::with_path(path.to_owned(), e).into())
io_error,
filepath: path.to_owned(),
})
} }
} }
} }
} }
fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> { fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
let mut tempfile = tempfile::Builder::new().tempfile_in(&self.inner.root_path)?; debug!("Atomic Write {:?}", path);
tempfile.write_all(content)?;
tempfile.flush()?;
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
tempfile.into_temp_path().persist(full_path)?; let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
meta_file.write(|f| f.write_all(data))?;
Ok(()) Ok(())
} }
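
One side of this hunk uses the `atomicwrites` crate; the other stages content in a temp file and renames it into place. A compact sketch of the tempfile-then-rename scheme (assumes the `tempfile` crate; the rename is atomic only within a single filesystem):

```rust
// Atomic write: stage the content in a temp file in the same directory,
// flush it, then rename it over the destination.
use std::io::{self, Write};
use std::path::Path;

fn atomic_write(dir: &Path, file_name: &str, content: &[u8]) -> io::Result<()> {
    let mut tmp = tempfile::Builder::new().tempfile_in(dir)?;
    tmp.write_all(content)?;
    tmp.flush()?;
    // persist() renames the temp file onto the target path; since both live
    // in `dir`, the rename stays on one filesystem and is atomic.
    tmp.into_temp_path().persist(dir.join(file_name))?;
    Ok(())
}
```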
@@ -553,22 +518,16 @@ impl Directory for MmapDirectory {
} else { } else {
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
} }
let logger = self.inner.logger.clone();
// dropping the file handle will release the lock. // dropping the file handle will release the lock.
Ok(DirectoryLock::from(Box::new(ReleaseLockFile { Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
path: lock.filepath.clone(), path: lock.filepath.clone(),
_file: file, _file: file,
logger,
}))) })))
} }
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> { fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
self.inner.watch(watch_callback) self.inner.watch(watch_callback)
} }
fn logger(&self) -> &Logger {
&self.inner.logger
}
} }
#[cfg(test)] #[cfg(test)]
@@ -678,8 +637,7 @@ mod tests {
let counter_clone = counter.clone(); let counter_clone = counter.clone();
let tmp_dir = tempfile::TempDir::new().unwrap(); let tmp_dir = tempfile::TempDir::new().unwrap();
let tmp_dirpath = tmp_dir.path().to_owned(); let tmp_dirpath = tmp_dir.path().to_owned();
let logger = Logger::root(slog::Discard, o!()); let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath, logger).unwrap();
let tmp_file = tmp_dirpath.join(*META_FILEPATH); let tmp_file = tmp_dirpath.join(*META_FILEPATH);
let _handle = watch_wrapper.watch(Box::new(move || { let _handle = watch_wrapper.watch(Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst); counter_clone.fetch_add(1, Ordering::SeqCst);
@@ -704,7 +662,7 @@ mod tests {
{ {
let index = Index::create(mmap_directory.clone(), schema).unwrap(); let index = Index::create(mmap_directory.clone(), schema).unwrap();
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut log_merge_policy = LogMergePolicy::default(); let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_merge_size(3); log_merge_policy.set_min_merge_size(3);
index_writer.set_merge_policy(Box::new(log_merge_policy)); index_writer.set_merge_policy(Box::new(log_merge_policy));


@@ -23,8 +23,7 @@ pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK}; pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
pub use self::ram_directory::RAMDirectory; pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource; pub use self::read_only_source::ReadOnlySource;
pub(crate) use self::watch_event_router::WatchCallbackList; pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
pub use self::watch_event_router::{WatchCallback, WatchHandle};
use std::io::{self, BufWriter, Write}; use std::io::{self, BufWriter, Write};
use std::path::PathBuf; use std::path::PathBuf;
/// Outcome of the Garbage collection /// Outcome of the Garbage collection


@@ -5,8 +5,6 @@ use crate::directory::WatchCallbackList;
 use crate::directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
 use crate::directory::{TerminatingWrite, WritePtr};
 use fail::fail_point;
-use slog::{o, Drain, Logger};
-use slog_stdlog::StdLog;
 use std::collections::HashMap;
 use std::fmt;
 use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
@@ -68,7 +66,7 @@ impl Write for VecWriter {
     fn flush(&mut self) -> io::Result<()> {
         self.is_flushed = true;
-        let mut fs = self.shared_directory.fs.inner_directory.write().unwrap();
+        let mut fs = self.shared_directory.fs.write().unwrap();
         fs.write(self.path.clone(), self.data.get_ref());
         Ok(())
     }
@@ -80,19 +78,13 @@
     }
 }

+#[derive(Default)]
 struct InnerDirectory {
     fs: HashMap<PathBuf, ReadOnlySource>,
     watch_router: WatchCallbackList,
 }

 impl InnerDirectory {
-    fn with_logger(logger: Logger) -> Self {
-        InnerDirectory {
-            fs: Default::default(),
-            watch_router: WatchCallbackList::with_logger(logger.clone()),
-        }
-    }
-
     fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
         let data = ReadOnlySource::new(Vec::from(data));
         self.fs.insert(path, data).is_some()
@@ -125,32 +117,20 @@ impl InnerDirectory {
     }
 }

-impl Default for RAMDirectory {
-    fn default() -> RAMDirectory {
-        let logger = Logger::root(StdLog.fuse(), o!());
-        Self::with_logger(logger)
-    }
-}
-
 impl fmt::Debug for RAMDirectory {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "RAMDirectory")
     }
 }

-struct Inner {
-    inner_directory: RwLock<InnerDirectory>,
-    logger: Logger,
-}
-
 /// A Directory storing everything in anonymous memory.
 ///
 /// It is mainly meant for unit testing.
 /// Writes are only made visible upon flushing.
 ///
-#[derive(Clone)]
+#[derive(Clone, Default)]
 pub struct RAMDirectory {
-    fs: Arc<Inner>,
+    fs: Arc<RwLock<InnerDirectory>>,
 }

 impl RAMDirectory {
@@ -159,61 +139,33 @@
         Self::default()
     }

-    /// Create a `RAMDirectory` with a custom logger.
-    pub fn with_logger(logger: Logger) -> RAMDirectory {
-        let inner_directory = InnerDirectory::with_logger(logger.clone()).into();
-        RAMDirectory {
-            fs: Arc::new(Inner {
-                inner_directory,
-                logger,
-            }),
-        }
-    }
-
     /// Returns the sum of the size of the different files
     /// in the RAMDirectory.
     pub fn total_mem_usage(&self) -> usize {
-        self.fs.inner_directory.read().unwrap().total_mem_usage()
+        self.fs.read().unwrap().total_mem_usage()
     }
-
-    /// Write a copy of all of the files saved in the RAMDirectory into the target `Directory`.
-    ///
-    /// Files are all written using the `Directory::write` meaning, even if they were
-    /// written using the `atomic_write` api.
-    ///
-    /// If an error is encountered, files may be persisted partially.
-    pub fn persist(&self, dest: &mut dyn Directory) -> crate::Result<()> {
-        let wlock = self.fs.inner_directory.write().unwrap();
-        for (path, source) in wlock.fs.iter() {
-            let mut dest_wrt = dest.open_write(path)?;
-            dest_wrt.write_all(source.as_slice())?;
-            dest_wrt.terminate()?;
-        }
-        Ok(())
-    }
 }

 impl Directory for RAMDirectory {
     fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
-        self.fs.inner_directory.read().unwrap().open_read(path)
+        self.fs.read().unwrap().open_read(path)
     }

     fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
         fail_point!("RAMDirectory::delete", |_| {
-            Err(DeleteError::IOError {
-                io_error: io::Error::from(io::ErrorKind::Other),
-                filepath: path.to_path_buf(),
-            })
+            use crate::directory::error::IOError;
+            let io_error = IOError::from(io::Error::from(io::ErrorKind::Other));
+            Err(DeleteError::from(io_error))
         });
-        self.fs.inner_directory.write().unwrap().delete(path)
+        self.fs.write().unwrap().delete(path)
     }

     fn exists(&self, path: &Path) -> bool {
-        self.fs.inner_directory.read().unwrap().exists(path)
+        self.fs.read().unwrap().exists(path)
     }

     fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
-        let mut fs = self.fs.inner_directory.write().unwrap();
+        let mut fs = self.fs.write().unwrap();
         let path_buf = PathBuf::from(path);
         let vec_writer = VecWriter::new(path_buf.clone(), self.clone());
         let exists = fs.write(path_buf.clone(), &[]);
@@ -237,62 +189,18 @@ impl Directory for RAMDirectory {
         let path_buf = PathBuf::from(path);

         // Reserve the path to prevent calls to .write() to succeed.
-        self.fs
-            .inner_directory
-            .write()
-            .unwrap()
-            .write(path_buf.clone(), &[]);
+        self.fs.write().unwrap().write(path_buf.clone(), &[]);

         let mut vec_writer = VecWriter::new(path_buf, self.clone());
         vec_writer.write_all(data)?;
         vec_writer.flush()?;
         if path == Path::new(&*META_FILEPATH) {
-            let _ = self
-                .fs
-                .inner_directory
-                .write()
-                .unwrap()
-                .watch_router
-                .broadcast();
+            let _ = self.fs.write().unwrap().watch_router.broadcast();
         }
         Ok(())
     }

     fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
-        Ok(self
-            .fs
-            .inner_directory
-            .write()
-            .unwrap()
-            .watch(watch_callback))
-    }
-
-    fn logger(&self) -> &Logger {
-        &self.fs.logger
+        Ok(self.fs.write().unwrap().watch(watch_callback))
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::RAMDirectory;
-    use crate::Directory;
-    use std::io::Write;
-    use std::path::Path;
-
-    #[test]
-    fn test_persist() {
-        let msg_atomic: &'static [u8] = b"atomic is the way";
-        let msg_seq: &'static [u8] = b"sequential is the way";
-        let path_atomic: &'static Path = Path::new("atomic");
-        let path_seq: &'static Path = Path::new("seq");
-        let mut directory = RAMDirectory::create();
-        assert!(directory.atomic_write(path_atomic, msg_atomic).is_ok());
-        let mut wrt = directory.open_write(path_seq).unwrap();
-        assert!(wrt.write_all(msg_seq).is_ok());
-        assert!(wrt.flush().is_ok());
-        let mut directory_copy = RAMDirectory::create();
-        assert!(directory.persist(&mut directory_copy).is_ok());
-        assert_eq!(directory_copy.atomic_read(path_atomic).unwrap(), msg_atomic);
-        assert_eq!(directory_copy.atomic_read(path_seq).unwrap(), msg_seq);
-    }
-}
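The right-hand side drops the `Inner` wrapper in favor of a plain `Arc<RwLock<InnerDirectory>>`, but in both layouts the file map sits behind an `Arc`: cloning a `RAMDirectory` clones a handle to shared state, not the data. A quick sketch of what that buys, assuming the 0.12-era `Directory` API (`atomic_write`/`atomic_read`) visible in these hunks:

```rust
use std::path::Path;

use tantivy::directory::{Directory, RAMDirectory};

fn main() -> tantivy::Result<()> {
    let mut dir = RAMDirectory::create();
    let dir_clone = dir.clone(); // shares the same Arc'ed file map

    dir.atomic_write(Path::new("meta.json"), b"{}")?;

    // The clone observes the write immediately: it is a handle, not a copy.
    assert_eq!(dir_clone.atomic_read(Path::new("meta.json"))?, b"{}".to_vec());
    Ok(())
}
```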
@@ -211,19 +211,19 @@ fn test_watch(directory: &mut dyn Directory) {
         .unwrap();
     for i in 0..10 {
-        assert!(i <= counter.load(SeqCst));
+        assert_eq!(i, counter.load(SeqCst));
         assert!(directory
             .atomic_write(Path::new("meta.json"), b"random_test_data_2")
             .is_ok());
         assert_eq!(receiver.recv_timeout(Duration::from_millis(500)), Ok(i));
-        assert!(i + 1 <= counter.load(SeqCst)); // notify can trigger more than once.
+        assert_eq!(i + 1, counter.load(SeqCst));
     }
     mem::drop(watch_handle);
     assert!(directory
         .atomic_write(Path::new("meta.json"), b"random_test_data")
         .is_ok());
     assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok());
-    assert!(10 <= counter.load(SeqCst));
+    assert_eq!(10, counter.load(SeqCst));
 }

 fn test_lock_non_blocking(directory: &mut dyn Directory) {
@@ -1,20 +1,19 @@
 use futures::channel::oneshot;
 use futures::{Future, TryFutureExt};
-use slog::{error, Logger};
 use std::sync::Arc;
 use std::sync::RwLock;
 use std::sync::Weak;

 /// Type alias for callbacks registered when watching files of a `Directory`.
-pub type WatchCallback = Box<dyn Fn() + Sync + Send>;
+pub type WatchCallback = Box<dyn Fn() -> () + Sync + Send>;

 /// Helper struct to implement the watch method in `Directory` implementations.
 ///
 /// It registers callbacks (See `.subscribe(...)`) and
 /// calls them upon calls to `.broadcast(...)`.
-pub(crate) struct WatchCallbackList {
+#[derive(Default)]
+pub struct WatchCallbackList {
     router: RwLock<Vec<Weak<WatchCallback>>>,
-    logger: Logger,
 }

 /// Controls how long a directory should watch for a file change.
@@ -33,14 +32,7 @@ impl WatchHandle {
 }

 impl WatchCallbackList {
-    pub fn with_logger(logger: Logger) -> Self {
-        WatchCallbackList {
-            logger,
-            router: Default::default(),
-        }
-    }
-
     /// Subscribes a new callback and returns a handle that controls the lifetime of the callback.
     pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
         let watch_callback_arc = Arc::new(watch_callback);
         let watch_callback_weak = Arc::downgrade(&watch_callback_arc);
@@ -82,8 +74,8 @@
         });
         if let Err(err) = spawn_res {
             error!(
-                self.logger,
-                "Failed to spawn thread to call watch callbacks. Cause: {:?}", err
+                "Failed to spawn thread to call watch callbacks. Cause: {:?}",
+                err
             );
         }
         result
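The contract is compact on both sides: `subscribe` stores only a `Weak` pointer, so a callback lives exactly as long as the `WatchHandle` returned to the caller, and `broadcast` walks the list and fires whatever is still alive. A minimal sketch of that lifecycle, assuming `broadcast` returns a future (the tests below drive it with `block_on`):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

use futures::executor::block_on;
use tantivy::directory::WatchCallbackList;

fn main() {
    let watch_list = WatchCallbackList::default();
    let counter = Arc::new(AtomicUsize::new(0));
    let counter_clone = counter.clone();

    let handle = watch_list.subscribe(Box::new(move || {
        counter_clone.fetch_add(1, Ordering::SeqCst);
    }));
    block_on(watch_list.broadcast());
    assert_eq!(counter.load(Ordering::SeqCst), 1);

    // Dropping the handle kills the Weak pointer, so the callback stops firing.
    drop(handle);
    block_on(watch_list.broadcast());
    assert_eq!(counter.load(Ordering::SeqCst), 1);
}
```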
@@ -94,18 +86,13 @@
 mod tests {
     use crate::directory::WatchCallbackList;
     use futures::executor::block_on;
-    use slog::{o, Discard, Logger};
     use std::mem;
     use std::sync::atomic::{AtomicUsize, Ordering};
     use std::sync::Arc;

-    fn default_watch_callback_list() -> WatchCallbackList {
-        WatchCallbackList::with_logger(Logger::root(Discard, o!()))
-    }
-
     #[test]
     fn test_watch_event_router_simple() {
-        let watch_event_router = default_watch_callback_list();
+        let watch_event_router = WatchCallbackList::default();
         let counter: Arc<AtomicUsize> = Default::default();
         let counter_clone = counter.clone();
         let inc_callback = Box::new(move || {
@@ -132,7 +119,7 @@
     #[test]
     fn test_watch_event_router_multiple_callback_same_key() {
-        let watch_event_router = default_watch_callback_list();
+        let watch_event_router = WatchCallbackList::default();
         let counter: Arc<AtomicUsize> = Default::default();
         let inc_callback = |inc: usize| {
             let counter_clone = counter.clone();
@@ -161,7 +148,7 @@
     #[test]
     fn test_watch_event_router_multiple_callback_different_key() {
-        let watch_event_router = default_watch_callback_list();
+        let watch_event_router = WatchCallbackList::default();
         let counter: Arc<AtomicUsize> = Default::default();
         let counter_clone = counter.clone();
         let inc_callback = Box::new(move || {
@@ -1,48 +1,58 @@
+use crate::common::BitSet;
 use crate::fastfield::DeleteBitSet;
 use crate::DocId;
 use std::borrow::Borrow;
 use std::borrow::BorrowMut;
+use std::cmp::Ordering;

-/// Sentinel value returned when a DocSet has been entirely consumed.
-///
-/// This is not u32::MAX as one would have expected, due to the lack of SSE2 instructions
-/// to compare [u32; 4].
-pub const TERMINATED: DocId = std::i32::MAX as u32;
+/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
+#[derive(PartialEq, Eq, Debug)]
+pub enum SkipResult {
+    /// target was in the docset
+    Reached,
+    /// target was not in the docset, skipping stopped as a greater element was found
+    OverStep,
+    /// the docset was entirely consumed without finding the target, nor any
+    /// element greater than the target.
+    End,
+}

 /// Represents an iterable set of sorted doc ids.
 pub trait DocSet {
     /// Goes to the next element.
-    ///
-    /// The DocId of the next element is returned.
-    /// In other words we should always have :
-    /// ```ignore
-    /// let doc = docset.advance();
-    /// assert_eq!(doc, docset.doc());
-    /// ```
-    ///
-    /// If we reached the end of the DocSet, TERMINATED should be returned.
-    ///
-    /// Calling `.advance()` on a terminated DocSet should be supported, and TERMINATED should
-    /// be returned.
-    fn advance(&mut self) -> DocId;
+    /// `.advance(...)` needs to be called a first time to point to the correct
+    /// element.
+    fn advance(&mut self) -> bool;

-    /// Advances the DocSet forward until reaching the target, or going to the
-    /// lowest DocId greater than the target.
+    /// After skipping, position the iterator in such a way that `.doc()`
+    /// will return a value greater than or equal to target.
     ///
-    /// If the end of the DocSet is reached, TERMINATED is returned.
+    /// SkipResult expresses whether the `target value` was reached, overstepped,
+    /// or if the `DocSet` was entirely consumed without finding any value
+    /// greater or equal to the `target`.
     ///
-    /// Calling `.seek(target)` on a terminated DocSet is legal. Implementation
-    /// of DocSet should support it.
+    /// WARNING: Calling skip always advances the docset.
+    /// More specifically, if the docset is already positioned on the target,
+    /// skipping will advance to the next position and return SkipResult::OverStep.
     ///
-    /// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a DocSet.
-    fn seek(&mut self, target: DocId) -> DocId {
-        let mut doc = self.doc();
-        debug_assert!(doc <= target);
-        while doc < target {
-            doc = self.advance();
-        }
-        doc
-    }
+    /// If `.skip_next()` oversteps, then the docset must be positioned correctly
+    /// on an existing document. In other words, `.doc()` should return the first document
+    /// greater than `DocId`.
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        if !self.advance() {
+            return SkipResult::End;
+        }
+        loop {
+            match self.doc().cmp(&target) {
+                Ordering::Less => {
+                    if !self.advance() {
+                        return SkipResult::End;
+                    }
+                }
+                Ordering::Equal => return SkipResult::Reached,
+                Ordering::Greater => return SkipResult::OverStep,
+            }
+        }
+    }

     /// Fills a given mutable buffer with the next doc ids from the
@@ -61,38 +71,38 @@
     /// use case where batching. The normal way to
     /// go through the `DocId`'s is to call `.advance()`.
     fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
-        if self.doc() == TERMINATED {
-            return 0;
-        }
         for (i, buffer_val) in buffer.iter_mut().enumerate() {
-            *buffer_val = self.doc();
-            if self.advance() == TERMINATED {
-                return i + 1;
+            if self.advance() {
+                *buffer_val = self.doc();
+            } else {
+                return i;
             }
         }
         buffer.len()
     }

     /// Returns the current document
-    /// Right after creating a new DocSet, the docset points to the first document.
-    ///
-    /// If the DocSet is empty, .doc() should return `TERMINATED`.
     fn doc(&self) -> DocId;

     /// Returns a best-effort hint of the
     /// length of the docset.
     fn size_hint(&self) -> u32;

+    /// Appends all docs to a `bitset`.
+    fn append_to_bitset(&mut self, bitset: &mut BitSet) {
+        while self.advance() {
+            bitset.insert(self.doc());
+        }
+    }
+
     /// Returns the number of documents matching.
     /// Calling this method consumes the `DocSet`.
     fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
         let mut count = 0u32;
-        let mut doc = self.doc();
-        while doc != TERMINATED {
-            if !delete_bitset.is_deleted(doc) {
+        while self.advance() {
+            if !delete_bitset.is_deleted(self.doc()) {
                 count += 1u32;
             }
-            doc = self.advance();
         }
         count
     }
@@ -104,42 +114,22 @@
     /// given by `count()`.
     fn count_including_deleted(&mut self) -> u32 {
         let mut count = 0u32;
-        let mut doc = self.doc();
-        while doc != TERMINATED {
+        while self.advance() {
             count += 1u32;
-            doc = self.advance();
         }
         count
     }
 }

-impl<'a> DocSet for &'a mut dyn DocSet {
-    fn advance(&mut self) -> u32 {
-        (**self).advance()
-    }
-
-    fn seek(&mut self, target: DocId) -> DocId {
-        (**self).seek(target)
-    }
-
-    fn doc(&self) -> u32 {
-        (**self).doc()
-    }
-
-    fn size_hint(&self) -> u32 {
-        (**self).size_hint()
-    }
-}
-
 impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
-    fn advance(&mut self) -> DocId {
+    fn advance(&mut self) -> bool {
         let unboxed: &mut TDocSet = self.borrow_mut();
         unboxed.advance()
     }

-    fn seek(&mut self, target: DocId) -> DocId {
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
         let unboxed: &mut TDocSet = self.borrow_mut();
-        unboxed.seek(target)
+        unboxed.skip_next(target)
     }

     fn doc(&self) -> DocId {
@@ -161,4 +151,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
         let unboxed: &mut TDocSet = self.borrow_mut();
         unboxed.count_including_deleted()
     }
+
+    fn append_to_bitset(&mut self, bitset: &mut BitSet) {
+        let unboxed: &mut TDocSet = self.borrow_mut();
+        unboxed.append_to_bitset(bitset);
+    }
 }
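To make the two contracts concrete, here is a toy docset over a sorted `Vec<DocId>` written against the left-hand (`TERMINATED`-based) API. Only the three required methods are needed; `seek`, `fill_buffer`, and the counting helpers all have default implementations. The import paths are an assumption:

```rust
use tantivy::{DocId, DocSet, TERMINATED}; // assumed re-exports

struct VecDocSet {
    docs: Vec<DocId>, // sorted doc ids
    cursor: usize,
}

impl DocSet for VecDocSet {
    fn advance(&mut self) -> DocId {
        if self.cursor < self.docs.len() {
            self.cursor += 1;
        }
        self.doc()
    }

    // A freshly built docset (cursor == 0) already points at its first doc,
    // and an empty one reports TERMINATED straight away.
    fn doc(&self) -> DocId {
        self.docs.get(self.cursor).cloned().unwrap_or(TERMINATED)
    }

    fn size_hint(&self) -> u32 {
        self.docs.len() as u32
    }
}

fn main() {
    let mut docset = VecDocSet { docs: vec![1, 5, 9], cursor: 0 };
    assert_eq!(docset.doc(), 1);
    assert_eq!(docset.seek(6), 9); // lands on the first doc >= 6
    assert_eq!(docset.advance(), TERMINATED);
}
```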
@@ -2,13 +2,12 @@
 use std::io;

+use crate::directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
 use crate::directory::error::{Incompatibility, LockError};
 use crate::fastfield::FastFieldNotAvailableError;
 use crate::query;
-use crate::{
-    directory::error::{OpenDirectoryError, OpenReadError, OpenWriteError},
-    schema,
-};
+use crate::schema;
+use serde_json;
 use std::fmt;
 use std::path::PathBuf;
 use std::sync::PoisonError;
@@ -45,47 +44,44 @@
     }
 }

-/// The library's error enum
-#[derive(Debug, Error)]
+/// The library's failure based error enum
+#[derive(Debug, Fail)]
 pub enum TantivyError {
-    /// Failed to open the directory.
-    #[error("Failed to open the directory: '{0:?}'")]
-    OpenDirectoryError(#[from] OpenDirectoryError),
-    /// Failed to open a file for read.
-    #[error("Failed to open file for read: '{0:?}'")]
-    OpenReadError(#[from] OpenReadError),
-    /// Failed to open a file for write.
-    #[error("Failed to open file for write: '{0:?}'")]
-    OpenWriteError(#[from] OpenWriteError),
+    /// Path does not exist.
+    #[fail(display = "Path does not exist: '{:?}'", _0)]
+    PathDoesNotExist(PathBuf),
+    /// File already exists, this is a problem when we try to write into a new file.
+    #[fail(display = "File already exists: '{:?}'", _0)]
+    FileAlreadyExists(PathBuf),
     /// Index already exists in this directory
-    #[error("Index already exists")]
+    #[fail(display = "Index already exists")]
     IndexAlreadyExists,
     /// Failed to acquire file lock
-    #[error("Failed to acquire Lockfile: {0:?}. {1:?}")]
+    #[fail(display = "Failed to acquire Lockfile: {:?}. {:?}", _0, _1)]
     LockFailure(LockError, Option<String>),
     /// IO Error.
-    #[error("An IO error occurred: '{0}'")]
-    IOError(#[from] io::Error),
+    #[fail(display = "An IO error occurred: '{}'", _0)]
+    IOError(#[cause] IOError),
     /// Data corruption.
-    #[error("Data corrupted: '{0:?}'")]
+    #[fail(display = "{:?}", _0)]
     DataCorruption(DataCorruption),
     /// A thread holding the lock panicked and poisoned the lock.
-    #[error("A thread holding the lock panicked and poisoned the lock")]
+    #[fail(display = "A thread holding the lock panicked and poisoned the lock")]
    Poisoned,
     /// Invalid argument was passed by the user.
-    #[error("An invalid argument was passed: '{0}'")]
+    #[fail(display = "An invalid argument was passed: '{}'", _0)]
     InvalidArgument(String),
     /// An Error happened in one of the threads.
-    #[error("An error occurred in a thread: '{0}'")]
+    #[fail(display = "An error occurred in a thread: '{}'", _0)]
     ErrorInThread(String),
     /// An Error appeared related to the schema.
-    #[error("Schema error: '{0}'")]
+    #[fail(display = "Schema error: '{}'", _0)]
     SchemaError(String),
     /// System error. (e.g.: We failed spawning a new thread)
-    #[error("System error.'{0}'")]
+    #[fail(display = "System error.'{}'", _0)]
     SystemError(String),
     /// Index incompatible with current version of tantivy
-    #[error("{0:?}")]
+    #[fail(display = "{:?}", _0)]
     IncompatibleIndex(Incompatibility),
 }
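The left-hand column is the `thiserror` form of this enum: `#[error(...)]` replaces `#[fail(display = ...)]`, and `#[from]` both records the cause and derives the corresponding `From` impl, which is why the hand-written conversions on the right-hand side of the next hunk exist at all. A self-contained sketch of that mechanic (`DemoError` is illustrative, not a tantivy type):

```rust
use thiserror::Error;

#[derive(Debug, Error)]
enum DemoError {
    // #[from] derives `impl From<std::io::Error> for DemoError`.
    #[error("An IO error occurred: '{0}'")]
    Io(#[from] std::io::Error),
    #[error("An invalid argument was passed: '{0}'")]
    InvalidArgument(String),
}

fn read_config(path: &str) -> Result<Vec<u8>, DemoError> {
    // `?` converts std::io::Error into DemoError::Io via the derived From impl.
    Ok(std::fs::read(path)?)
}
```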
@@ -94,17 +90,31 @@
         TantivyError::DataCorruption(data_corruption)
     }
 }

 impl From<FastFieldNotAvailableError> for TantivyError {
     fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
         TantivyError::SchemaError(format!("{}", fastfield_error))
     }
 }

 impl From<LockError> for TantivyError {
     fn from(lock_error: LockError) -> TantivyError {
         TantivyError::LockFailure(lock_error, None)
     }
 }

+impl From<IOError> for TantivyError {
+    fn from(io_error: IOError) -> TantivyError {
+        TantivyError::IOError(io_error)
+    }
+}
+
+impl From<io::Error> for TantivyError {
+    fn from(io_error: io::Error) -> TantivyError {
+        TantivyError::IOError(io_error.into())
+    }
+}
+
 impl From<query::QueryParserError> for TantivyError {
     fn from(parsing_error: query::QueryParserError) -> TantivyError {
         TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
@@ -117,12 +127,49 @@
     }
 }

+impl From<OpenReadError> for TantivyError {
+    fn from(error: OpenReadError) -> TantivyError {
+        match error {
+            OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath),
+            OpenReadError::IOError(io_error) => TantivyError::IOError(io_error),
+            OpenReadError::IncompatibleIndex(incompatibility) => {
+                TantivyError::IncompatibleIndex(incompatibility)
+            }
+        }
+    }
+}
+
 impl From<schema::DocParsingError> for TantivyError {
     fn from(error: schema::DocParsingError) -> TantivyError {
         TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error))
     }
 }

+impl From<OpenWriteError> for TantivyError {
+    fn from(error: OpenWriteError) -> TantivyError {
+        match error {
+            OpenWriteError::FileAlreadyExists(filepath) => {
+                TantivyError::FileAlreadyExists(filepath)
+            }
+            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
+        }
+    }
+}
+
+impl From<OpenDirectoryError> for TantivyError {
+    fn from(error: OpenDirectoryError) -> TantivyError {
+        match error {
+            OpenDirectoryError::DoesNotExist(directory_path) => {
+                TantivyError::PathDoesNotExist(directory_path)
+            }
+            OpenDirectoryError::NotADirectory(directory_path) => {
+                TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
+            }
+            OpenDirectoryError::IoError(err) => TantivyError::IOError(IOError::from(err)),
+        }
+    }
+}
+
 impl From<serde_json::Error> for TantivyError {
     fn from(error: serde_json::Error) -> TantivyError {
         let io_err = io::Error::from(error);
@@ -15,7 +15,7 @@ mod tests {
         let field = schema_builder.add_bytes_field("bytesfield");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
         index_writer.add_document(doc!(field=>vec![]));
         index_writer.add_document(doc!(field=>vec![255u8]));
@@ -9,8 +9,6 @@ use std::io::Write;
 /// Write a delete `BitSet`
 ///
 /// where `delete_bitset` is the set of deleted `DocId`.
-/// Warning: this function does not call terminate. The caller is in charge of
-/// closing the writer properly.
 pub fn write_delete_bitset(
     delete_bitset: &BitSet,
     max_doc: u32,
@@ -44,24 +42,6 @@ pub struct DeleteBitSet {
 }

 impl DeleteBitSet {
-    #[cfg(test)]
-    pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
-        use crate::directory::{Directory, RAMDirectory, TerminatingWrite};
-        use std::path::Path;
-        assert!(docs.iter().all(|&doc| doc < max_doc));
-        let mut bitset = BitSet::with_max_value(max_doc);
-        for &doc in docs {
-            bitset.insert(doc);
-        }
-        let mut directory = RAMDirectory::create();
-        let path = Path::new("dummydeletebitset");
-        let mut wrt = directory.open_write(path).unwrap();
-        write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
-        wrt.terminate().unwrap();
-        let source = directory.open_read(path).unwrap();
-        Self::open(source)
-    }
-
     /// Opens a delete bitset given its data source.
     pub fn open(data: ReadOnlySource) -> DeleteBitSet {
         let num_deleted: usize = data
@@ -103,35 +83,42 @@ impl HasLen for DeleteBitSet {
 #[cfg(test)]
 mod tests {
-    use super::DeleteBitSet;
-    use crate::common::HasLen;
+    use super::*;
+    use crate::directory::*;
+    use std::path::PathBuf;

-    #[test]
-    fn test_delete_bitset_empty() {
-        let delete_bitset = DeleteBitSet::for_test(&[], 10);
-        for doc in 0..10 {
-            assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
-        }
-        assert_eq!(delete_bitset.len(), 0);
+    fn test_delete_bitset_helper(bitset: &BitSet, max_doc: u32) {
+        let test_path = PathBuf::from("test");
+        let mut directory = RAMDirectory::create();
+        {
+            let mut writer = directory.open_write(&*test_path).unwrap();
+            write_delete_bitset(bitset, max_doc, &mut writer).unwrap();
+            writer.terminate().unwrap();
+        }
+        let source = directory.open_read(&test_path).unwrap();
+        let delete_bitset = DeleteBitSet::open(source);
+        for doc in 0..max_doc {
+            assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
+        }
+        assert_eq!(delete_bitset.len(), bitset.len());
     }

     #[test]
     fn test_delete_bitset() {
-        let delete_bitset = DeleteBitSet::for_test(&[1, 9], 10);
-        assert!(delete_bitset.is_alive(0));
-        assert!(delete_bitset.is_deleted(1));
-        assert!(delete_bitset.is_alive(2));
-        assert!(delete_bitset.is_alive(3));
-        assert!(delete_bitset.is_alive(4));
-        assert!(delete_bitset.is_alive(5));
-        assert!(delete_bitset.is_alive(6));
-        assert!(delete_bitset.is_alive(7));
-        assert!(delete_bitset.is_alive(8));
-        assert!(delete_bitset.is_deleted(9));
-        for doc in 0..10 {
-            assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
-        }
-        assert_eq!(delete_bitset.len(), 2);
+        {
+            let mut bitset = BitSet::with_max_value(10);
+            bitset.insert(1);
+            bitset.insert(9);
+            test_delete_bitset_helper(&bitset, 10);
+        }
+        {
+            let mut bitset = BitSet::with_max_value(8);
+            bitset.insert(1);
+            bitset.insert(2);
+            bitset.insert(3);
+            bitset.insert(5);
+            bitset.insert(7);
+            test_delete_bitset_helper(&bitset, 8);
+        }
     }
 }
@@ -4,8 +4,8 @@ use std::result;
 /// `FastFieldNotAvailableError` is returned when the
 /// user requested a fast field reader, and the field was not
 /// defined in the schema as a fast field.
-#[derive(Debug, Error)]
-#[error("Fast field not available: '{field_name:?}'")]
+#[derive(Debug, Fail)]
+#[fail(display = "Fast field not available: '{:?}'", field_name)]
 pub struct FastFieldNotAvailableError {
     field_name: String,
 }
@@ -474,7 +474,7 @@ mod tests {
         let date_field = schema_builder.add_date_field("date", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.set_merge_policy(Box::new(NoMergePolicy));
         index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
         index_writer.commit().unwrap();
@@ -511,7 +511,7 @@ mod tests {
         );
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.set_merge_policy(Box::new(NoMergePolicy));
         index_writer.add_document(doc!(
             date_field => crate::DateTime::from_u64(1i64.to_u64()),
@@ -25,7 +25,7 @@ mod tests {
         );
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(field=>1u64, field=>3u64));
         index_writer.add_document(doc!());
         index_writer.add_document(doc!(field=>4u64));
@@ -64,7 +64,7 @@
         schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         let first_time_stamp = chrono::Utc::now();
         index_writer.add_document(
             doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
@@ -186,7 +186,7 @@
         );
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(field=> 1i64, field => 3i64));
         index_writer.add_document(doc!());
         index_writer.add_document(doc!(field=> -4i64));
@@ -221,7 +221,7 @@
         let field = schema_builder.add_facet_field("facetfield");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         for i in 0..100_000 {
             index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
         }
@@ -74,7 +74,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index
-            .writer_for_tests()
+            .writer_with_num_threads(1, 30_000_000)
             .expect("Failed to create index writer.");
         index_writer.add_document(doc!(
             facet_field => Facet::from("/category/cat2"),
@@ -6,6 +6,7 @@ use crate::schema::{Document, Field};
 use crate::termdict::TermOrdinal;
 use crate::DocId;
 use fnv::FnvHashMap;
+use itertools::Itertools;
 use std::io;

@@ -150,8 +151,8 @@ impl MultiValueIntFastFieldWriter {
                 }
             }
             None => {
-                let val_min_max = crate::common::minmax(self.vals.iter().cloned());
-                let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
+                let val_min_max = self.vals.iter().cloned().minmax();
+                let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
                 value_serializer =
                     serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
                 for &val in &self.vals {
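The right-hand replacement leans on `Itertools::minmax`, which returns a `MinMaxResult` rather than an `Option` of a pair, hence the extra `into_option()` adapter before `unwrap_or`. Its three cases behave as follows:

```rust
use itertools::Itertools;

fn main() {
    // The usual case: distinct min and max.
    let vals = vec![3u64, 1, 4, 1, 5];
    let (min, max) = vals.iter().cloned().minmax().into_option().unwrap_or((0, 0));
    assert_eq!((min, max), (1, 5));

    // A single element is both the min and the max.
    assert_eq!(vec![42u64].into_iter().minmax().into_option(), Some((42, 42)));

    // No elements at all is where the (0u64, 0u64) fallback above kicks in.
    assert_eq!(Vec::<u64>::new().into_iter().minmax().into_option(), None);
}
```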
@@ -21,7 +21,7 @@ mod reader;
 mod serializer;
 mod writer;

-pub use self::reader::{FieldNormReader, FieldNormReaders};
+pub use self::reader::FieldNormReader;
 pub use self::serializer::FieldNormsSerializer;
 pub use self::writer::FieldNormsWriter;
@@ -1,41 +1,6 @@
 use super::{fieldnorm_to_id, id_to_fieldnorm};
-use crate::common::CompositeFile;
 use crate::directory::ReadOnlySource;
-use crate::schema::Field;
-use crate::space_usage::PerFieldSpaceUsage;
 use crate::DocId;
-use std::sync::Arc;
-
-/// Reader for the fieldnorm (for each document, the number of tokens indexed in the
-/// field) of all indexed fields in the index.
-///
-/// Each fieldnorm is approximately compressed over one byte. We refer to this byte as
-/// `fieldnorm_id`.
-/// The mapping from `fieldnorm` to `fieldnorm_id` is given by monotonic.
-#[derive(Clone)]
-pub struct FieldNormReaders {
-    data: Arc<CompositeFile>,
-}
-
-impl FieldNormReaders {
-    /// Creates a field norm reader.
-    pub fn open(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
-        let data = CompositeFile::open(&source)?;
-        Ok(FieldNormReaders {
-            data: Arc::new(data),
-        })
-    }
-
-    /// Returns the FieldNormReader for a specific field.
-    pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
-        self.data.open_read(field).map(FieldNormReader::open)
-    }
-
-    /// Return a break down of the space usage per field.
-    pub fn space_usage(&self) -> PerFieldSpaceUsage {
-        self.data.space_usage()
-    }
-}

 /// Reads the fieldnorm associated to a document.
 /// The fieldnorm represents the length associated to
@@ -54,7 +19,6 @@ impl FieldNormReaders {
 /// Apart from compression, this scale also makes it possible to
 /// precompute computationally expensive functions of the fieldnorm
 /// in a very short array.
-#[derive(Clone)]
 pub struct FieldNormReader {
     data: ReadOnlySource,
 }
@@ -65,11 +29,6 @@ impl FieldNormReader {
         FieldNormReader { data }
     }

-    /// Returns the number of documents in this segment.
-    pub fn num_docs(&self) -> u32 {
-        self.data.len() as u32
-    }
-
     /// Returns the `fieldnorm` associated to a doc id.
     /// The fieldnorm is a value approximating the number
     /// of tokens in a given field of the `doc_id`.
@@ -103,12 +62,13 @@
     pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
         fieldnorm_to_id(fieldnorm)
     }
+}

 #[cfg(test)]
-    pub fn for_test(field_norms: &[u32]) -> FieldNormReader {
+impl From<Vec<u32>> for FieldNormReader {
+    fn from(field_norms: Vec<u32>) -> FieldNormReader {
         let field_norms_id = field_norms
-            .iter()
-            .cloned()
+            .into_iter()
            .map(FieldNormReader::fieldnorm_to_id)
             .collect::<Vec<u8>>();
         let field_norms_data = ReadOnlySource::from(field_norms_id);
@@ -117,20 +77,3 @@
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use crate::fieldnorm::FieldNormReader;
-
-    #[test]
-    fn test_from_fieldnorms_array() {
-        let fieldnorms = &[1, 2, 3, 4, 1_000_000];
-        let fieldnorm_reader = FieldNormReader::for_test(fieldnorms);
-        assert_eq!(fieldnorm_reader.num_docs(), 5);
-        assert_eq!(fieldnorm_reader.fieldnorm(0), 1);
-        assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
-        assert_eq!(fieldnorm_reader.fieldnorm(2), 3);
-        assert_eq!(fieldnorm_reader.fieldnorm(3), 4);
-        assert_eq!(fieldnorm_reader.fieldnorm(4), 983_064);
-    }
-}
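The deleted test pins down the compression behaviour worth remembering: fieldnorms are squeezed into a single byte, exactly for small values and only approximately above that (1_000_000 decodes back to 983_064, the lower bound of its bucket). A sketch of the round trip, assuming the reader also exposes the `id_to_fieldnorm` decoder imported at the top of this file; both paths are assumptions:

```rust
use tantivy::fieldnorm::FieldNormReader; // path assumed

fn main() {
    // Small fieldnorms survive the one-byte encoding exactly...
    for norm in 1u32..=4 {
        let id = FieldNormReader::fieldnorm_to_id(norm);
        assert_eq!(FieldNormReader::id_to_fieldnorm(id), norm); // decoder assumed
    }
    // ...large ones only approximately: one byte cannot tell 1_000_000
    // apart from the rest of its bucket.
    let id = FieldNormReader::fieldnorm_to_id(1_000_000);
    assert_eq!(FieldNormReader::id_to_fieldnorm(id), 983_064);
}
```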
@@ -78,12 +78,11 @@ impl FieldNormsWriter {
     }

     /// Serialize the seen fieldnorm values to the serializer for all fields.
-    pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
+    pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
         for &field in self.fields.iter() {
             let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
             fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
         }
-        fieldnorms_serializer.close()?;
         Ok(())
     }
 }
@@ -10,7 +10,7 @@ use crate::core::SegmentMeta;
 use crate::core::SegmentReader;
 use crate::directory::TerminatingWrite;
 use crate::directory::{DirectoryLock, GarbageCollectionResult};
-use crate::docset::{DocSet, TERMINATED};
+use crate::docset::DocSet;
 use crate::error::TantivyError;
 use crate::fastfield::write_delete_bitset;
 use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
@@ -27,7 +27,6 @@ use crate::Opstamp;
 use crossbeam::channel;
 use futures::executor::block_on;
 use futures::future::Future;
-use slog::{error, info, Logger};
 use smallvec::smallvec;
 use smallvec::SmallVec;
 use std::mem;
@@ -113,15 +112,15 @@ fn compute_deleted_bitset(
         if let Some(mut docset) =
             inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
         {
-            let mut deleted_doc = docset.doc();
-            while deleted_doc != TERMINATED {
+            while docset.advance() {
+                let deleted_doc = docset.doc();
                 if deleted_doc < limit_doc {
                     delete_bitset.insert(deleted_doc);
                     might_have_changed = true;
                 }
-                deleted_doc = docset.advance();
             }
         }
         delete_cursor.advance();
     }
     Ok(might_have_changed)
@@ -156,8 +155,6 @@ pub(crate) fn advance_deletes(
         None => BitSet::with_max_value(max_doc),
     };

-    let num_deleted_docs_before = segment.meta().num_deleted_docs();
-
     compute_deleted_bitset(
         &mut delete_bitset,
         &segment_reader,
@@ -167,8 +164,6 @@
     )?;

     // TODO optimize
-    // It should be possible to do something smarter by manipulation bitsets directly
-    // to compute this union.
     if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
         for doc in 0u32..max_doc {
             if seg_delete_bitset.is_deleted(doc) {
@@ -177,9 +172,8 @@
         }
     }

-    let num_deleted_docs: u32 = delete_bitset.len() as u32;
-    if num_deleted_docs > num_deleted_docs_before {
-        // There are new deletes. We need to write a new delete file.
+    let num_deleted_docs = delete_bitset.len();
+    if num_deleted_docs > 0 {
         segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
         let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
         write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
@@ -196,21 +190,20 @@ fn index_documents(
     grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
     segment_updater: &mut SegmentUpdater,
     mut delete_cursor: DeleteCursor,
-    logger: &Logger,
 ) -> crate::Result<bool> {
     let schema = segment.schema();
-    info!(logger, "segment-index"; "stage"=>"start");
     let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
-    let mut buffer_limit_reached = false;
     for document_group in grouped_document_iterator {
         for doc in document_group {
             segment_writer.add_document(doc, &schema)?;
         }
         let mem_usage = segment_writer.mem_usage();
         if mem_usage >= memory_budget - MARGIN_IN_BYTES {
-            buffer_limit_reached = true;
+            info!(
+                "Buffer limit reached, flushing segment with maxdoc={}.",
+                segment_writer.max_doc()
+            );
             break;
         }
     }
@@ -230,14 +223,6 @@
     let segment_with_max_doc = segment.with_max_doc(max_doc);

     let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
-    info!(
-        logger,
-        "segment-index";
-        "stage" => "serialize",
-        "cause" => if buffer_limit_reached { "buffer-limit" } else { "commit" },
-        "maxdoc" => max_doc,
-        "last_docstamp" => last_docstamp
-    );

     let delete_bitset_opt = apply_deletes(
         &segment_with_max_doc,
@@ -251,18 +236,7 @@
         delete_cursor,
         delete_bitset_opt,
     );
-    info!(
-        logger,
-        "segment-index";
-        "stage" => "publish",
-    );
     block_on(segment_updater.schedule_add_segment(segment_entry))?;
-    info!(
-        logger,
-        "segment-index";
-        "stage" => "end",
-    );
     Ok(true)
 }
@@ -365,20 +339,14 @@ impl IndexWriter {
         Ok(index_writer)
     }

-    pub(crate) fn logger(&self) -> &Logger {
-        self.index.logger()
-    }
-
     fn drop_sender(&mut self) {
         let (sender, _receiver) = channel::bounded(1);
-        self.operation_sender = sender;
+        mem::replace(&mut self.operation_sender, sender);
     }

     /// If there are some merging threads, blocks until they all finish their work and
     /// then drop the `IndexWriter`.
     pub fn wait_merging_threads(mut self) -> crate::Result<()> {
-        info!(self.logger(), "wait-merge-threads"; "stage"=>"start");
         // this will stop the indexing thread,
         // dropping the last reference to the segment_updater.
         self.drop_sender();
@@ -399,9 +367,9 @@
             .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
         if let Err(ref e) = result {
-            error!(self.logger(), "some merge thread failed"; "cause"=>e.to_string());
+            error!("Some merging thread failed {:?}", e);
         }
-        info!(self.logger(), "wait-merge-threads"; "stage"=>"stop");
+
         result
     }
@@ -461,16 +429,12 @@
                     return Ok(());
                 }
                 let segment = index.new_segment();
-                let segment_id = segment.id();
                 index_documents(
                     mem_budget,
                     segment,
                     &mut document_iterator,
                     &mut segment_updater,
                     delete_cursor.clone(),
-                    &index
-                        .logger()
-                        .new(slog::o!("segment"=>segment_id.to_string())),
                 )?;
             }
         })?;
@@ -567,7 +531,6 @@
     /// when no documents are remaining.
     ///
     /// Returns the former segment_ready channel.
-    #[allow(unused_must_use)]
     fn recreate_document_channel(&mut self) -> OperationReceiver {
         let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
             channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
@@ -584,10 +547,7 @@
     ///
     /// The opstamp at the last commit is returned.
     pub fn rollback(&mut self) -> crate::Result<Opstamp> {
-        info!(
-            self.logger(),
-            "Rolling back to opstamp {}", self.committed_opstamp
-        );
+        info!("Rolling back to opstamp {}", self.committed_opstamp);
         // marks the segment updater as killed. From now on, all
         // segment updates will be ignored.
         self.segment_updater.kill();
@@ -610,7 +570,7 @@
         //
         // This will drop the document queue, and the thread
         // should terminate.
-        *self = new_index_writer;
+        mem::replace(self, new_index_writer);

         // Drains the document receiver pipeline :
         // Workers don't need to index the pending documents.
@@ -644,8 +604,6 @@
     /// using this API.
     /// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
     pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
-        let logger = self.logger().clone();
-
         // Here, because we join all of the worker threads,
         // all of the segment update for this commit have been
         // sent.
@@ -656,10 +614,7 @@
         //
         // This will move uncommitted segments to the state of
         // committed segments.
-        let commit_opstamp = self.stamper.stamp();
-        info!(logger, "prepare-commit"; "opstamp" => commit_opstamp);
+        info!("Preparing commit");

         // this will drop the current document channel
         // and recreate a new one.
@@ -675,8 +630,9 @@
             self.add_indexing_worker()?;
         }

+        let commit_opstamp = self.stamper.stamp();
         let prepared_commit = PreparedCommit::new(self, commit_opstamp);
-        info!(logger, "Prepared commit {}", commit_opstamp);
+        info!("Prepared commit {}", commit_opstamp);
         Ok(prepared_commit)
     }
@@ -838,7 +794,7 @@ mod tests {
         let mut schema_builder = schema::Schema::builder();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());
-        let index_writer = index.writer_for_tests().unwrap();
+        let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         let operations = vec![
             UserOperation::Add(doc!(text_field=>"a")),
             UserOperation::Add(doc!(text_field=>"b")),
@@ -847,46 +803,6 @@
         assert_eq!(batch_opstamp1, 2u64);
     }

-    #[test]
-    fn test_no_need_to_rewrite_delete_file_if_no_new_deletes() {
-        let mut schema_builder = schema::Schema::builder();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_for_tests().unwrap();
-        index_writer.add_document(doc!(text_field => "hello1"));
-        index_writer.add_document(doc!(text_field => "hello2"));
-        assert!(index_writer.commit().is_ok());
-        let reader = index.reader().unwrap();
-        let searcher = reader.searcher();
-        assert_eq!(searcher.segment_readers().len(), 1);
-        assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0);
-        index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
-        assert!(index_writer.commit().is_ok());
-        assert!(reader.reload().is_ok());
-        let searcher = reader.searcher();
-        assert_eq!(searcher.segment_readers().len(), 1);
-        assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
-        let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
-        // All docs containing hello1 have been already removed.
-        // We should not update the delete meta.
-        index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
-        assert!(index_writer.commit().is_ok());
-        assert!(reader.reload().is_ok());
-        let searcher = reader.searcher();
-        assert_eq!(searcher.segment_readers().len(), 1);
-        assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
-        let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
-        assert_eq!(after_delete_opstamp, previous_delete_opstamp);
-    }
-
     #[test]
     fn test_ordered_batched_operations() {
         // * one delete for `doc!(field=>"a")`
@@ -902,7 +818,7 @@
             .reload_policy(ReloadPolicy::Manual)
             .try_into()
             .unwrap();
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         let a_term = Term::from_field_text(text_field, "a");
         let b_term = Term::from_field_text(text_field, "b");
         let operations = vec![
@@ -964,8 +880,8 @@
     fn test_lockfile_already_exists_error_msg() {
         let schema_builder = schema::Schema::builder();
         let index = Index::create_in_ram(schema_builder.build());
-        let _index_writer = index.writer_for_tests().unwrap();
-        match index.writer_for_tests() {
+        let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        match index.writer_with_num_threads(1, 3_000_000) {
             Err(err) => {
                 let err_msg = err.to_string();
                 assert!(err_msg.contains("already an `IndexWriter`"));
@@ -1299,7 +1215,7 @@
         let idfield = schema_builder.add_text_field("id", STRING);
         schema_builder.add_text_field("optfield", STRING);
         let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(idfield=>"myid"));
         let commit = index_writer.commit();
         assert!(commit.is_ok());

View File

@@ -54,6 +54,10 @@ impl LogMergePolicy {
impl MergePolicy for LogMergePolicy { impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> { fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
if segments.is_empty() {
return Vec::new();
}
let mut size_sorted_tuples = segments let mut size_sorted_tuples = segments
.iter() .iter()
.map(SegmentMeta::num_docs) .map(SegmentMeta::num_docs)
@@ -63,35 +67,27 @@ impl MergePolicy for LogMergePolicy {
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1))); size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
if size_sorted_tuples.len() <= 1 {
return Vec::new();
}
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
.into_iter() .into_iter()
.map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2())) .map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
.collect(); .collect();
if let Some(&(first_ind, first_score)) = size_sorted_log_tuples.first() { let (first_ind, first_score) = size_sorted_log_tuples[0];
let mut current_max_log_size = first_score; let mut current_max_log_size = first_score;
let mut levels = vec![vec![first_ind]]; let mut levels = vec![vec![first_ind]];
for &(ind, score) in (&size_sorted_log_tuples).iter().skip(1) { for &(ind, score) in (&size_sorted_log_tuples).iter().skip(1) {
if score < (current_max_log_size - self.level_log_size) { if score < (current_max_log_size - self.level_log_size) {
current_max_log_size = score; current_max_log_size = score;
levels.push(Vec::new()); levels.push(Vec::new());
}
levels.last_mut().unwrap().push(ind);
} }
levels levels.last_mut().unwrap().push(ind);
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| {
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
})
.collect()
} else {
return vec![];
} }
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
.collect()
} }
} }
@@ -183,7 +179,6 @@ mod tests {
let result_list = test_merge_policy().compute_merge_candidates(&test_input); let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2); assert_eq!(result_list.len(), 2);
} }
#[test] #[test]
fn test_log_merge_policy_small_segments() { fn test_log_merge_policy_small_segments() {
// segments under min_layer_size are merged together // segments under min_layer_size are merged together
@@ -199,17 +194,6 @@ mod tests {
assert_eq!(result_list.len(), 1); assert_eq!(result_list.len(), 1);
} }
#[test]
fn test_log_merge_policy_all_segments_too_large_to_merge() {
let eight_large_segments: Vec<SegmentMeta> =
std::iter::repeat_with(|| create_random_segment_meta(100_001))
.take(8)
.collect();
assert!(test_merge_policy()
.compute_merge_candidates(&eight_large_segments)
.is_empty());
}
#[test] #[test]
fn test_large_merge_segments() { fn test_large_merge_segments() {
let test_input = vec![ let test_input = vec![

View File

@@ -2,37 +2,37 @@ use crate::common::MAX_DOC_LIMIT;
use crate::core::Segment; use crate::core::Segment;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::core::SerializableSegment; use crate::core::SerializableSegment;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::DocSet;
use crate::fastfield::BytesFastFieldReader; use crate::fastfield::BytesFastFieldReader;
use crate::fastfield::DeleteBitSet; use crate::fastfield::DeleteBitSet;
use crate::fastfield::FastFieldReader; use crate::fastfield::FastFieldReader;
use crate::fastfield::FastFieldSerializer; use crate::fastfield::FastFieldSerializer;
use crate::fastfield::MultiValueIntFastFieldReader; use crate::fastfield::MultiValueIntFastFieldReader;
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::FieldNormsSerializer; use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter; use crate::fieldnorm::FieldNormsWriter;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::indexer::SegmentSerializer; use crate::indexer::SegmentSerializer;
use crate::postings::InvertedIndexSerializer;
use crate::postings::Postings; use crate::postings::Postings;
use crate::postings::{InvertedIndexSerializer, SegmentPostings};
use crate::schema::Cardinality; use crate::schema::Cardinality;
use crate::schema::FieldType; use crate::schema::FieldType;
use crate::schema::{Field, Schema}; use crate::schema::{Field, Schema};
use crate::store::StoreWriter; use crate::store::StoreWriter;
use crate::termdict::TermMerger; use crate::termdict::TermMerger;
use crate::termdict::TermOrdinal; use crate::termdict::TermOrdinal;
use crate::{DocId, InvertedIndexReader, SegmentComponent}; use crate::DocId;
use itertools::Itertools;
use std::cmp; use std::cmp;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> { fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
let mut total_tokens = 0u64; let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256]; let mut count: [usize; 256] = [0; 256];
for reader in readers { for reader in readers {
if reader.has_deletes() { if reader.has_deletes() {
// if there are deletes, then we use an approximation // if there are deletes, then we use an approximation
// using the fieldnorm // using the fieldnorm
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?; let fieldnorms_reader = reader.get_fieldnorms_reader(field);
for doc in reader.doc_ids_alive() { for doc in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc); let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1; count[fieldnorm_id as usize] += 1;
@@ -41,7 +41,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::R
total_tokens += reader.inverted_index(field).total_num_tokens(); total_tokens += reader.inverted_index(field).total_num_tokens();
} }
} }
Ok(total_tokens total_tokens
+ count + count
.iter() .iter()
.cloned() .cloned()
@@ -49,7 +49,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::R
.map(|(fieldnorm_ord, count)| { .map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8)) count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
}) })
.sum::<u64>()) .sum::<u64>()
} }
pub struct IndexMerger { pub struct IndexMerger {
@@ -70,11 +70,11 @@ fn compute_min_max_val(
Some(delete_bitset) => { Some(delete_bitset) => {
// some deleted documents, // some deleted documents,
// we need to recompute the max / min // we need to recompute the max / min
crate::common::minmax( (0..max_doc)
(0..max_doc) .filter(|doc_id| delete_bitset.is_alive(*doc_id))
.filter(|doc_id| delete_bitset.is_alive(*doc_id)) .map(|doc_id| u64_reader.get(doc_id))
.map(|doc_id| u64_reader.get(doc_id)), .minmax()
) .into_option()
} }
None => { None => {
// no deleted documents, // no deleted documents,
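The recomputation of min/max over alive documents, which this hunk switches between `crate::common::minmax` and itertools' `minmax`, boils down to a single-pass fold. A minimal sketch with no dependency on either helper:

```rust
/// Single-pass (min, max); None for an empty iterator, matching the
/// Option both columns ultimately return.
fn min_max(values: impl Iterator<Item = u64>) -> Option<(u64, u64)> {
    values.fold(None, |acc, v| match acc {
        None => Some((v, v)),
        Some((min, max)) => Some((min.min(v), max.max(v))),
    })
}
```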
@@ -168,14 +168,14 @@ impl IndexMerger {
fn write_fieldnorms( fn write_fieldnorms(
&self, &self,
mut fieldnorms_serializer: FieldNormsSerializer, fieldnorms_serializer: &mut FieldNormsSerializer,
) -> crate::Result<()> { ) -> crate::Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema); let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize); let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
for field in fields { for field in fields {
fieldnorms_data.clear(); fieldnorms_data.clear();
for reader in &self.readers { for reader in &self.readers {
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?; let fieldnorms_reader = reader.get_fieldnorms_reader(field);
for doc_id in reader.doc_ids_alive() { for doc_id in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id); let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id);
fieldnorms_data.push(fieldnorm_id); fieldnorms_data.push(fieldnorm_id);
@@ -183,7 +183,6 @@ impl IndexMerger {
} }
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?; fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
} }
fieldnorms_serializer.close()?;
Ok(()) Ok(())
} }
@@ -494,11 +493,10 @@ impl IndexMerger {
indexed_field: Field, indexed_field: Field,
field_type: &FieldType, field_type: &FieldType,
serializer: &mut InvertedIndexSerializer, serializer: &mut InvertedIndexSerializer,
fieldnorm_reader: Option<FieldNormReader>,
) -> crate::Result<Option<TermOrdinalMapping>> { ) -> crate::Result<Option<TermOrdinalMapping>> {
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000); let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
let mut delta_computer = DeltaComputer::new(); let mut delta_computer = DeltaComputer::new();
let field_readers: Vec<Arc<InvertedIndexReader>> = self let field_readers = self
.readers .readers
.iter() .iter()
.map(|reader| reader.inverted_index(indexed_field)) .map(|reader| reader.inverted_index(indexed_field))
@@ -541,7 +539,7 @@ impl IndexMerger {
// The total number of tokens will only be exact when there have been no deletes. // The total number of tokens will only be exact when there have been no deletes.
// //
// Otherwise, we approximate by removing deleted documents proportionally. // Otherwise, we approximate by removing deleted documents proportionally.
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?; let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field);
// Create the total list of doc ids // Create the total list of doc ids
// by stacking the doc ids from the different segment. // by stacking the doc ids from the different segment.
@@ -553,8 +551,7 @@ impl IndexMerger {
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
// seg0.max_doc + seg1.max_doc + seg2.max_doc] // seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ... // ...
let mut field_serializer = let mut field_serializer = serializer.new_field(indexed_field, total_num_tokens)?;
serializer.new_field(indexed_field, total_num_tokens, fieldnorm_reader)?;
let field_entry = self.schema.get_field_entry(indexed_field); let field_entry = self.schema.get_field_entry(indexed_field);
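The doc-id stacking described in the comment above is easy to see in isolation. A hedged sketch (invented function; it ignores deletions, which in the real merger map some doc ids to `None`):

```rust
/// Shift each segment's doc ids by the cumulative max_doc of the
/// segments before it, as in the stacking example above.
fn stack_doc_ids(max_docs: &[u32]) -> Vec<Vec<u32>> {
    let mut offset = 0u32;
    let mut mappings: Vec<Vec<u32>> = Vec::new();
    for &max_doc in max_docs {
        mappings.push((0..max_doc).map(|doc| doc + offset).collect());
        offset += max_doc;
    }
    mappings
}
```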
@@ -564,78 +561,84 @@ impl IndexMerger {
indexed. Have you modified the schema?", indexed. Have you modified the schema?",
); );
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
while merged_terms.advance() { while merged_terms.advance() {
segment_postings_containing_the_term.clear();
let term_bytes: &[u8] = merged_terms.key(); let term_bytes: &[u8] = merged_terms.key();
let mut total_doc_freq = 0;
// Let's compute the list of non-empty posting lists // Let's compute the list of non-empty posting lists
for heap_item in merged_terms.current_kvs() { let segment_postings: Vec<_> = merged_terms
let segment_ord = heap_item.segment_ord; .current_kvs()
let term_info = heap_item.streamer.value(); .iter()
let segment_reader = &self.readers[heap_item.segment_ord]; .flat_map(|heap_item| {
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord]; let segment_ord = heap_item.segment_ord;
let segment_postings = let term_info = heap_item.streamer.value();
inverted_index.read_postings_from_terminfo(term_info, segment_postings_option); let segment_reader = &self.readers[heap_item.segment_ord];
let delete_bitset_opt = segment_reader.delete_bitset(); let inverted_index = segment_reader.inverted_index(indexed_field);
let doc_freq = if let Some(delete_bitset) = delete_bitset_opt { let mut segment_postings = inverted_index
segment_postings.doc_freq_given_deletes(delete_bitset) .read_postings_from_terminfo(term_info, segment_postings_option);
} else { while segment_postings.advance() {
segment_postings.doc_freq() if !segment_reader.is_deleted(segment_postings.doc()) {
}; return Some((segment_ord, segment_postings));
if doc_freq > 0u32 { }
total_doc_freq += doc_freq; }
segment_postings_containing_the_term.push((segment_ord, segment_postings)); None
} })
} .collect();
// At this point, `segment_postings` contains the posting list // At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term (and that are non-empty) // of all of the segments containing the given term.
// //
// These segments are non-empty and advance has already been called. // These segments are non-empty and advance has already been called.
if total_doc_freq == 0u32 { if !segment_postings.is_empty() {
// All docs that used to contain the term have been deleted. The `term` will be // If not, the `term` will be entirely removed.
// entirely removed.
continue;
}
let to_term_ord = field_serializer.new_term(term_bytes, total_doc_freq)?; // We know that there is at least one document containing
// the term, so we add it.
let to_term_ord = field_serializer.new_term(term_bytes)?;
if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt { if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
for (segment_ord, from_term_ord) in merged_terms.matching_segments() { for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord); term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
}
}
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in
segment_postings_containing_the_term.drain(..)
{
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
let mut doc = segment_postings.doc();
while doc != TERMINATED {
// deleted doc are skipped as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term iff
// there is at least one document.
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions)?;
} }
doc = segment_postings.advance();
} }
}
// closing the term. // We can now serialize this postings, by pushing each document to the
field_serializer.close_term()?; // postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
let doc = segment_postings.doc();
// `.advance()` has been called once before the loop.
//
// It was required to make sure we only consider segments
// that effectively contain at least one non-deleted document
// and remove terms that do not have documents associated.
//
// For this reason, we cannot use a `while segment_postings.advance()` loop.
// deleted doc are skipped as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term iff
// there is at least one document.
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(
remapped_doc_id,
term_freq,
delta_positions,
)?;
}
if !segment_postings.advance() {
break;
}
}
}
// closing the term.
field_serializer.close_term()?;
}
} }
field_serializer.close()?; field_serializer.close()?;
Ok(term_ord_mapping_opt) Ok(term_ord_mapping_opt)
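The two columns iterate postings differently: the left one reads `doc()` immediately and advances until a sentinel, the right one drives a boolean-returning `advance()` that has already been called once before the loop. A self-contained toy docset makes the sentinel style concrete; every name below is local to this sketch, not tantivy's API.

```rust
const TERMINATED: u32 = u32::MAX; // sentinel marking an exhausted docset

/// Toy docset over a sorted slice of doc ids.
struct SliceDocSet<'a> {
    docs: &'a [u32],
    cursor: usize,
}

impl<'a> SliceDocSet<'a> {
    /// Valid immediately after creation: points at the first doc.
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    /// Moves to the next doc and returns it (or TERMINATED).
    fn advance(&mut self) -> u32 {
        self.cursor += 1;
        self.doc()
    }
}

/// Collect the docs that survive a delete check, sentinel-style.
fn collect_alive(docset: &mut SliceDocSet, is_deleted: impl Fn(u32) -> bool) -> Vec<u32> {
    let mut alive = Vec::new();
    let mut doc = docset.doc();
    while doc != TERMINATED {
        if !is_deleted(doc) {
            alive.push(doc);
        }
        doc = docset.advance();
    }
    alive
}
```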
@@ -644,18 +647,13 @@ impl IndexMerger {
fn write_postings( fn write_postings(
&self, &self,
serializer: &mut InvertedIndexSerializer, serializer: &mut InvertedIndexSerializer,
fieldnorm_readers: FieldNormReaders,
) -> crate::Result<HashMap<Field, TermOrdinalMapping>> { ) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
let mut term_ordinal_mappings = HashMap::new(); let mut term_ordinal_mappings = HashMap::new();
for (field, field_entry) in self.schema.fields() { for (field, field_entry) in self.schema.fields() {
let fieldnorm_reader = fieldnorm_readers.get_field(field);
if field_entry.is_indexed() { if field_entry.is_indexed() {
if let Some(term_ordinal_mapping) = self.write_postings_for_field( if let Some(term_ordinal_mapping) =
field, self.write_postings_for_field(field, field_entry.field_type(), serializer)?
field_entry.field_type(), {
serializer,
fieldnorm_reader,
)? {
term_ordinal_mappings.insert(field, term_ordinal_mapping); term_ordinal_mappings.insert(field, term_ordinal_mapping);
} }
} }
@@ -681,15 +679,8 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger { impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> { fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() { let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(fieldnorms_serializer)?; self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
}
let fieldnorm_data = serializer
.segment()
.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let term_ord_mappings =
self.write_postings(serializer.get_postings_serializer(), fieldnorm_readers)?;
self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?; self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
self.write_storable_fields(serializer.get_store_writer())?; self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?; serializer.close()?;
@@ -699,15 +690,15 @@ impl SerializableSegment for IndexMerger {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::assert_nearly_equals;
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector}; use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
use crate::collector::{Count, FacetCollector}; use crate::collector::{Count, FacetCollector};
use crate::core::Index; use crate::core::Index;
use crate::query::AllQuery; use crate::query::AllQuery;
use crate::query::BooleanQuery; use crate::query::BooleanQuery;
use crate::query::Scorer;
use crate::query::TermQuery; use crate::query::TermQuery;
use crate::schema;
use crate::schema::Cardinality;
use crate::schema::Document; use crate::schema::Document;
use crate::schema::Facet; use crate::schema::Facet;
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
@@ -715,11 +706,9 @@ mod tests {
use crate::schema::Term; use crate::schema::Term;
use crate::schema::TextFieldIndexing; use crate::schema::TextFieldIndexing;
use crate::schema::INDEXED; use crate::schema::INDEXED;
use crate::schema::{Cardinality, TEXT};
use crate::DocAddress; use crate::DocAddress;
use crate::IndexWriter; use crate::IndexWriter;
use crate::Searcher; use crate::Searcher;
use crate::{schema, DocSet, SegmentId};
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use futures::executor::block_on; use futures::executor::block_on;
use std::io::Cursor; use std::io::Cursor;
@@ -751,7 +740,7 @@ mod tests {
}; };
{ {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
// writing the segment // writing the segment
{ {
@@ -803,7 +792,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed"); block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap(); index_writer.wait_merging_threads().unwrap();
} }
@@ -904,7 +893,7 @@ mod tests {
let score_field = schema_builder.add_u64_field("score", score_fieldtype); let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes"); let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let search_term = |searcher: &Searcher, term: Term| { let search_term = |searcher: &Searcher, term: Term| {
let collector = FastFieldTestCollector::for_field(score_field); let collector = FastFieldTestCollector::for_field(score_field);
@@ -1211,7 +1200,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
{ {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| { let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default(); let mut doc = Document::default();
for facet in doc_facets { for facet in doc_facets {
@@ -1276,7 +1265,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed"); block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap(); index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap(); reader.reload().unwrap();
@@ -1295,7 +1284,7 @@ mod tests {
// Deleting one term // Deleting one term
{ {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]); let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet); let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term); index_writer.delete_term(facet_term);
@@ -1320,7 +1309,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED); let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(int_field => 1u64)); index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed"); index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64)); index_writer.add_document(doc!(int_field => 1u64));
@@ -1349,7 +1338,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
{ {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_u64(int_field, 1); doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone()); index_writer.add_document(doc.clone());
@@ -1388,7 +1377,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| { let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default(); let mut doc = Document::default();
for &val in int_vals { for &val in int_vals {
@@ -1462,7 +1451,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok()); assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
} }
@@ -1516,7 +1505,7 @@ mod tests {
let index = Index::create_in_ram(builder.build()); let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_for_tests()?; let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
// Make sure we'll attempt to merge every created segment // Make sure we'll attempt to merge every created segment
let mut policy = crate::indexer::LogMergePolicy::default(); let mut policy = crate::indexer::LogMergePolicy::default();
@@ -1526,9 +1515,12 @@ mod tests {
for i in 0..100 { for i in 0..100 {
let mut doc = Document::new(); let mut doc = Document::new();
doc.add_f64(field, 42.0); doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24); doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27); doc.add_f64(multi_field, 0.27);
writer.add_document(doc); writer.add_document(doc);
if i % 5 == 0 { if i % 5 == 0 {
writer.commit()?; writer.commit()?;
} }
@@ -1540,72 +1532,6 @@ mod tests {
// If a merging thread fails, we should end up with more // If a merging thread fails, we should end up with more
// than one segment here // than one segment here
assert_eq!(1, index.searchable_segments()?.len()); assert_eq!(1, index.searchable_segments()?.len());
Ok(())
}
#[test]
fn test_merged_index_has_blockwand() -> crate::Result<()> {
let mut builder = schema::SchemaBuilder::new();
let text = builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_for_tests()?;
let happy_term = Term::from_field_text(text, "happy");
let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs);
for _ in 0..62 {
writer.add_document(doc!(text=>"hello happy tax payer"));
}
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let mut term_scorer = term_query
.specialized_weight(&searcher, true)
.specialized_scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(term_scorer.doc(), 0);
assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855);
assert_nearly_equals!(term_scorer.score(), 0.0079681855);
for _ in 0..81 {
writer.add_document(doc!(text=>"hello happy tax payer"));
}
writer.commit()?;
reader.reload()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
for segment_reader in searcher.segment_readers() {
let mut term_scorer = term_query
.specialized_weight(&searcher, true)
.specialized_scorer(segment_reader, 1.0)?;
// the difference compared to before is intrinsic to the bm25 formula. no worries there.
for doc in segment_reader.doc_ids_alive() {
assert_eq!(term_scorer.doc(), doc);
assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312);
assert_nearly_equals!(term_scorer.score(), 0.003478312);
term_scorer.advance();
}
}
let segment_ids: Vec<SegmentId> = searcher
.segment_readers()
.iter()
.map(|reader| reader.segment_id())
.collect();
block_on(writer.merge(&segment_ids[..]))?;
reader.reload()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0u32);
let mut term_scorer = term_query
.specialized_weight(&searcher, true)
.specialized_scorer(segment_reader, 1.0)?;
// the difference compared to before is intrinsic to the bm25 formula. no worries there.
for doc in segment_reader.doc_ids_alive() {
assert_eq!(term_scorer.doc(), doc);
assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312);
assert_nearly_equals!(term_scorer.score(), 0.003478312);
term_scorer.advance();
}
Ok(()) Ok(())
} }

View File

@@ -29,9 +29,8 @@ pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`. /// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy; pub type DefaultMergePolicy = LogMergePolicy;
#[cfg(feature = "mmap")]
#[cfg(test)] #[cfg(test)]
mod tests_mmap { mod tests {
use crate::schema::{self, Schema}; use crate::schema::{self, Schema};
use crate::{Index, Term}; use crate::{Index, Term};
@@ -40,7 +39,7 @@ mod tests_mmap {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build()).unwrap(); let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// there must be one deleted document in the segment // there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b")); index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b")); index_writer.delete_term(Term::from_field_text(text_field, "b"));

View File

@@ -1,7 +1,6 @@
use super::IndexWriter; use super::IndexWriter;
use crate::Opstamp; use crate::Opstamp;
use futures::executor::block_on; use futures::executor::block_on;
use slog::info;
/// A prepared commit /// A prepared commit
pub struct PreparedCommit<'a> { pub struct PreparedCommit<'a> {
@@ -32,7 +31,7 @@ impl<'a> PreparedCommit<'a> {
} }
pub fn commit(self) -> crate::Result<Opstamp> { pub fn commit(self) -> crate::Result<Opstamp> {
info!(self.index_writer.logger(), "committing {}", self.opstamp); info!("committing {}", self.opstamp);
let _ = block_on( let _ = block_on(
self.index_writer self.index_writer
.segment_updater() .segment_updater()

View File

@@ -1,5 +1,3 @@
use slog::{warn, Logger};
use super::segment_register::SegmentRegister; use super::segment_register::SegmentRegister;
use crate::core::SegmentId; use crate::core::SegmentId;
use crate::core::SegmentMeta; use crate::core::SegmentMeta;
@@ -44,9 +42,9 @@ impl SegmentRegisters {
/// ///
/// It guarantees the atomicity of the /// It guarantees the atomicity of the
/// changes (merges especially) /// changes (merges especially)
#[derive(Default)]
pub struct SegmentManager { pub struct SegmentManager {
registers: RwLock<SegmentRegisters>, registers: RwLock<SegmentRegisters>,
logger: Logger,
} }
impl Debug for SegmentManager { impl Debug for SegmentManager {
@@ -79,14 +77,12 @@ impl SegmentManager {
pub fn from_segments( pub fn from_segments(
segment_metas: Vec<SegmentMeta>, segment_metas: Vec<SegmentMeta>,
delete_cursor: &DeleteCursor, delete_cursor: &DeleteCursor,
logger: Logger,
) -> SegmentManager { ) -> SegmentManager {
SegmentManager { SegmentManager {
registers: RwLock::new(SegmentRegisters { registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(), uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor), committed: SegmentRegister::new(segment_metas, delete_cursor),
}), }),
logger,
} }
} }
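The atomicity noted in the doc comment above comes from keeping both registers behind a single lock, so a segment moves between `uncommitted` and `committed` in one critical section. A minimal sketch of that shape, with the register types reduced to plain vectors:

```rust
use std::sync::RwLock;

#[derive(Default)]
struct Registers {
    uncommitted: Vec<u32>, // stand-ins for SegmentRegister
    committed: Vec<u32>,
}

struct ManagerSketch {
    registers: RwLock<Registers>,
}

impl ManagerSketch {
    /// Promote every uncommitted segment atomically: readers never observe
    /// a segment that is in neither register.
    fn commit_all(&self) {
        let mut regs = self.registers.write().unwrap();
        let moved: Vec<u32> = regs.uncommitted.drain(..).collect();
        regs.committed.extend(moved);
    }
}
```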
@@ -190,7 +186,7 @@ impl SegmentManager {
let segments_status = registers_lock let segments_status = registers_lock
.segments_status(before_merge_segment_ids) .segments_status(before_merge_segment_ids)
.ok_or_else(|| { .ok_or_else(|| {
warn!(self.logger, "couldn't find segment in SegmentManager"); warn!("couldn't find segment in SegmentManager");
crate::TantivyError::InvalidArgument( crate::TantivyError::InvalidArgument(
"The segments that were merged could not be found in the SegmentManager. \ "The segments that were merged could not be found in the SegmentManager. \
This is not necessarily a bug, and can happen after a rollback for instance." This is not necessarily a bug, and can happen after a rollback for instance."

View File

@@ -8,16 +8,15 @@ use crate::store::StoreWriter;
/// Segment serializer is in charge of laying out on disk /// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`. /// the data accumulated and sorted by the `SegmentWriter`.
pub struct SegmentSerializer { pub struct SegmentSerializer {
segment: Segment,
store_writer: StoreWriter, store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer, fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: Option<FieldNormsSerializer>, fieldnorms_serializer: FieldNormsSerializer,
postings_serializer: InvertedIndexSerializer, postings_serializer: InvertedIndexSerializer,
} }
impl SegmentSerializer { impl SegmentSerializer {
/// Creates a new `SegmentSerializer`. /// Creates a new `SegmentSerializer`.
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> { pub fn for_segment(segment: &mut Segment) -> crate::Result<SegmentSerializer> {
let store_write = segment.open_write(SegmentComponent::STORE)?; let store_write = segment.open_write(SegmentComponent::STORE)?;
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?; let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
@@ -26,20 +25,15 @@ impl SegmentSerializer {
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?; let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?; let postings_serializer = InvertedIndexSerializer::open(segment)?;
Ok(SegmentSerializer { Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write), store_writer: StoreWriter::new(store_write),
fast_field_serializer, fast_field_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer), fieldnorms_serializer,
postings_serializer, postings_serializer,
}) })
} }
pub fn segment(&self) -> &Segment {
&self.segment
}
/// Accessor to the `PostingsSerializer`. /// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer { pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer &mut self.postings_serializer
@@ -50,11 +44,9 @@ impl SegmentSerializer {
&mut self.fast_field_serializer &mut self.fast_field_serializer
} }
/// Extract the field norm serializer. /// Accessor to the field norm serializer.
/// pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
/// Note the fieldnorms serializer can only be extracted once. &mut self.fieldnorms_serializer
pub fn extract_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
self.fieldnorms_serializer.take()
} }
/// Accessor to the `StoreWriter`. /// Accessor to the `StoreWriter`.
@@ -63,13 +55,11 @@ impl SegmentSerializer {
} }
/// Finalize the segment serialization. /// Finalize the segment serialization.
pub fn close(mut self) -> crate::Result<()> { pub fn close(self) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?; self.fast_field_serializer.close()?;
self.postings_serializer.close()?; self.postings_serializer.close()?;
self.store_writer.close()?; self.store_writer.close()?;
self.fieldnorms_serializer.close()?;
Ok(()) Ok(())
} }
} }
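The left column's `extract_fieldnorms_serializer` is the take-once pattern: the serializer sits in an `Option` so ownership can be moved out at most once, and `close()` flushes it only if nobody extracted it first. A hedged sketch with a stand-in type:

```rust
/// Illustrative only; `Vec<u8>` stands in for FieldNormsSerializer.
struct SerializerSketch {
    fieldnorms: Option<Vec<u8>>,
}

impl SerializerSketch {
    /// Moves the inner serializer out; a second call returns None.
    fn extract_fieldnorms(&mut self) -> Option<Vec<u8>> {
        self.fieldnorms.take()
    }

    /// Flushes the fieldnorms only if they were never extracted.
    fn close(mut self) -> std::io::Result<()> {
        if let Some(fieldnorms) = self.extract_fieldnorms() {
            drop(fieldnorms); // the real code would close/flush here
        }
        Ok(())
    }
}
```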

View File

@@ -23,9 +23,10 @@ use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder}; use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::Future; use futures::future::Future;
use futures::future::TryFutureExt; use futures::future::TryFutureExt;
use slog::{debug, error, info, warn}; use serde_json;
use std::borrow::BorrowMut; use std::borrow::BorrowMut;
use std::collections::HashSet; use std::collections::HashSet;
use std::io::Write;
use std::ops::Deref; use std::ops::Deref;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, Ordering};
@@ -65,11 +66,12 @@ pub fn save_new_metas(schema: Schema, directory: &mut dyn Directory) -> crate::R
/// ///
/// This method is not part of tantivy's public API /// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &mut dyn Directory) -> crate::Result<()> { fn save_metas(metas: &IndexMeta, directory: &mut dyn Directory) -> crate::Result<()> {
let mut meta_json = serde_json::to_string_pretty(metas)?; info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer. // Just adding a new line at the end of the buffer.
meta_json.push_str("\n"); writeln!(&mut buffer)?;
debug!(directory.logger(), "save meta"; "content"=>&meta_json); directory.atomic_write(&META_FILEPATH, &buffer[..])?;
directory.atomic_write(&META_FILEPATH, meta_json.as_bytes())?; debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
Ok(()) Ok(())
} }
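Both columns produce pretty-printed JSON with a trailing newline and hand it to `atomic_write`. The right column's buffer variant reduces to the sketch below; `encode_metas` is an invented helper, while the `serde_json` calls are real.

```rust
use std::io::Write;

/// Hypothetical helper: pretty JSON plus the trailing newline added above.
fn encode_metas<T: serde::Serialize>(metas: &T) -> serde_json::Result<Vec<u8>> {
    let mut buffer = serde_json::to_vec_pretty(metas)?;
    writeln!(&mut buffer).expect("writes to a Vec<u8> cannot fail");
    Ok(buffer)
}
```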
@@ -96,6 +98,7 @@ impl Deref for SegmentUpdater {
async fn garbage_collect_files( async fn garbage_collect_files(
segment_updater: SegmentUpdater, segment_updater: SegmentUpdater,
) -> crate::Result<GarbageCollectionResult> { ) -> crate::Result<GarbageCollectionResult> {
info!("Running garbage collection");
let mut index = segment_updater.index.clone(); let mut index = segment_updater.index.clone();
index index
.directory_mut() .directory_mut()
@@ -105,12 +108,14 @@ async fn garbage_collect_files(
/// Merges the list of segments given in the `segment_entries`. /// Merges the list of segments given in the `segment_entries`.
/// This function happens in the calling thread and is computationally expensive. /// This function happens in the calling thread and is computationally expensive.
fn merge( fn merge(
merged_segment: Segment,
index: &Index, index: &Index,
mut segment_entries: Vec<SegmentEntry>, mut segment_entries: Vec<SegmentEntry>,
target_opstamp: Opstamp, target_opstamp: Opstamp,
) -> crate::Result<SegmentEntry> { ) -> crate::Result<SegmentEntry> {
// First we apply all of the deletes to the merged segment, up to the target opstamp. // first we need to apply deletes to our segment.
let mut merged_segment = index.new_segment();
// First we apply all of the deletes to the merged segment, up to the target opstamp.
for segment_entry in &mut segment_entries { for segment_entry in &mut segment_entries {
let segment = index.segment(segment_entry.meta().clone()); let segment = index.segment(segment_entry.meta().clone());
advance_deletes(segment, segment_entry, target_opstamp)?; advance_deletes(segment, segment_entry, target_opstamp)?;
@@ -127,13 +132,12 @@ fn merge(
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?; let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
// ... we just serialize this index merger in our new segment to merge the two segments. // ... we just serialize this index merger in our new segment to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?; let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
let num_docs = merger.write(segment_serializer)?; let num_docs = merger.write(segment_serializer)?;
let merged_segment_id = merged_segment.id(); let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
let segment_meta = index.new_segment_meta(merged_segment_id, num_docs);
Ok(SegmentEntry::new(segment_meta, delete_cursor, None)) Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
} }
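Why advance deletes before merging? Until every input segment has applied its deletes up to one shared opstamp, the inputs disagree about which documents still exist. A toy model of that ordering, with all types invented for the sketch:

```rust
struct SegmentSketch {
    docs: Vec<u32>,
    pending_deletes: Vec<(u64, u32)>, // (opstamp, doc id)
}

impl SegmentSketch {
    /// Apply every delete with opstamp <= target; keep the rest pending.
    fn advance_deletes(&mut self, target_opstamp: u64) {
        let doomed: Vec<u32> = self
            .pending_deletes
            .iter()
            .filter(|&&(opstamp, _)| opstamp <= target_opstamp)
            .map(|&(_, doc)| doc)
            .collect();
        self.docs.retain(|doc| !doomed.contains(doc));
        self.pending_deletes
            .retain(|&(opstamp, _)| opstamp > target_opstamp);
    }
}

/// Stack the surviving documents of all segments into one merged "segment".
fn merge_sketch(segments: &mut [SegmentSketch], target_opstamp: u64) -> Vec<u32> {
    for segment in segments.iter_mut() {
        segment.advance_deletes(target_opstamp);
    }
    segments
        .iter()
        .flat_map(|segment| segment.docs.iter().copied())
        .collect()
}
```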
@@ -163,8 +167,7 @@ impl SegmentUpdater {
delete_cursor: &DeleteCursor, delete_cursor: &DeleteCursor,
) -> crate::Result<SegmentUpdater> { ) -> crate::Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?; let segments = index.searchable_segment_metas()?;
let segment_manager = let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
SegmentManager::from_segments(segments, delete_cursor, index.logger().clone());
let pool = ThreadPoolBuilder::new() let pool = ThreadPoolBuilder::new()
.name_prefix("segment_updater") .name_prefix("segment_updater")
.pool_size(1) .pool_size(1)
@@ -384,18 +387,7 @@ impl SegmentUpdater {
.segment_manager .segment_manager
.start_merge(merge_operation.segment_ids())?; .start_merge(merge_operation.segment_ids())?;
let segment_ids_str: String = merge_operation info!("Starting merge - {:?}", merge_operation.segment_ids());
.segment_ids()
.iter()
.map(|segment_id| segment_id.to_string())
.collect::<Vec<String>>()
.join(",");
let merged_segment = self.index.new_segment();
let logger = self.index.logger().new(slog::o!("segments"=>segment_ids_str, "merged-segment"=>merged_segment.id().to_string()));
let num_merges: usize = self.merge_operations.list().len();
slog::info!(&logger, "merge"; "stage"=>"start", "num-merges" => num_merges);
let (merging_future_send, merging_future_recv) = let (merging_future_send, merging_future_recv) =
oneshot::channel::<crate::Result<SegmentMeta>>(); oneshot::channel::<crate::Result<SegmentMeta>>();
@@ -406,20 +398,22 @@ impl SegmentUpdater {
// as well as which segment is currently in merge and therefore should not be // as well as which segment is currently in merge and therefore should not be
// candidate for another merge. // candidate for another merge.
match merge( match merge(
merged_segment,
&segment_updater.index, &segment_updater.index,
segment_entries, segment_entries,
merge_operation.target_opstamp(), merge_operation.target_opstamp(),
) { ) {
Ok(after_merge_segment_entry) => { Ok(after_merge_segment_entry) => {
info!(&logger, "merge"; "stage" => "end");
let segment_meta = segment_updater let segment_meta = segment_updater
.end_merge(merge_operation, after_merge_segment_entry) .end_merge(merge_operation, after_merge_segment_entry)
.await; .await;
let _send_result = merging_future_send.send(segment_meta); let _send_result = merging_future_send.send(segment_meta);
} }
Err(e) => { Err(e) => {
error!(&logger, "merge"; "stage" => "fail", "cause"=>e.to_string()); warn!(
"Merge of {:?} was cancelled: {:?}",
merge_operation.segment_ids().to_vec(),
e
);
// ... cancel merge // ... cancel merge
if cfg!(test) { if cfg!(test) {
panic!("Merge failed."); panic!("Merge failed.");
@@ -460,12 +454,11 @@ impl SegmentUpdater {
.collect::<Vec<_>>(); .collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter()); merge_candidates.extend(committed_merge_candidates.into_iter());
let logger = self.index.logger();
for merge_operation in merge_candidates { for merge_operation in merge_candidates {
if let Err(err) = self.start_merge(merge_operation) { if let Err(err) = self.start_merge(merge_operation) {
warn!( warn!(
logger, "Starting the merge failed for the following reason. This is not fatal. {}",
"merge-start-fail (not fatal, not necessarily a problem)"; "reason" => format!("{}", err), err
); );
} }
} }
@@ -478,11 +471,8 @@ impl SegmentUpdater {
) -> impl Future<Output = crate::Result<SegmentMeta>> { ) -> impl Future<Output = crate::Result<SegmentMeta>> {
let segment_updater = self.clone(); let segment_updater = self.clone();
let after_merge_segment_meta = after_merge_segment_entry.meta().clone(); let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
let logger = self.index.logger().new(
slog::o!("segment"=>after_merge_segment_meta.id().to_string(),
"delete-opstamp"=>after_merge_segment_meta.delete_opstamp()),
);
let end_merge_future = self.schedule_future(async move { let end_merge_future = self.schedule_future(async move {
info!("End merge {:?}", after_merge_segment_entry.meta());
{ {
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone(); let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() { if let Some(delete_operation) = delete_cursor.get() {
@@ -496,7 +486,6 @@ impl SegmentUpdater {
committed_opstamp, committed_opstamp,
) { ) {
error!( error!(
logger,
"Merge of {:?} was cancelled (advancing deletes failed): {:?}", "Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(), merge_operation.segment_ids(),
e e
@@ -533,7 +522,7 @@ impl SegmentUpdater {
/// ///
/// Upon termination of the current merging threads, /// Upon termination of the current merging threads,
/// merge opportunity may appear. /// merge opportunity may appear.
/// //
/// We keep waiting until the merge policy judges that /// We keep waiting until the merge policy judges that
/// no opportunity is available. /// no opportunity is available.
/// ///
@@ -566,7 +555,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.set_merge_policy(Box::new(MergeWheneverPossible)); index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{ {
@@ -619,7 +608,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
for _ in 0..100 { for _ in 0..100 {
@@ -690,7 +679,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
for _ in 0..100 { for _ in 0..100 {

View File

@@ -1,7 +1,8 @@
use super::operation::AddOperation; use super::operation::AddOperation;
use crate::core::Segment;
use crate::core::SerializableSegment; use crate::core::SerializableSegment;
use crate::fastfield::FastFieldsWriter; use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; use crate::fieldnorm::FieldNormsWriter;
use crate::indexer::segment_serializer::SegmentSerializer; use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::compute_table_size; use crate::postings::compute_table_size;
use crate::postings::MultiFieldPostingsWriter; use crate::postings::MultiFieldPostingsWriter;
@@ -13,10 +14,10 @@ use crate::schema::{Field, FieldEntry};
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream}; use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::DocId;
use crate::Opstamp; use crate::Opstamp;
use crate::{core::Segment, tokenizer::MAX_TOKEN_LEN};
use crate::{DocId, SegmentComponent};
use std::io; use std::io;
use std::str;
/// Computes the initial size of the hash table. /// Computes the initial size of the hash table.
/// ///
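Only the doc comment of `initial_table_size` is visible here, but the idea it describes is to pick the largest power-of-two table that still fits the memory budget. A hedged sketch; the per-bucket cost is an invented parameter, not tantivy's actual constant.

```rust
/// Illustrative: largest `num_bits` such that a table of 2^(num_bits + 1)
/// buckets would no longer fit in the budget.
fn initial_table_num_bits(memory_budget: usize, bytes_per_bucket: usize) -> usize {
    let mut num_bits = 1;
    while num_bits < 32 && (1usize << (num_bits + 1)) * bytes_per_bucket <= memory_budget {
        num_bits += 1;
    }
    num_bits
}
```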
@@ -47,7 +48,6 @@ pub struct SegmentWriter {
fieldnorms_writer: FieldNormsWriter, fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>, doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<TextAnalyzer>>, tokenizers: Vec<Option<TextAnalyzer>>,
term_buffer: Term,
} }
impl SegmentWriter { impl SegmentWriter {
@@ -62,12 +62,11 @@ impl SegmentWriter {
/// - schema /// - schema
pub fn for_segment( pub fn for_segment(
memory_budget: usize, memory_budget: usize,
segment: Segment, mut segment: Segment,
schema: &Schema, schema: &Schema,
) -> crate::Result<SegmentWriter> { ) -> crate::Result<SegmentWriter> {
let tokenizer_manager = segment.index().tokenizers().clone();
let table_num_bits = initial_table_size(memory_budget)?; let table_num_bits = initial_table_size(memory_budget)?;
let segment_serializer = SegmentSerializer::for_segment(segment)?; let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits); let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let tokenizers = schema let tokenizers = schema
.fields() .fields()
@@ -77,7 +76,7 @@ impl SegmentWriter {
.get_indexing_options() .get_indexing_options()
.and_then(|text_index_option| { .and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer(); let tokenizer_name = &text_index_option.tokenizer();
tokenizer_manager.get(tokenizer_name) segment.index().tokenizers().get(tokenizer_name)
}), }),
_ => None, _ => None,
}, },
@@ -91,7 +90,6 @@ impl SegmentWriter {
fast_field_writers: FastFieldsWriter::from_schema(schema), fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000), doc_opstamps: Vec::with_capacity(1_000),
tokenizers, tokenizers,
term_buffer: Term::new(),
}) })
} }
@@ -129,29 +127,24 @@ impl SegmentWriter {
if !field_options.is_indexed() { if !field_options.is_indexed() {
continue; continue;
} }
let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings);
match *field_options.field_type() { match *field_options.field_type() {
FieldType::HierarchicalFacet => { FieldType::HierarchicalFacet => {
term_buffer.set_field(field); let facets: Vec<&str> = field_values
let facets = .iter()
field_values .flat_map(|field_value| match *field_value.value() {
.iter() Value::Facet(ref facet) => Some(facet.encoded_str()),
.flat_map(|field_value| match *field_value.value() { _ => {
Value::Facet(ref facet) => Some(facet.encoded_str()), panic!("Expected hierarchical facet");
_ => { }
panic!("Expected hierarchical facet"); })
} .collect();
}); let mut term = Term::for_field(field); // we set the Term
for fake_str in facets { for fake_str in facets {
let mut unordered_term_id_opt = None; let mut unordered_term_id_opt = None;
FacetTokenizer.token_stream(fake_str).process(&mut |token| { FacetTokenizer.token_stream(fake_str).process(&mut |token| {
if token.text.len() > MAX_TOKEN_LEN { term.set_text(&token.text);
return;
}
term_buffer.set_text(&token.text);
let unordered_term_id = let unordered_term_id =
multifield_postings.subscribe(doc_id, &term_buffer); self.multifield_postings.subscribe(doc_id, &term);
unordered_term_id_opt = Some(unordered_term_id); unordered_term_id_opt = Some(unordered_term_id);
}); });
if let Some(unordered_term_id) = unordered_term_id_opt { if let Some(unordered_term_id) = unordered_term_id_opt {
@@ -174,6 +167,7 @@ impl SegmentWriter {
if let Some(last_token) = tok_str.tokens.last() { if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to; total_offset += last_token.offset_to;
} }
token_streams token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into()); .push(PreTokenizedStream::from(tok_str.clone()).into());
} }
@@ -183,6 +177,7 @@ impl SegmentWriter {
{ {
offsets.push(total_offset); offsets.push(total_offset);
total_offset += text.len(); total_offset += text.len();
token_streams.push(tokenizer.token_stream(text)); token_streams.push(tokenizer.token_stream(text));
} }
} }
@@ -194,12 +189,8 @@ impl SegmentWriter {
0 0
} else { } else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams); let mut token_stream = TokenStreamChain::new(offsets, token_streams);
multifield_postings.index_text( self.multifield_postings
doc_id, .index_text(doc_id, field, &mut token_stream)
field,
&mut token_stream,
term_buffer,
)
}; };
self.fieldnorms_writer.record(doc_id, field, num_tokens); self.fieldnorms_writer.record(doc_id, field, num_tokens);
@@ -207,36 +198,44 @@ impl SegmentWriter {
FieldType::U64(ref int_option) => { FieldType::U64(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(field_value.field()); let term = Term::from_field_u64(
term_buffer.set_u64(field_value.value().u64_value()); field_value.field(),
multifield_postings.subscribe(doc_id, &term_buffer); field_value.value().u64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
} }
} }
} }
FieldType::Date(ref int_option) => { FieldType::Date(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(field_value.field()); let term = Term::from_field_i64(
term_buffer.set_i64(field_value.value().date_value().timestamp()); field_value.field(),
multifield_postings.subscribe(doc_id, &term_buffer); field_value.value().date_value().timestamp(),
);
self.multifield_postings.subscribe(doc_id, &term);
} }
} }
} }
FieldType::I64(ref int_option) => { FieldType::I64(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(field_value.field()); let term = Term::from_field_i64(
term_buffer.set_i64(field_value.value().i64_value()); field_value.field(),
multifield_postings.subscribe(doc_id, &term_buffer); field_value.value().i64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
} }
} }
} }
FieldType::F64(ref int_option) => { FieldType::F64(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {
term_buffer.set_field(field_value.field()); let term = Term::from_field_f64(
term_buffer.set_f64(field_value.value().f64_value()); field_value.field(),
multifield_postings.subscribe(doc_id, &term_buffer); field_value.value().f64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
} }
} }
} }
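The recurring change in this match is allocation strategy: the right column builds a fresh `Term` per field value, while the left reuses a single `term_buffer`, resetting field and value before each `subscribe`. The reuse pattern in isolation; the byte encoding and names below are stand-ins, not tantivy's actual `Term` layout.

```rust
/// Illustrative buffer reuse: one allocation shared across all values.
fn index_u64_values(values: &[(u32, u64)], mut subscribe: impl FnMut(&[u8])) {
    let mut term_buffer: Vec<u8> = Vec::new();
    for &(field, value) in values {
        term_buffer.clear();
        term_buffer.extend_from_slice(&field.to_be_bytes()); // stand-in field prefix
        term_buffer.extend_from_slice(&value.to_be_bytes()); // stand-in payload
        subscribe(&term_buffer);
    }
}
```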
@@ -281,16 +280,9 @@ fn write(
fieldnorms_writer: &FieldNormsWriter, fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer, mut serializer: SegmentSerializer,
) -> crate::Result<()> { ) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() { let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
fieldnorms_writer.serialize(fieldnorms_serializer)?;
}
let fieldnorm_data = serializer
.segment()
.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let term_ord_map =
multifield_postings.serialize(serializer.get_postings_serializer(), fieldnorm_readers)?;
fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?; fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
serializer.close()?; serializer.close()?;
Ok(()) Ok(())
} }

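Aside on the segment-writer hunk above: the left-hand code reuses a single scratch `term_buffer` for every indexed value, while the right-hand code allocates a fresh `Term` per value. A minimal sketch of the buffer-reuse pattern (a toy type, not tantivy's real `Term` layout):

```rust
// Toy stand-in for tantivy's Term: a 4-byte field prefix followed by the
// value bytes, rebuilt in place instead of reallocated per value.
struct TermBuffer {
    bytes: Vec<u8>,
}

impl TermBuffer {
    fn set_field(&mut self, field: u32) {
        self.bytes.clear();
        self.bytes.extend_from_slice(&field.to_be_bytes());
    }
    fn set_u64(&mut self, val: u64) {
        self.bytes.truncate(4); // keep the field prefix, drop the old value
        self.bytes.extend_from_slice(&val.to_be_bytes());
    }
}

fn main() {
    let mut term_buffer = TermBuffer { bytes: Vec::new() };
    for val in [3u64, 17, 42].iter() {
        term_buffer.set_field(1);
        term_buffer.set_u64(*val);
        // multifield_postings.subscribe(doc_id, &term_buffer) would go here.
        assert_eq!(term_buffer.bytes.len(), 12);
    }
}
```

The buffer-reuse shape trades a little bookkeeping for one fewer heap allocation per indexed value, which adds up on hot indexing paths.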
View File

@@ -98,11 +98,17 @@
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) / //! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/tantivy-search/tantivy/blob/master/examples/basic_search.rs)) //! [source code](https://github.com/tantivy-search/tantivy/blob/master/examples/basic_search.rs))
#[macro_use]
extern crate serde_derive;
#[cfg_attr(test, macro_use)] #[cfg_attr(test, macro_use)]
extern crate serde_json; extern crate serde_json;
#[macro_use] #[macro_use]
extern crate thiserror; extern crate log;
#[macro_use]
extern crate failure;
#[cfg(all(test, feature = "unstable"))] #[cfg(all(test, feature = "unstable"))]
extern crate test; extern crate test;
@@ -145,7 +151,6 @@ pub mod schema;
pub mod space_usage; pub mod space_usage;
pub mod store; pub mod store;
pub mod termdict; pub mod termdict;
pub use slog;
mod reader; mod reader;
@@ -154,7 +159,7 @@ mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator}; pub use self::snippet::{Snippet, SnippetGenerator};
mod docset; mod docset;
pub use self::docset::{DocSet, TERMINATED}; pub use self::docset::{DocSet, SkipResult};
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
pub use crate::core::{Executor, SegmentComponent}; pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta}; pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
@@ -168,10 +173,9 @@ pub use crate::schema::{Document, Term};
use std::fmt; use std::fmt;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
/// Index format version. /// Index format version.
const INDEX_FORMAT_VERSION: u32 = 2; const INDEX_FORMAT_VERSION: u32 = 1;
/// Structure version for the index. /// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -243,10 +247,11 @@ pub type DocId = u32;
/// with opstamp `n+1`. /// with opstamp `n+1`.
pub type Opstamp = u64; pub type Opstamp = u64;
/// A Score that represents the relevance of the document to the query /// A f32 that represents the relevance of the document to the query
/// ///
/// This is modelled internally as a `f32`. The larger the number, the more relevant /// This is modelled internally as a `f32`. The larger the number,
/// the document to the search query. /// the more relevant the document to the search
pub type Score = f32; pub type Score = f32;
/// A `SegmentLocalId` identifies a segment. /// A `SegmentLocalId` identifies a segment.
@@ -279,13 +284,15 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::DocSet;
use crate::query::BooleanQuery; use crate::query::BooleanQuery;
use crate::schema::*; use crate::schema::*;
use crate::DocAddress; use crate::DocAddress;
use crate::Index; use crate::Index;
use crate::IndexWriter;
use crate::Postings; use crate::Postings;
use crate::ReloadPolicy; use crate::ReloadPolicy;
use rand::distributions::Bernoulli; use rand::distributions::Bernoulli;
@@ -293,26 +300,17 @@ mod tests {
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng}; use rand::{Rng, SeedableRng};
/// Checks if left and right are close one to each other. pub fn assert_nearly_equals(expected: f32, val: f32) {
/// Panics if the two values are more than 0.5% apart. assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
#[macro_export] }
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{ pub fn nearly_equals(a: f32, b: f32) -> bool {
match (&$left, &$right) { (a - b).abs() < 0.0005 * (a + b).abs()
(left_val, right_val) => { }
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
panic!(r#"assertion failed: `(left ~= right)` left: `{:?}`, right: `{:?}`"#, &*left_val, &*right_val)
}
}
}
}};
}
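For intuition, both versions above implement the same relative tolerance: two scores pass when their absolute difference stays below `0.0005 * (|left| + |right|)`, so larger values are allowed proportionally more absolute drift. A self-contained check:

```rust
// Same predicate as the nearly_equals helper / macro body above.
fn nearly_equals(a: f32, b: f32) -> bool {
    (a - b).abs() < 0.0005 * (a + b).abs()
}

fn main() {
    assert!(nearly_equals(100.0, 100.04)); // diff 0.04 < 0.0005 * 200.04 ~= 0.100
    assert!(!nearly_equals(100.0, 101.0)); // diff 1.00 > 0.0005 * 201.00 ~= 0.1005
}
```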
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> { pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
@@ -350,14 +348,14 @@ mod tests {
#[test] #[test]
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
fn test_indexing() -> crate::Result<()> { fn test_indexing() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap(); let index = Index::create_from_tempdir(schema).unwrap();
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
let doc = doc!(text_field=>"af b"); let doc = doc!(text_field=>"af b");
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -372,30 +370,36 @@ mod tests {
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
Ok(())
} }
#[test] #[test]
fn test_docfreq1() -> crate::Result<()> { fn test_docfreq1() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?; index_writer.commit().unwrap();
} }
{ {
index_writer.add_document(doc!(text_field=>"a")); { let doc = doc!(text_field=>"a"); index_writer.add_document(doc); }
index_writer.add_document(doc!(text_field=>"a a")); { let doc = doc!(text_field=>"a a"); index_writer.add_document(doc); }
index_writer.commit()?; index_writer.commit().unwrap();
} }
{ {
index_writer.add_document(doc!(text_field=>"c")); let doc = doc!(text_field=>"c"); index_writer.add_document(doc);
index_writer.commit()?; index_writer.commit().unwrap();
} }
{ {
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3); assert_eq!(searcher.doc_freq(&term_a), 3);
@@ -406,65 +410,80 @@ mod tests {
let term_d = Term::from_field_text(text_field, "d"); let term_d = Term::from_field_text(text_field, "d");
assert_eq!(searcher.doc_freq(&term_d), 0); assert_eq!(searcher.doc_freq(&term_d), 0);
} }
Ok(())
} }
#[test] #[test]
fn test_fieldnorm_no_docs_with_field() -> crate::Result<()> { fn test_fieldnorm_no_docs_with_field() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let title_field = schema_builder.add_text_field("title", TEXT); let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?;
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
{ {
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field)?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); { let doc = doc!(text_field=>"a b c"); index_writer.add_document(doc); }
} index_writer.commit().unwrap();
{ }
let fieldnorm_reader = reader.get_fieldnorms_reader(title_field)?; {
assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0); let index_reader = index.reader().unwrap();
} let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
{ let fieldnorm_reader = reader.get_fieldnorms_reader(text_field); assert_eq!(fieldnorm_reader.fieldnorm(0), 3); }
{ let fieldnorm_reader = reader.get_fieldnorms_reader(title_field); assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0); }
}
Ok(())
} }
#[test] #[test]
fn test_fieldnorm() -> crate::Result<()> { fn test_fieldnorm() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?; {
index_writer.add_document(doc!(text_field=>"a b c")); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!()); { let doc = doc!(text_field=>"a b c"); index_writer.add_document(doc); }
index_writer.add_document(doc!(text_field=>"a b")); { let doc = doc!(); index_writer.add_document(doc); }
index_writer.commit()?; { let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); }
let reader = index.reader()?; index_writer.commit().unwrap();
let searcher = reader.searcher(); }
let segment_reader: &SegmentReader = searcher.segment_reader(0); {
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?; let reader = index.reader().unwrap();
assert_eq!(fieldnorms_reader.fieldnorm(0), 3); let searcher = reader.searcher();
assert_eq!(fieldnorms_reader.fieldnorm(1), 0); let segment_reader: &SegmentReader = searcher.segment_reader(0);
assert_eq!(fieldnorms_reader.fieldnorm(2), 2); let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
Ok(()) assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
}
} }
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool { fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
let mut doc = docset.advance(); while docset.advance() {
while doc != TERMINATED { if !reader.is_deleted(docset.doc()) {
if !reader.is_deleted(doc) { return true;
return true; }
} }
doc = docset.advance();
}
false false
} }
#[test] #[test]
fn test_delete_postings1() -> crate::Result<()> { fn test_delete_postings1() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let term_abcd = Term::from_field_text(text_field, "abcd"); let term_abcd = Term::from_field_text(text_field, "abcd");
@@ -480,7 +499,7 @@ mod tests {
.unwrap(); .unwrap();
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
// 1 // 1
@@ -496,10 +515,10 @@ mod tests {
index_writer.add_document(doc!(text_field=>" b c")); index_writer.add_document(doc!(text_field=>" b c"));
// 5 // 5
index_writer.add_document(doc!(text_field=>" a")); index_writer.add_document(doc!(text_field=>" a"));
index_writer.commit()?; index_writer.commit().unwrap();
} }
{ {
reader.reload()?; reader.reload().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(text_field); let inverted_index = segment_reader.inverted_index(text_field);
@@ -527,15 +546,15 @@ mod tests {
} }
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
// 1 // 1
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?; index_writer.rollback().unwrap();
} }
{ {
reader.reload()?; reader.reload().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0); let seg_reader = searcher.segment_reader(0);
let inverted_index = seg_reader.inverted_index(term_abcd.field()); let inverted_index = seg_reader.inverted_index(term_abcd.field());
@@ -564,15 +583,15 @@ mod tests {
} }
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?; index_writer.rollback().unwrap();
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.commit()?; index_writer.commit().unwrap();
} }
{ {
reader.reload()?; reader.reload().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(term_abcd.field()); let inverted_index = segment_reader.inverted_index(term_abcd.field());
@@ -604,20 +623,19 @@ mod tests {
assert!(!advance_undeleted(&mut postings, segment_reader)); assert!(!advance_undeleted(&mut postings, segment_reader));
} }
} }
Ok(())
} }
#[test] #[test]
fn test_indexed_u64() -> crate::Result<()> { fn test_indexed_u64() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("value", INDEXED); let field = schema_builder.add_u64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(field=>1u64)); index_writer.add_document(doc!(field=>1u64));
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let term = Term::from_field_u64(field, 1u64); let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher let mut postings = searcher
@@ -625,23 +643,23 @@ mod tests {
.inverted_index(term.field()) .inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic) .read_postings(&term, IndexRecordOption::Basic)
.unwrap(); .unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0); assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED); assert!(!postings.advance());
Ok(())
} }
#[test] #[test]
fn test_indexed_i64() -> crate::Result<()> { fn test_indexed_i64() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_i64_field("value", INDEXED); let value_field = schema_builder.add_i64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let negative_val = -1i64; let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val)); index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let term = Term::from_field_i64(value_field, negative_val); let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher let mut postings = searcher
@@ -649,23 +667,23 @@ mod tests {
.inverted_index(term.field()) .inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic) .read_postings(&term, IndexRecordOption::Basic)
.unwrap(); .unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0); assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED); assert!(!postings.advance());
Ok(())
} }
#[test] #[test]
fn test_indexed_f64() -> crate::Result<()> { fn test_indexed_f64() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_f64_field("value", INDEXED); let value_field = schema_builder.add_f64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let val = std::f64::consts::PI; let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val)); index_writer.add_document(doc!(value_field => val));
index_writer.commit()?; index_writer.commit().unwrap();
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let term = Term::from_field_f64(value_field, val); let term = Term::from_field_f64(value_field, val);
let mut postings = searcher let mut postings = searcher
@@ -673,31 +691,29 @@ mod tests {
.inverted_index(term.field()) .inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic) .read_postings(&term, IndexRecordOption::Basic)
.unwrap(); .unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0); assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED); assert!(!postings.advance());
Ok(())
} }
#[test] #[test]
fn test_indexedfield_not_in_documents() -> crate::Result<()> { fn test_indexedfield_not_in_documents() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("text", TEXT); let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(absent_field); //< should not panic segment_reader.inverted_index(absent_field); //< should not panic
assert_eq!(inverted_index.terms().num_terms(), 0);
Ok(())
} }
#[test] #[test]
fn test_delete_postings2() -> crate::Result<()> { fn test_delete_postings2() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -705,40 +721,55 @@ mod tests {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"63")); let add_document = |index_writer: &mut IndexWriter, val: &'static str| { let doc = doc!(text_field=>val); index_writer.add_document(doc); };
index_writer.add_document(doc!(text_field=>"70")); let remove_document = |index_writer: &mut IndexWriter, val: &'static str| { let delterm = Term::from_field_text(text_field, val); index_writer.delete_term(delterm); };
index_writer.add_document(doc!(text_field=>"34")); add_document(&mut index_writer, "63");
index_writer.add_document(doc!(text_field=>"1")); add_document(&mut index_writer, "70");
index_writer.add_document(doc!(text_field=>"38")); add_document(&mut index_writer, "34");
index_writer.add_document(doc!(text_field=>"33")); add_document(&mut index_writer, "1");
index_writer.add_document(doc!(text_field=>"40")); add_document(&mut index_writer, "38");
index_writer.add_document(doc!(text_field=>"17")); add_document(&mut index_writer, "33");
index_writer.delete_term(Term::from_field_text(text_field, "38")); add_document(&mut index_writer, "40");
index_writer.delete_term(Term::from_field_text(text_field, "34")); add_document(&mut index_writer, "17");
index_writer.commit()?; remove_document(&mut index_writer, "38");
reader.reload()?; remove_document(&mut index_writer, "34");
assert_eq!(reader.searcher().num_docs(), 6); index_writer.commit().unwrap();
Ok(()) reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 6);
} }
#[test] #[test]
fn test_termfreq() -> crate::Result<()> { fn test_termfreq() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af af af bc bc")); { let doc = doc!(text_field=>"af af af bc bc"); index_writer.add_document(doc); }
index_writer.commit()?; index_writer.commit().unwrap();
} }
{ {
let index_reader = index.reader()?; let index_reader = index.reader().unwrap();
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field); let inverted_index = reader.inverted_index(text_field);
@@ -750,67 +781,68 @@ mod tests {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
.unwrap(); .unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0); assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 3); assert_eq!(postings.term_freq(), 3);
assert_eq!(postings.advance(), TERMINATED); assert!(!postings.advance());
} }
Ok(())
} }
#[test] #[test]
fn test_searcher_1() -> crate::Result<()> { fn test_searcher_1() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index.reader()?; let reader = index.reader().unwrap();
// writing the segment {
let mut index_writer = index.writer_for_tests()?; // writing the segment
index_writer.add_document(doc!(text_field=>"af af af b")); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c d")); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?; index_writer.add_document(doc!(text_field=>"a b c d"));
reader.reload()?; index_writer.commit().unwrap();
let searcher = reader.searcher(); }
let get_doc_ids = |terms: Vec<Term>| { {
let query = BooleanQuery::new_multiterms_query(terms); reader.reload().unwrap();
searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).map(|topdocs| topdocs.docs().to_vec()) let searcher = reader.searcher();
}; let get_doc_ids = |terms: Vec<Term>| {
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")])?, vec![DocAddress(0, 1), DocAddress(0, 2)]); let query = BooleanQuery::new_multiterms_query(terms);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")])?, vec![DocAddress(0, 0)]); let topdocs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap();
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")])?, vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]); topdocs.docs().to_vec()
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")])?, vec![DocAddress(0, 1), DocAddress(0, 2)]); };
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")])?, vec![DocAddress(0, 2)]); assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]), vec![DocAddress(0, 1), DocAddress(0, 2)]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"), Term::from_field_text(text_field, "a")])?, vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]); assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]), vec![DocAddress(0, 0)]);
Ok(()) assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]), vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]), vec![DocAddress(0, 1), DocAddress(0, 2)]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]), vec![DocAddress(0, 2)]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"), Term::from_field_text(text_field, "a")]), vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]);
}
} }
#[test] #[test]
fn test_searcher_2() -> crate::Result<()> { fn test_searcher_2() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -818,17 +850,19 @@ mod tests {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into()?; .try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0u64); assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment {
let mut index_writer = index.writer_for_tests()?; // writing the segment
index_writer.add_document(doc!(text_field=>"af b")); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c d")); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?; index_writer.add_document(doc!(text_field=>"a b c d"));
reader.reload()?; index_writer.commit().unwrap();
}
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 3u64); assert_eq!(reader.searcher().num_docs(), 3u64);
Ok(())
} }
#[test] #[test]
@@ -850,7 +884,7 @@ mod tests {
} }
#[test] #[test]
fn test_wrong_fast_field_type() -> crate::Result<()> { fn test_wrong_fast_field_type() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST); let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST); let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
@@ -860,14 +894,14 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
{ {
let document = let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64); doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document); index_writer.add_document(document);
index_writer.commit()?; index_writer.commit().unwrap();
} }
let reader = index.reader()?; let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0); let segment_reader: &SegmentReader = searcher.segment_reader(0);
{ {
@@ -906,12 +940,11 @@ mod tests {
let fast_field_reader = fast_field_reader_opt.unwrap(); let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4f64) assert_eq!(fast_field_reader.get(0), 4f64)
} }
Ok(())
} }
// motivated by #729 // motivated by #729
#[test] #[test]
fn test_update_via_delete_insert() -> crate::Result<()> { fn test_update_via_delete_insert() {
use crate::collector::Count; use crate::collector::Count;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::AllQuery; use crate::query::AllQuery;
@@ -925,17 +958,17 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let index_reader = index.reader()?; let index_reader = index.reader().unwrap();
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
index_writer.add_document(doc!(id => doc_id)); index_writer.add_document(doc!(id => doc_id));
} }
index_writer.commit()?; index_writer.commit().unwrap();
index_reader.reload()?; index_reader.reload().unwrap();
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
assert_eq!( assert_eq!(
@@ -946,11 +979,12 @@ mod tests {
// update the 10 elements by deleting and re-adding // update the 10 elements by deleting and re-adding
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
index_writer.delete_term(Term::from_field_u64(id, doc_id)); index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit()?; index_writer.commit().unwrap();
index_reader.reload()?; index_reader.reload().unwrap();
index_writer.add_document(doc!(id => doc_id)); let doc = doc!(id => doc_id); index_writer.add_document(doc);
index_writer.commit()?; index_writer.commit().unwrap();
index_reader.reload()?; index_reader.reload().unwrap();
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
// The number of document should be stable. // The number of document should be stable.
assert_eq!( assert_eq!(
@@ -959,7 +993,7 @@ mod tests {
); );
} }
index_reader.reload()?; index_reader.reload().unwrap();
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
let segment_ids: Vec<SegmentId> = searcher let segment_ids: Vec<SegmentId> = searcher
.segment_readers() .segment_readers()
@@ -968,18 +1002,12 @@ mod tests {
.collect(); .collect();
block_on(index_writer.merge(&segment_ids)).unwrap(); block_on(index_writer.merge(&segment_ids)).unwrap();
index_reader.reload()?; index_reader.reload().unwrap();
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
assert_eq!(searcher.search(&AllQuery, &Count)?, DOC_COUNT as usize); assert_eq!(searcher.search(&AllQuery, &Count).unwrap(), DOC_COUNT as usize);
Ok(())
} }
#[test]
fn test_validate_checksum() -> crate::Result<()> {
let index_path = tempfile::tempdir().expect("dir");
let schema = Schema::builder().build();
let index = Index::create_in_dir(&index_path, schema)?;
assert!(index.validate_checksum()?.is_empty());
Ok(())
}
} }

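Aside: `test_update_via_delete_insert` above (motivated by #729) exercises the only update idiom available here: segments are immutable, so an update is a `delete_term` followed by re-adding the document and committing. A standalone sketch of that idiom using the 0.12-era writer API shown on the right-hand side (exact imports may differ between versions):

```rust
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, Term};

fn main() {
    let mut schema_builder = Schema::builder();
    let id = schema_builder.add_u64_field("id", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(3_000_000).unwrap();

    index_writer.add_document(doc!(id => 0u64));
    index_writer.commit().unwrap();

    // "Update" = delete every document carrying this id, then re-add it.
    index_writer.delete_term(Term::from_field_u64(id, 0u64));
    index_writer.add_document(doc!(id => 0u64));
    index_writer.commit().unwrap();
}
```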
View File

@@ -37,9 +37,9 @@ const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) a
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {
use super::PositionSerializer; use super::{PositionReader, PositionSerializer};
use crate::directory::ReadOnlySource; use crate::directory::ReadOnlySource;
use crate::positions::reader::PositionReader; use crate::positions::COMPRESSION_BLOCK_SIZE;
use std::iter; use std::iter;
fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) { fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) {
@@ -68,7 +68,7 @@ pub mod tests {
let mut position_reader = PositionReader::new(stream, skip, 0u64); let mut position_reader = PositionReader::new(stream, skip, 0u64);
for &n in &[1, 10, 127, 128, 130, 312] { for &n in &[1, 10, 127, 128, 130, 312] {
let mut v = vec![0u32; n]; let mut v = vec![0u32; n];
position_reader.read(0, &mut v[..]); position_reader.read(&mut v[..n]);
for i in 0..n { for i in 0..n {
assert_eq!(v[i], i as u32); assert_eq!(v[i], i as u32);
} }
@@ -76,19 +76,19 @@ pub mod tests {
} }
#[test] #[test]
fn test_position_read_with_offset() { fn test_position_skip() {
let v: Vec<u32> = (0..1000).collect(); let v: Vec<u32> = (0..1_000).collect();
let (stream, skip) = create_stream_buffer(&v[..]); let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 12); assert_eq!(skip.len(), 12);
assert_eq!(stream.len(), 1168); assert_eq!(stream.len(), 1168);
let mut position_reader = PositionReader::new(stream, skip, 0u64); let mut position_reader = PositionReader::new(stream, skip, 0u64);
for &offset in &[1u64, 10u64, 127u64, 128u64, 130u64, 312u64] { position_reader.skip(10);
for &len in &[1, 10, 130, 500] { for &n in &[10, 127, COMPRESSION_BLOCK_SIZE, 130, 312] {
let mut v = vec![0u32; len]; let mut v = vec![0u32; n];
position_reader.read(offset, &mut v[..]); position_reader.read(&mut v[..n]);
for i in 0..len { for i in 0..n {
assert_eq!(v[i], i as u32 + offset as u32); assert_eq!(v[i], 10u32 + i as u32);
}
} }
} }
} }
@@ -103,12 +103,11 @@ pub mod tests {
let mut position_reader = PositionReader::new(stream, skip, 0u64); let mut position_reader = PositionReader::new(stream, skip, 0u64);
let mut buf = [0u32; 7]; let mut buf = [0u32; 7];
let mut c = 0; let mut c = 0;
let mut offset = 0;
for _ in 0..100 { for _ in 0..100 {
position_reader.read(offset, &mut buf); position_reader.read(&mut buf);
position_reader.read(offset, &mut buf); position_reader.read(&mut buf);
offset += 7; position_reader.skip(4);
position_reader.skip(3);
for &el in &buf { for &el in &buf {
assert_eq!(c, el); assert_eq!(c, el);
c += 1; c += 1;
@@ -116,58 +115,6 @@ pub mod tests {
} }
} }
#[test]
fn test_position_reread_anchor_different_than_block() {
let v: Vec<u32> = (0..2_000_000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 4_987_872);
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0);
let mut buf = [0u32; 256];
position_reader.read(128, &mut buf);
for i in 0..256 {
assert_eq!(buf[i], (128 + i) as u32);
}
position_reader.read(128, &mut buf);
for i in 0..256 {
assert_eq!(buf[i], (128 + i) as u32);
}
}
#[test]
#[should_panic(expected = "offset arguments should be increasing.")]
fn test_position_panic_if_called_previous_anchor() {
let v: Vec<u32> = (0..2_000_000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 4_987_872);
let mut buf = [0u32; 1];
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 200_000);
position_reader.read(230, &mut buf);
position_reader.read(9, &mut buf);
}
#[test]
fn test_positions_bug() {
let mut v: Vec<u32> = vec![];
for i in 1..200 {
for j in 0..i {
v.push(j);
}
}
let (stream, skip) = create_stream_buffer(&v[..]);
let mut buf = Vec::new();
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0);
let mut offset = 0;
for i in 1..24 {
buf.resize(i, 0);
position_reader.read(offset, &mut buf[..]);
offset += i as u64;
let r: Vec<u32> = (0..i).map(|el| el as u32).collect();
assert_eq!(buf, &r[..]);
}
}
#[test] #[test]
fn test_position_long_skip_const() { fn test_position_long_skip_const() {
const CONST_VAL: u32 = 9u32; const CONST_VAL: u32 = 9u32;
@@ -177,7 +124,7 @@ pub mod tests {
assert_eq!(stream.len(), 1_000_000); assert_eq!(stream.len(), 1_000_000);
let mut position_reader = PositionReader::new(stream, skip, 128 * 1024); let mut position_reader = PositionReader::new(stream, skip, 128 * 1024);
let mut buf = [0u32; 1]; let mut buf = [0u32; 1];
position_reader.read(0, &mut buf); position_reader.read(&mut buf);
assert_eq!(buf[0], CONST_VAL); assert_eq!(buf[0], CONST_VAL);
} }
@@ -196,7 +143,7 @@ pub mod tests {
] { ] {
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset); let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset);
let mut buf = [0u32; 1]; let mut buf = [0u32; 1];
position_reader.read(0, &mut buf); position_reader.read(&mut buf);
assert_eq!(buf[0], offset as u32); assert_eq!(buf[0], offset as u32);
} }
} }

View File

@@ -3,6 +3,7 @@ use crate::directory::ReadOnlySource;
use crate::positions::COMPRESSION_BLOCK_SIZE; use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::positions::LONG_SKIP_INTERVAL; use crate::positions::LONG_SKIP_INTERVAL;
use crate::positions::LONG_SKIP_IN_BLOCKS; use crate::positions::LONG_SKIP_IN_BLOCKS;
use crate::postings::compression::compressed_block_size;
/// Positions works as a long sequence of compressed block. /// Positions works as a long sequence of compressed block.
/// All terms are chained one after the other. /// All terms are chained one after the other.
/// ///
@@ -61,34 +62,74 @@ impl Positions {
fn reader(&self, offset: u64) -> PositionReader { fn reader(&self, offset: u64) -> PositionReader {
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize; let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let small_skip = (offset % LONG_SKIP_INTERVAL) as usize;
let offset_num_bytes: u64 = self.long_skip(long_skip_id); let offset_num_bytes: u64 = self.long_skip(long_skip_id);
let mut position_read = OwnedRead::new(self.position_source.clone()); let mut position_read = OwnedRead::new(self.position_source.clone());
position_read.advance(offset_num_bytes as usize); position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(self.skip_source.clone()); let mut skip_read = OwnedRead::new(self.skip_source.clone());
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS); skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
PositionReader { let mut position_reader = PositionReader {
bit_packer: self.bit_packer, bit_packer: self.bit_packer,
skip_read, skip_read,
position_read, position_read,
buffer: Box::new([0u32; 128]), inner_offset: 0,
block_offset: std::i64::MAX as u64, buffer: Box::new([0u32; 128]),
anchor_offset: (long_skip_id as u64) * LONG_SKIP_INTERVAL, ahead: None,
abs_offset: offset, };
} position_reader.skip(small_skip);
} position_reader
}
#[derive(Clone)]
pub struct PositionReader { pub struct PositionReader {
skip_read: OwnedRead, skip_read: OwnedRead,
position_read: OwnedRead, position_read: OwnedRead,
bit_packer: BitPacker4x, bit_packer: BitPacker4x,
buffer: Box<[u32; COMPRESSION_BLOCK_SIZE]>, inner_offset: usize,
block_offset: u64, buffer: Box<[u32; 128]>,
anchor_offset: u64, ahead: Option<usize>, // if None, no block is loaded.
abs_offset: u64, // if Some(num_blocks), the block currently loaded is num_blocks ahead
} // of the block of the next int to read.
}
// `ahead` represents the offset of the block currently loaded
// compared to the cursor of the actual stream.
//
// By contract, when this function is called, the current block has to be
// decompressed.
//
// If the requested number of els ends exactly at a given block, the next
// block is not decompressed.
fn read_impl(
bit_packer: BitPacker4x,
mut position: &[u8],
buffer: &mut [u32; 128],
mut inner_offset: usize,
num_bits: &[u8],
output: &mut [u32],
) -> usize {
let mut output_start = 0;
let mut output_len = output.len();
let mut ahead = 0;
loop {
let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
// We have enough elements in the current block.
// Let's copy the requested elements in the output buffer,
// and return.
if output_len <= available_len {
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
return ahead;
}
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
bit_packer.decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
}
} }
impl PositionReader { impl PositionReader {
@@ -100,65 +141,57 @@ impl PositionReader {
Positions::new(position_source, skip_source).reader(offset) Positions::new(position_source, skip_source).reader(offset)
} }
fn advance_num_blocks(&mut self, num_blocks: usize) {
let num_bits: usize = self.skip_read.as_ref()[..num_blocks]
.iter()
.cloned()
.map(|num_bits| num_bits as usize)
.sum();
let num_bytes_to_skip = num_bits * COMPRESSION_BLOCK_SIZE / 8;
self.skip_read.advance(num_blocks as usize);
self.position_read.advance(num_bytes_to_skip);
}
/// Fills a buffer with the positions `[offset..offset+output.len())` integers.
///
/// `offset` is required to have a value >= to the offsets given in previous calls
/// for the given `PositionReaderAbsolute` instance.
pub fn read(&mut self, mut offset: u64, mut output: &mut [u32]) {
offset += self.abs_offset;
assert!(
offset >= self.anchor_offset,
"offset arguments should be increasing."
);
let delta_to_block_offset = offset as i64 - self.block_offset as i64;
if delta_to_block_offset < 0 || delta_to_block_offset >= 128 {
// The first position is not within the first block.
// We need to decompress the first block.
let delta_to_anchor_offset = offset - self.anchor_offset;
let num_blocks_to_skip =
(delta_to_anchor_offset / (COMPRESSION_BLOCK_SIZE as u64)) as usize;
self.advance_num_blocks(num_blocks_to_skip);
self.anchor_offset = offset - (offset % COMPRESSION_BLOCK_SIZE as u64);
self.block_offset = self.anchor_offset;
let num_bits = self.skip_read.get(0);
self.bit_packer
.decompress(self.position_read.as_ref(), self.buffer.as_mut(), num_bits);
} else {
let num_blocks_to_skip =
((self.block_offset - self.anchor_offset) / COMPRESSION_BLOCK_SIZE as u64) as usize;
self.advance_num_blocks(num_blocks_to_skip);
self.anchor_offset = self.block_offset;
}
let mut num_bits = self.skip_read.get(0);
let mut position_data = self.position_read.as_ref();
for i in 1.. {
let offset_in_block = (offset as usize) % COMPRESSION_BLOCK_SIZE;
let remaining_in_block = COMPRESSION_BLOCK_SIZE - offset_in_block;
if remaining_in_block >= output.len() {
output.copy_from_slice(&self.buffer[offset_in_block..][..output.len()]);
break;
}
output[..remaining_in_block].copy_from_slice(&self.buffer[offset_in_block..]);
output = &mut output[remaining_in_block..];
offset += remaining_in_block as u64;
position_data = &position_data[(num_bits as usize * COMPRESSION_BLOCK_SIZE / 8)..];
num_bits = self.skip_read.get(i);
self.bit_packer
.decompress(position_data, self.buffer.as_mut(), num_bits);
self.block_offset += COMPRESSION_BLOCK_SIZE as u64;
}
}
/// Fills a buffer with the next `output.len()` integers.
/// This does not consume / advance the stream.
pub fn read(&mut self, output: &mut [u32]) {
let skip_data = self.skip_read.as_ref();
let position_data = self.position_read.as_ref();
let num_bits = self.skip_read.get(0);
if self.ahead != Some(0) {
// the block currently available is not the block
// for the current position
self.bit_packer
.decompress(position_data, self.buffer.as_mut(), num_bits);
self.ahead = Some(0);
}
let block_len = compressed_block_size(num_bits);
self.ahead = Some(read_impl(
self.bit_packer,
&position_data[block_len..],
self.buffer.as_mut(),
self.inner_offset,
&skip_data[1..],
output,
));
}
/// Skip the next `skip_len` integers.
///
/// If a full block is skipped, calling
/// `.skip(...)` will avoid decompressing it.
///
/// May panic if the end of the stream is reached.
pub fn skip(&mut self, skip_len: usize) {
let skip_len_plus_inner_offset = skip_len + self.inner_offset;
let num_blocks_to_advance = skip_len_plus_inner_offset / COMPRESSION_BLOCK_SIZE;
self.inner_offset = skip_len_plus_inner_offset % COMPRESSION_BLOCK_SIZE;
self.ahead = self.ahead.and_then(|num_blocks| {
if num_blocks >= num_blocks_to_advance {
Some(num_blocks - num_blocks_to_advance)
} else {
None
}
});
let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
.iter()
.map(|num_bits| *num_bits as usize)
.sum::<usize>()
* COMPRESSION_BLOCK_SIZE;
let skip_len_in_bytes = skip_len_in_bits / 8;
self.skip_read.advance(num_blocks_to_advance);
self.position_read.advance(skip_len_in_bytes);
}
} }
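To make the right-hand `read`/`skip` bookkeeping above concrete: the stream is a sequence of fixed-size compressed blocks, `inner_offset` points inside the current block, `read` copies values out without consuming anything, and `skip` is the only operation that advances. A toy model with plain arrays standing in for bit-packed blocks (`BLOCK = 4` instead of `COMPRESSION_BLOCK_SIZE = 128`):

```rust
const BLOCK: usize = 4; // stands in for COMPRESSION_BLOCK_SIZE = 128

struct ToyReader {
    blocks: Vec<[u32; BLOCK]>, // stands in for the bit-packed stream
    cursor: usize,             // index of the current block
    inner_offset: usize,       // position inside the current block
}

impl ToyReader {
    /// Copies the next `output.len()` values without consuming the stream,
    /// mirroring PositionReader::read.
    fn read(&self, output: &mut [u32]) {
        let (mut block, mut inner) = (self.cursor, self.inner_offset);
        for slot in output.iter_mut() {
            *slot = self.blocks[block][inner];
            inner += 1;
            if inner == BLOCK {
                inner = 0;
                block += 1; // "decompress" the next block
            }
        }
    }

    /// Advances the stream, mirroring PositionReader::skip: whole blocks
    /// are jumped over without ever being touched.
    fn skip(&mut self, skip_len: usize) {
        let total = self.inner_offset + skip_len;
        self.cursor += total / BLOCK;
        self.inner_offset = total % BLOCK;
    }
}

fn main() {
    let blocks: Vec<[u32; BLOCK]> = vec![[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]];
    let mut reader = ToyReader { blocks, cursor: 0, inner_offset: 0 };
    reader.skip(5); // lands on value 5, one block in
    let mut out = [0u32; 4];
    reader.read(&mut out);
    assert_eq!(out, [5, 6, 7, 8]); // the read crosses a block boundary
}
```

The real reader adds the `ahead` cache on top of this, so a block already decompressed by a previous `read` is not decompressed again.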

View File

@@ -87,7 +87,6 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
(begin, end) (begin, end)
} }
#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize { fn galloping(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(&block_docs, target); let (start, end) = exponential_search(&block_docs, target);
start + linear_search(&block_docs[start..end], target) start + linear_search(&block_docs[start..end], target)
@@ -107,7 +106,7 @@ impl BlockSearcher {
/// the target. /// the target.
/// ///
/// The results should be equivalent to /// The results should be equivalent to
/// ```compile_fail /// ```ignore
/// block[..] /// block[..]
// .iter() // .iter()
// .take_while(|&&val| val < target) // .take_while(|&&val| val < target)
@@ -130,18 +129,23 @@ impl BlockSearcher {
/// ///
/// If SSE2 instructions are available in the `(platform, running CPU)`, /// If SSE2 instructions are available in the `(platform, running CPU)`,
/// then we use a different implementation that does an exhaustive linear search over /// then we use a different implementation that does an exhaustive linear search over
/// the block regardless of whether the block is full or not. /// the full block whenever the block is full (`len == 128`).
/// /// It is surprisingly faster, most likely because of the lack of branch.
/// Indeed, if the block is not full, the remaining items are TERMINATED.
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize { pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, len: usize, start: usize, target: u32) -> usize {
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
{ {
if self == BlockSearcher::SSE2 { use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
return sse2::linear_search_sse2_128(block_docs, target); return sse2::linear_search_sse2_128(block_docs, target);
} }
} }
galloping(&block_docs.0[..], target) start + galloping(&block_docs.0[start..len], target)
} }
} }
@@ -162,7 +166,6 @@ mod tests {
use super::exponential_search; use super::exponential_search;
use super::linear_search; use super::linear_search;
use super::BlockSearcher; use super::BlockSearcher;
use crate::docset::TERMINATED;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE}; use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test] #[test]
@@ -193,12 +196,19 @@ mod tests {
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) { fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target); let cursor = search_in_block_trivial_but_slow(block, target);
assert!(block.len() < COMPRESSION_BLOCK_SIZE); assert!(block.len() < COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE]; let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block); output_buffer[..block.len()].copy_from_slice(block);
assert_eq!( for i in 0..cursor {
block_searcher.search_in_block(&AlignedBuffer(output_buffer), target), assert_eq!(
cursor block_searcher.search_in_block(&AlignedBuffer(output_buffer), block.len(), i, target),
); cursor
);
}
} }
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) { fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {

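Aside on the `galloping` helper above: exponential search brackets the target in a window `[end / 2, end)`, then a linear scan counts the values still strictly below the target, matching the `take_while` contract quoted in the doc comment. A self-contained sketch (the crate splits this into `exponential_search` plus `linear_search`):

```rust
// Returns the number of values in the sorted block that are < target.
fn galloping(block: &[u32], target: u32) -> usize {
    // Grow the window exponentially until block[end] >= target (or we run out).
    let mut end = 1;
    while end < block.len() && block[end] < target {
        end *= 2;
    }
    let begin = end / 2;
    let end = end.min(block.len());
    // Finish with a linear scan inside the bracketed window.
    begin + block[begin..end].iter().take_while(|&&v| v < target).count()
}

fn main() {
    let block: Vec<u32> = (0..128).map(|i| i * 3).collect();
    assert_eq!(galloping(&block, 10), 4); // 0, 3, 6, 9 are below 10
    assert_eq!(galloping(&block, 0), 0);
}
```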
View File

@@ -1,527 +0,0 @@
use crate::common::{BinarySerializable, VInt};
use crate::directory::ReadOnlySource;
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
};
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
use crate::query::BM25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
if let Some(first) = it.next() {
Some(it.fold(first, Score::max))
} else {
None
}
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
#[derive(Clone)]
pub struct BlockSegmentPostings {
pub(crate) doc_decoder: BlockDecoder,
loaded_offset: usize,
freq_decoder: BlockDecoder,
freq_reading_option: FreqReadingOption,
block_max_score_cache: Option<Score>,
doc_freq: u32,
data: ReadOnlySource,
pub(crate) skip_reader: SkipReader,
}
fn decode_bitpacked_block(
doc_decoder: &mut BlockDecoder,
freq_decoder_opt: Option<&mut BlockDecoder>,
data: &[u8],
doc_offset: DocId,
doc_num_bits: u8,
tf_num_bits: u8,
) {
let num_consumed_bytes = doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits);
if let Some(freq_decoder) = freq_decoder_opt {
freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits);
}
}
fn decode_vint_block(
doc_decoder: &mut BlockDecoder,
freq_decoder_opt: Option<&mut BlockDecoder>,
data: &[u8],
doc_offset: DocId,
num_vint_docs: usize,
) {
let num_consumed_bytes =
doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs, TERMINATED);
if let Some(freq_decoder) = freq_decoder_opt {
freq_decoder.uncompress_vint_unsorted(
&data[num_consumed_bytes..],
num_vint_docs,
TERMINATED,
);
}
}
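Both decoders above recover sorted doc ids stored as deltas against `doc_offset`, the last doc of the previous block: full blocks are bit-packed, the final partial block is VInt-encoded. A toy delta decoder, glossing over the exact delta convention the codec uses:

```rust
// Rebuilds absolute doc ids from per-block deltas and the previous block's
// last doc id (a simplified model of uncompress_block_sorted).
fn decode_sorted(deltas: &[u32], doc_offset: u32) -> Vec<u32> {
    let mut prev = doc_offset;
    deltas
        .iter()
        .map(|&delta| {
            prev += delta;
            prev
        })
        .collect()
}

fn main() {
    // Deltas 3, 1, 5 after a previous block ending at doc 100:
    assert_eq!(decode_sorted(&[3, 1, 5], 100), vec![103, 104, 109]);
}
```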
fn split_into_skips_and_postings(
doc_freq: u32,
data: ReadOnlySource,
) -> (Option<ReadOnlySource>, ReadOnlySource) {
if doc_freq < COMPRESSION_BLOCK_SIZE as u32 {
return (None, data);
}
let mut data_byte_arr = data.as_slice();
let skip_len = VInt::deserialize(&mut data_byte_arr)
.expect("Data corrupted")
.0 as usize;
let vint_len = data.len() - data_byte_arr.len();
let (skip_data, postings_data) = data.slice_from(vint_len).split(skip_len);
(Some(skip_data), postings_data)
}
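The layout `split_into_skips_and_postings` takes apart is a VInt length prefix, then `skip_len` bytes of skip data, then the postings data. A toy parser over a plain byte slice, with a one-byte stand-in for `VInt::deserialize`:

```rust
// [len][skip data: len bytes][postings data], with a single-byte length
// prefix standing in for the real variable-length VInt.
fn split(data: &[u8]) -> (&[u8], &[u8]) {
    let skip_len = data[0] as usize;
    let rest = &data[1..];
    (&rest[..skip_len], &rest[skip_len..])
}

fn main() {
    let data = [3u8, 0xAA, 0xBB, 0xCC, 1, 2, 3, 4];
    let (skip, postings) = split(&data);
    assert_eq!(skip, &[0xAA, 0xBB, 0xCC]);
    assert_eq!(postings, &[1, 2, 3, 4]);
}
```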
impl BlockSegmentPostings {
pub(crate) fn from_data(
doc_freq: u32,
data: ReadOnlySource,
record_option: IndexRecordOption,
requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
let freq_reading_option = match (record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq,
};
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option),
None => SkipReader::new(ReadOnlySource::empty(), doc_freq, record_option),
};
let mut block_segment_postings = BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
loaded_offset: std::usize::MAX,
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option,
block_max_score_cache: None,
doc_freq,
data: postings_data,
skip_reader,
};
block_segment_postings.load_block();
block_segment_postings
}
/// Returns the block_max_score for the current block.
/// It does not require the block to be loaded. For instance, it is ok to call this method
/// after having called `.shallow_advance(..)`.
///
/// See `TermScorer::block_max_score(..)` for more information.
pub fn block_max_score(
&mut self,
fieldnorm_reader: &FieldNormReader,
bm25_weight: &BM25Weight,
) -> Score {
if let Some(score) = self.block_max_score_cache {
return score;
}
if let Some(skip_reader_max_score) = self.skip_reader.block_max_score(bm25_weight) {
// if we are on a full block, the skip reader should have the block max information
// for us
self.block_max_score_cache = Some(skip_reader_max_score);
return skip_reader_max_score;
}
// this is the last block of the segment posting list.
// If it is actually loaded, we can compute block max manually.
if self.block_is_loaded() {
let docs = self.doc_decoder.output_array().iter().cloned();
let freqs = self.freq_decoder.output_array().iter().cloned();
let bm25_scores = docs.zip(freqs).map(|(doc, term_freq)| {
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
bm25_weight.score(fieldnorm_id, term_freq)
});
let block_max_score = max_score(bm25_scores).unwrap_or(0.0);
self.block_max_score_cache = Some(block_max_score);
return block_max_score;
}
// We do not have access to any good block max value. We return bm25_weight.max_score()
// as it is a valid upper bound.
//
// We do not cache it however, so that it gets computed once the block is loaded.
bm25_weight.max_score()
}
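Stripped of tantivy's types, the last-block fallback above is a plain maximum of per-document scores. The closure below is a fake, monotone-in-frequency stand-in for `BM25Weight::score`:

```rust
// Block max = max score over the (doc, term_freq) pairs of the loaded block.
fn block_max(docs: &[u32], freqs: &[u32], score: impl Fn(u32, u32) -> f32) -> f32 {
    docs.iter()
        .zip(freqs.iter())
        .map(|(&doc, &term_freq)| score(doc, term_freq))
        .fold(0.0, f32::max)
}

fn main() {
    let docs: [u32; 3] = [10, 11, 17];
    let freqs: [u32; 3] = [1, 4, 2];
    let score = |_doc: u32, term_freq: u32| (1.0 + term_freq as f32).ln();
    // The doc with term_freq = 4 dominates the block.
    assert!((block_max(&docs, &freqs, score) - 5.0f32.ln()).abs() < 1e-6);
}
```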
pub(crate) fn freq_reading_option(&self) -> FreqReadingOption {
self.freq_reading_option
}
// Resets the block segment postings on another position
// in the postings file.
//
// This is useful for enumerating through a list of terms,
// and consuming the associated posting lists while avoiding
// reallocating a `BlockSegmentPostings`.
//
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: ReadOnlySource) {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
self.data = ReadOnlySource::new(postings_data);
self.block_max_score_cache = None;
self.loaded_offset = std::usize::MAX;
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq);
} else {
self.skip_reader.reset(ReadOnlySource::empty(), doc_freq);
}
self.doc_freq = doc_freq;
self.load_block();
}
/// Returns the overall number of documents in the block postings.
/// It does not take into account whether documents are deleted or not.
///
/// This `doc_freq` is simply the sum of the lengths of all of the blocks,
/// and it does not take into account deleted documents.
pub fn doc_freq(&self) -> u32 {
self.doc_freq
}
/// Returns the array of docs in the current block.
///
/// Before the first call to `.advance()`, the block
/// returned by `.docs()` is empty.
#[inline]
pub fn docs(&self) -> &[DocId] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.output_array()
}
/// Returns a full block, regardless of whether the block is complete or incomplete (
/// as it happens for the last block of the posting list).
///
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
///
/// This method is useful to run SSE2 linear search.
#[inline(always)]
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
debug_assert!(self.block_is_loaded());
self.doc_decoder.output_aligned()
}
/// Returns the document at index `idx` of the block.
#[inline(always)]
pub fn doc(&self, idx: usize) -> u32 {
self.doc_decoder.output(idx)
}
/// Returns the array of term frequencies in the block.
#[inline]
pub fn freqs(&self) -> &[u32] {
debug_assert!(self.block_is_loaded());
self.freq_decoder.output_array()
}
/// Returns the term frequency at index `idx` of the block.
#[inline]
pub fn freq(&self, idx: usize) -> u32 {
debug_assert!(self.block_is_loaded());
self.freq_decoder.output(idx)
}
/// Returns the length of the current block.
///
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
/// except the last block, which may have any length
/// between 1 and `NUM_DOCS_PER_BLOCK - 1`.
#[inline]
pub fn block_len(&self) -> usize {
debug_assert!(self.block_is_loaded());
self.doc_decoder.output_len
}
/// Positions the cursor on a block that may contain `target_doc`.
///
/// If all docs are smaller than the target, the loaded block may be empty,
/// or may be the last, incomplete, VInt-encoded block.
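///
/// A small sketch mirroring `test_block_segment_postings_seek` in the tests
/// below (`target` is assumed to be a `DocId` in the list's range):
///
/// ```ignore
/// block_postings.seek(target);
/// let docs = block_postings.docs();
/// // For a non-exhausted list, the loaded block now straddles `target`.
/// assert!(docs[0] <= target);
/// ```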
pub fn seek(&mut self, target_doc: DocId) {
self.shallow_seek(target_doc);
self.load_block();
}
pub(crate) fn position_offset(&self) -> u64 {
self.skip_reader.position_offset()
}
/// Dangerous API! This seeks on the skip list,
/// but does not call `.load_block()` afterwards:
/// `.load_block()` needs to be called manually.
///
/// If all docs are smaller than the target, the loaded block may be empty,
/// or may be the last, incomplete, VInt-encoded block.
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None;
}
}
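// A block counts as loaded iff the offset it was decoded from matches the
// skip reader's current byte offset.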
pub(crate) fn block_is_loaded(&self) -> bool {
self.loaded_offset == self.skip_reader.byte_offset()
}
pub(crate) fn load_block(&mut self) {
let offset = self.skip_reader.byte_offset();
if self.loaded_offset == offset {
return;
}
self.loaded_offset = offset;
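// Decode according to the skip reader's block info: full blocks are
// bitpacked, while the final partial block is VInt-encoded.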
match self.skip_reader.block_info() {
BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
..
} => {
decode_bitpacked_block(
&mut self.doc_decoder,
if let FreqReadingOption::ReadFreq = self.freq_reading_option {
Some(&mut self.freq_decoder)
} else {
None
},
&self.data.as_slice()[offset..],
self.skip_reader.last_doc_in_previous_block,
doc_num_bits,
tf_num_bits,
);
}
BlockInfo::VInt { num_docs } => {
let data = {
if num_docs == 0 {
&[]
} else {
&self.data.as_slice()[offset..]
}
};
decode_vint_block(
&mut self.doc_decoder,
if let FreqReadingOption::ReadFreq = self.freq_reading_option {
Some(&mut self.freq_decoder)
} else {
None
},
data,
self.skip_reader.last_doc_in_previous_block,
num_docs as usize,
);
}
}
}
/// Advances to the next block.
pub fn advance(&mut self) {
self.skip_reader.advance();
self.block_max_score_cache = None;
self.load_block();
}
/// Returns an empty `BlockSegmentPostings` object.
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
loaded_offset: 0,
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,
block_max_score_cache: None,
doc_freq: 0,
data: ReadOnlySource::new(vec![]),
skip_reader: SkipReader::new(ReadOnlySource::new(vec![]), 0, IndexRecordOption::Basic),
}
}
}
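// A minimal sketch of a full enumeration over a `BlockSegmentPostings`,
// following the same pattern as `test_block_segment_postings` below
// (`block_postings` is assumed to come from `read_block_postings_from_terminfo`):
//
//     loop {
//         let docs = block_postings.docs();
//         if docs.is_empty() {
//             break;
//         }
//         for &doc in docs {
//             // ... consume `doc` ...
//         }
//         block_postings.advance();
//     }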
#[cfg(test)]
mod tests {
use super::BlockSegmentPostings;
use crate::common::HasLen;
use crate::core::Index;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::postings::Postings;
use crate::postings::SegmentPostings;
use crate::schema::IndexRecordOption;
use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::INDEXED;
use crate::DocId;
#[test]
fn test_empty_segment_postings() {
let mut postings = SegmentPostings::empty();
assert_eq!(postings.doc(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
assert_eq!(postings.doc_freq(), 0);
assert_eq!(postings.len(), 0);
}
#[test]
fn test_empty_postings_doc_returns_terminated() {
let mut postings = SegmentPostings::empty();
assert_eq!(postings.doc(), TERMINATED);
assert_eq!(postings.advance(), TERMINATED);
}
#[test]
fn test_empty_postings_doc_term_freq_returns_1() {
let postings = SegmentPostings::empty();
assert_eq!(postings.term_freq(), 1);
}
#[test]
fn test_empty_block_segment_postings() {
let mut postings = BlockSegmentPostings::empty();
assert!(postings.docs().is_empty());
assert_eq!(postings.doc_freq(), 0);
postings.advance();
assert!(postings.docs().is_empty());
assert_eq!(postings.doc_freq(), 0);
}
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the `doc_freq` is correct
assert_eq!(block_segments.doc_freq(), 100_000);
loop {
let block = block_segments.docs();
if block.is_empty() {
break;
}
for (i, doc) in block.iter().cloned().enumerate() {
assert_eq!(offset + (i as u32), doc);
}
offset += block.len() as u32;
block_segments.advance();
}
}
#[test]
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
// 128 is missing
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(128), 129);
assert_eq!(docset.doc(), 129);
assert_eq!(docset.advance(), 130);
assert_eq!(docset.doc(), 130);
assert_eq!(docset.advance(), TERMINATED);
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(129), 129);
assert_eq!(docset.doc(), 129);
assert_eq!(docset.advance(), 130);
assert_eq!(docset.doc(), 130);
assert_eq!(docset.advance(), TERMINATED);
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.doc(), 0);
assert_eq!(docset.seek(131), TERMINATED);
assert_eq!(docset.doc(), TERMINATED);
}
}
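// Builds a posting list for the term `id=0`: every doc id in `docs` gets a
// document with value 0, while the gaps in between are filled with documents
// holding value 1, so that the doc ids of the term line up with `docs`.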
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
index_writer.add_document(doc!(int_field=>0u64));
last_doc = doc + 1;
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
}
#[test]
fn test_block_segment_postings_seek() {
let mut docs = vec![0];
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] {
block_postings.seek(i);
let docs = block_postings.docs();
assert!(docs[0] <= i);
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
}
block_postings.seek(100_000);
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
}
#[test]
fn test_reset_block_segment_postings() {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
// Create two posting lists: one containing even numbers,
// the other containing odd numbers.
for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64);
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let mut block_segments;
{
let term = Term::from_field_u64(int_field, 0u64);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
block_segments = inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
}
assert_eq!(block_segments.docs(), &[0, 2, 4]);
{
let term = Term::from_field_u64(int_field, 1u64);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
}
assert_eq!(block_segments.docs(), &[1, 3, 5]);
}
}

View File

@@ -17,12 +17,6 @@ pub struct BlockEncoder {
     pub output_len: usize,
 }
 
-impl Default for BlockEncoder {
-    fn default() -> Self {
-        BlockEncoder::new()
-    }
-}
-
 impl BlockEncoder {
     pub fn new() -> BlockEncoder {
         BlockEncoder {
@@ -52,23 +46,19 @@ impl BlockEncoder {
 /// We ensure that the OutputBuffer is align on 128 bits
 /// in order to run SSE2 linear search on it.
 #[repr(align(128))]
-#[derive(Clone)]
 pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
 
-#[derive(Clone)]
 pub struct BlockDecoder {
     bitpacker: BitPacker4x,
     output: AlignedBuffer,
     pub output_len: usize,
 }
 
-impl Default for BlockDecoder {
-    fn default() -> Self {
+impl BlockDecoder {
+    pub fn new() -> BlockDecoder {
         BlockDecoder::with_val(0u32)
     }
-}
 
-impl BlockDecoder {
     pub fn with_val(val: u32) -> BlockDecoder {
         BlockDecoder {
             bitpacker: BitPacker4x::new(),
@@ -100,8 +90,8 @@ impl BlockDecoder {
     }
 
     #[inline]
-    pub(crate) fn output_aligned(&self) -> &AlignedBuffer {
-        &self.output
+    pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
+        (&self.output, self.output_len)
     }
 
     #[inline]
@@ -144,14 +134,11 @@ pub trait VIntDecoder {
     /// For instance, if delta encoded are `1, 3, 9`, and the
     /// `offset` is 5, then the output will be:
     /// `5 + 1 = 6, 6 + 3 = 9, 9 + 9 = 18`
-    ///
-    /// The value given in `padding` will be used to fill the remaining `128 - num_els` values.
-    fn uncompress_vint_sorted(
+    fn uncompress_vint_sorted<'a>(
         &mut self,
-        compressed_data: &[u8],
+        compressed_data: &'a [u8],
         offset: u32,
         num_els: usize,
-        padding: u32,
     ) -> usize;
 
     /// Uncompress an array of `u32s`, compressed using variable
@@ -159,14 +146,7 @@ pub trait VIntDecoder {
     ///
     /// The method takes a number of int to decompress, and returns
     /// the amount of bytes that were read to decompress them.
-    ///
-    /// The value given in `padding` will be used to fill the remaining `128 - num_els` values.
-    fn uncompress_vint_unsorted(
-        &mut self,
-        compressed_data: &[u8],
-        num_els: usize,
-        padding: u32,
-    ) -> usize;
+    fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
 }
 
 impl VIntEncoder for BlockEncoder {
@@ -180,26 +160,18 @@ impl VIntEncoder for BlockEncoder {
 }
 
 impl VIntDecoder for BlockDecoder {
-    fn uncompress_vint_sorted(
+    fn uncompress_vint_sorted<'a>(
         &mut self,
-        compressed_data: &[u8],
+        compressed_data: &'a [u8],
         offset: u32,
         num_els: usize,
-        padding: u32,
     ) -> usize {
         self.output_len = num_els;
-        self.output.0.iter_mut().for_each(|el| *el = padding);
         vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
     }
 
-    fn uncompress_vint_unsorted(
-        &mut self,
-        compressed_data: &[u8],
-        num_els: usize,
-        padding: u32,
-    ) -> usize {
+    fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
         self.output_len = num_els;
-        self.output.0.iter_mut().for_each(|el| *el = padding);
         vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
     }
 }
@@ -208,14 +180,13 @@ impl VIntDecoder for BlockDecoder {
 pub mod tests {
 
     use super::*;
-    use crate::TERMINATED;
 
     #[test]
     fn test_encode_sorted_block() {
        let vals: Vec<u32> = (0u32..128u32).map(|i| i * 7).collect();
        let mut encoder = BlockEncoder::new();
        let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 0);
-       let mut decoder = BlockDecoder::default();
+       let mut decoder = BlockDecoder::new();
        {
            let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0, num_bits);
            assert_eq!(consumed_num_bytes, compressed_data.len());
@@ -228,9 +199,9 @@ pub mod tests {
    #[test]
    fn test_encode_sorted_block_with_offset() {
        let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i * 7).collect();
-       let mut encoder = BlockEncoder::default();
+       let mut encoder = BlockEncoder::new();
        let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10);
-       let mut decoder = BlockDecoder::default();
+       let mut decoder = BlockDecoder::new();
        {
            let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10, num_bits);
            assert_eq!(consumed_num_bytes, compressed_data.len());
@@ -245,11 +216,11 @@ pub mod tests {
        let mut compressed: Vec<u8> = Vec::new();
        let n = 128;
        let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32).collect();
-       let mut encoder = BlockEncoder::default();
+       let mut encoder = BlockEncoder::new();
        let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10);
        compressed.extend_from_slice(compressed_data);
        compressed.push(173u8);
-       let mut decoder = BlockDecoder::default();
+       let mut decoder = BlockDecoder::new();
        {
            let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10, num_bits);
            assert_eq!(consumed_num_bytes, compressed.len() - 1);
@@ -265,11 +236,11 @@ pub mod tests {
        let mut compressed: Vec<u8> = Vec::new();
        let n = 128;
        let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
-       let mut encoder = BlockEncoder::default();
+       let mut encoder = BlockEncoder::new();
        let (num_bits, compressed_data) = encoder.compress_block_unsorted(&vals);
        compressed.extend_from_slice(compressed_data);
        compressed.push(173u8);
-       let mut decoder = BlockDecoder::default();
+       let mut decoder = BlockDecoder::new();
        {
            let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed, num_bits);
            assert_eq!(consumed_num_bytes + 1, compressed.len());
@@ -280,27 +251,20 @@ pub mod tests {
        }
    }
 
-   #[test]
-   fn test_block_decoder_initialization() {
-       let block = BlockDecoder::with_val(TERMINATED);
-       assert_eq!(block.output(0), TERMINATED);
-   }
-
    #[test]
    fn test_encode_vint() {
-       const PADDING_VALUE: u32 = 234_234_345u32;
-       let expected_length = 154;
-       let mut encoder = BlockEncoder::new();
-       let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect();
-       for offset in &[0u32, 1u32, 2u32] {
-           let encoded_data = encoder.compress_vint_sorted(&input, *offset);
-           assert!(encoded_data.len() <= expected_length);
-           let mut decoder = BlockDecoder::default();
-           let consumed_num_bytes =
-               decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len(), PADDING_VALUE);
-           assert_eq!(consumed_num_bytes, encoded_data.len());
-           assert_eq!(input, decoder.output_array());
-           for i in input.len()..COMPRESSION_BLOCK_SIZE {
-               assert_eq!(decoder.output(i), PADDING_VALUE);
+       {
+           let expected_length = 154;
+           let mut encoder = BlockEncoder::new();
+           let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect();
+           for offset in &[0u32, 1u32, 2u32] {
+               let encoded_data = encoder.compress_vint_sorted(&input, *offset);
+               assert!(encoded_data.len() <= expected_length);
+               let mut decoder = BlockDecoder::new();
+               let consumed_num_bytes =
+                   decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
+               assert_eq!(consumed_num_bytes, encoded_data.len());
+               assert_eq!(input, decoder.output_array());
            }
        }
    }
@@ -310,7 +274,6 @@ pub mod tests {
 mod bench {
 
    use super::*;
-   use crate::TERMINATED;
    use rand::rngs::StdRng;
    use rand::Rng;
    use rand::SeedableRng;
@@ -341,7 +304,7 @@ mod bench {
        let mut encoder = BlockEncoder::new();
        let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
        let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
-       let mut decoder = BlockDecoder::default();
+       let mut decoder = BlockDecoder::new();
        b.iter(|| {
            decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
        });
@@ -376,9 +339,9 @@ mod bench {
        let mut encoder = BlockEncoder::new();
        let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
        let compressed = encoder.compress_vint_sorted(&data, 0u32);
-       let mut decoder = BlockDecoder::default();
+       let mut decoder = BlockDecoder::new();
        b.iter(|| {
-           decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT, TERMINATED);
+           decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
        });
    }
 }

View File

@@ -42,7 +42,7 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
 }
 
 #[inline(always)]
-pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
+pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize {
     let mut read_byte = 0;
     let mut result = offset;
     for output_mut in output.iter_mut() {

View File

@@ -3,8 +3,11 @@ Postings module (also called inverted index)
 */
 
 mod block_search;
-mod block_segment_postings;
 pub(crate) mod compression;
+/// Postings module
+///
+/// Postings, also called inverted lists, is the key datastructure
+/// to full-text search.
 mod postings;
 mod postings_writer;
 mod recorder;
@@ -19,17 +22,18 @@ pub(crate) use self::block_search::BlockSearcher;
 pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
 
+use self::compression::COMPRESSION_BLOCK_SIZE;
 pub use self::postings::Postings;
-pub(crate) use self::skip::{BlockInfo, SkipReader};
+pub(crate) use self::skip::SkipReader;
 pub use self::term_info::TermInfo;
 
-pub use self::block_segment_postings::BlockSegmentPostings;
-pub use self::segment_postings::SegmentPostings;
+pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
 
 pub(crate) use self::stacker::compute_table_size;
 pub use crate::common::HasLen;
 
+pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
 pub(crate) type UnorderedTermId = u64;
 
 #[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
@@ -47,7 +51,7 @@ pub mod tests {
    use crate::core::Index;
    use crate::core::SegmentComponent;
    use crate::core::SegmentReader;
-   use crate::docset::{DocSet, TERMINATED};
+   use crate::docset::{DocSet, SkipResult};
    use crate::fieldnorm::FieldNormReader;
    use crate::indexer::operation::AddOperation;
    use crate::indexer::SegmentWriter;
@@ -65,42 +69,45 @@ pub mod tests {
    use std::iter;
 
    #[test]
-   pub fn test_position_write() -> crate::Result<()> {
+   pub fn test_position_write() {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut segment = index.new_segment();
-       let mut posting_serializer = InvertedIndexSerializer::open(&mut segment)?;
-       let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4, None)?;
-       field_serializer.new_term("abc".as_bytes(), 12u32)?;
-       for doc_id in 0u32..120u32 {
-           let delta_positions = vec![1, 2, 3, 2];
-           field_serializer.write_doc(doc_id, 4, &delta_positions)?;
+       let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
+       {
+           let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap();
+           field_serializer.new_term("abc".as_bytes()).unwrap();
+           for doc_id in 0u32..120u32 {
+               let delta_positions = vec![1, 2, 3, 2];
+               field_serializer
+                   .write_doc(doc_id, 4, &delta_positions)
+                   .unwrap();
+           }
+           field_serializer.close_term().unwrap();
        }
-       field_serializer.close_term()?;
-       posting_serializer.close()?;
-       let read = segment.open_read(SegmentComponent::POSITIONS)?;
+       posting_serializer.close().unwrap();
+       let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
        assert!(read.len() <= 140);
-       Ok(())
    }
 
    #[test]
-   pub fn test_skip_positions() -> crate::Result<()> {
+   pub fn test_skip_positions() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
-       let mut index_writer = index.writer_for_tests()?;
+       let mut index_writer = index.writer_with_num_threads(1, 30_000_000).unwrap();
        index_writer.add_document(doc!(title => r#"abc abc abc"#));
        index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
        for _ in 0..1_000 {
            index_writer.add_document(doc!(title => r#"abc abc abc"#));
        }
        index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
-       index_writer.commit()?;
-       let searcher = index.reader()?.searcher();
+       index_writer.commit().unwrap();
+       let searcher = index.reader().unwrap().searcher();
        let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
        let term = Term::from_field_text(title, "abc");
        let mut positions = Vec::new();
@@ -108,12 +115,29 @@ pub mod tests {
        let mut postings = inverted_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap();
-       assert_eq!(postings.doc(), 0);
+       postings.advance();
        postings.positions(&mut positions);
        assert_eq!(&[0, 1, 2], &positions[..]);
        postings.positions(&mut positions);
        assert_eq!(&[0, 1, 2], &positions[..]);
-       assert_eq!(postings.advance(), 1);
+       postings.advance();
+       postings.positions(&mut positions);
+       assert_eq!(&[0, 5], &positions[..]);
+   }
+   {
+       let mut postings = inverted_index
+           .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+           .unwrap();
+       postings.advance();
+       postings.advance();
+       postings.positions(&mut positions);
+       assert_eq!(&[0, 5], &positions[..]);
+   }
+   {
+       let mut postings = inverted_index
+           .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+           .unwrap();
+       assert_eq!(postings.skip_next(1), SkipResult::Reached);
        assert_eq!(postings.doc(), 1);
        postings.positions(&mut positions);
        assert_eq!(&[0, 5], &positions[..]);
@@ -122,25 +146,7 @@ pub mod tests {
        let mut postings = inverted_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap();
-       assert_eq!(postings.doc(), 0);
-       assert_eq!(postings.advance(), 1);
-       postings.positions(&mut positions);
-       assert_eq!(&[0, 5], &positions[..]);
-   }
-   {
-       let mut postings = inverted_index
-           .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
-           .unwrap();
-       assert_eq!(postings.seek(1), 1);
-       assert_eq!(postings.doc(), 1);
-       postings.positions(&mut positions);
-       assert_eq!(&[0, 5], &positions[..]);
-   }
-   {
-       let mut postings = inverted_index
-           .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
-           .unwrap();
-       assert_eq!(postings.seek(1002), 1002);
+       assert_eq!(postings.skip_next(1002), SkipResult::Reached);
        assert_eq!(postings.doc(), 1002);
        postings.positions(&mut positions);
        assert_eq!(&[0, 5], &positions[..]);
@@ -149,13 +155,12 @@ pub mod tests {
        let mut postings = inverted_index
            .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
            .unwrap();
-       assert_eq!(postings.seek(100), 100);
-       assert_eq!(postings.seek(1002), 1002);
+       assert_eq!(postings.skip_next(100), SkipResult::Reached);
+       assert_eq!(postings.skip_next(1002), SkipResult::Reached);
        assert_eq!(postings.doc(), 1002);
        postings.positions(&mut positions);
        assert_eq!(&[0, 5], &positions[..]);
    }
-   Ok(())
 }
 
 #[test]
@@ -176,7 +181,7 @@ pub mod tests {
        .tokenizers()
        .register("simple_no_truncation", SimpleTokenizer);
    let reader = index.reader().unwrap();
-   let mut index_writer = index.writer_for_tests().unwrap();
+   let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    index_writer.set_merge_policy(Box::new(NoMergePolicy));
    {
        index_writer.add_document(doc!(text_field=>exceeding_token_text));
@@ -205,7 +210,7 @@ pub mod tests {
    }
 
    #[test]
-   pub fn test_position_and_fieldnorm1() -> crate::Result<()> {
+   pub fn test_position_and_fieldnorm1() {
        let mut positions = Vec::new();
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
@@ -217,38 +222,42 @@ pub mod tests {
        let mut segment_writer =
            SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
        {
+           let mut doc = Document::default();
            // checking that position works if the field has two values
+           doc.add_text(text_field, "a b a c a d a a.");
+           doc.add_text(text_field, "d d d d a");
            let op = AddOperation {
                opstamp: 0u64,
-               document: doc!(
-                   text_field => "a b a c a d a a.",
-                   text_field => "d d d d a"
-               ),
+               document: doc,
            };
-           segment_writer.add_document(op, &schema)?;
+           segment_writer.add_document(op, &schema).unwrap();
        }
        {
+           let mut doc = Document::default();
+           doc.add_text(text_field, "b a");
            let op = AddOperation {
                opstamp: 1u64,
-               document: doc!(text_field => "b a"),
+               document: doc,
            };
            segment_writer.add_document(op, &schema).unwrap();
        }
        for i in 2..1000 {
-           let mut text: String = iter::repeat("e ").take(i).collect();
+           let mut doc = Document::default();
+           let mut text = iter::repeat("e ").take(i).collect::<String>();
            text.push_str(" a");
+           doc.add_text(text_field, &text);
            let op = AddOperation {
                opstamp: 2u64,
-               document: doc!(text_field => text),
+               document: doc,
            };
            segment_writer.add_document(op, &schema).unwrap();
        }
-       segment_writer.finalize()?;
+       segment_writer.finalize().unwrap();
    }
    {
-       let segment_reader = SegmentReader::open(&segment)?;
+       let segment_reader = SegmentReader::open(&segment).unwrap();
        {
-           let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
+           let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field);
            assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
            assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
            for i in 2..1000 {
@@ -272,21 +281,22 @@ pub mod tests {
                .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                .unwrap();
            assert_eq!(postings_a.len(), 1000);
+           assert!(postings_a.advance());
            assert_eq!(postings_a.doc(), 0);
            assert_eq!(postings_a.term_freq(), 6);
            postings_a.positions(&mut positions);
            assert_eq!(&positions[..], [0, 2, 4, 6, 7, 13]);
-           assert_eq!(postings_a.advance(), 1u32);
+           assert!(postings_a.advance());
            assert_eq!(postings_a.doc(), 1u32);
            assert_eq!(postings_a.term_freq(), 1);
            for i in 2u32..1000u32 {
-               assert_eq!(postings_a.advance(), i);
+               assert!(postings_a.advance());
                assert_eq!(postings_a.term_freq(), 1);
                postings_a.positions(&mut positions);
                assert_eq!(&positions[..], [i]);
                assert_eq!(postings_a.doc(), i);
            }
-           assert_eq!(postings_a.advance(), TERMINATED);
+           assert!(!postings_a.advance());
        }
        {
            let term_e = Term::from_field_text(text_field, "e");
@@ -296,6 +306,7 @@ pub mod tests {
                .unwrap();
            assert_eq!(postings_e.len(), 1000 - 2);
            for i in 2u32..1000u32 {
+               assert!(postings_e.advance());
                assert_eq!(postings_e.term_freq(), i);
                postings_e.positions(&mut positions);
                assert_eq!(positions.len(), i as usize);
@@ -303,12 +314,10 @@ pub mod tests {
                    assert_eq!(positions[j], (j as u32));
                }
                assert_eq!(postings_e.doc(), i);
-               postings_e.advance();
            }
-           assert_eq!(postings_e.doc(), TERMINATED);
+           assert!(!postings_e.advance());
        }
    }
-   Ok(())
 }
 
 #[test]
@@ -319,9 +328,17 @@ pub mod tests {
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    {
-       let mut index_writer = index.writer_for_tests().unwrap();
-       index_writer.add_document(doc!(text_field => "g b b d c g c"));
-       index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
+       let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+       {
+           let mut doc = Document::default();
+           doc.add_text(text_field, "g b b d c g c");
+           index_writer.add_document(doc);
+       }
+       {
+           let mut doc = Document::default();
+           doc.add_text(text_field, "g a b b a d c g c");
+           index_writer.add_document(doc);
+       }
        assert!(index_writer.commit().is_ok());
    }
    let term_a = Term::from_field_text(text_field, "a");
@@ -331,6 +348,7 @@ pub mod tests {
        .inverted_index(text_field)
        .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
        .unwrap();
+   assert!(postings.advance());
    assert_eq!(postings.doc(), 1u32);
    postings.positions(&mut positions);
    assert_eq!(&positions[..], &[1u32, 4]);
@@ -351,9 +369,12 @@ pub mod tests {
    let index = Index::create_in_ram(schema);
    {
-       let mut index_writer = index.writer_for_tests().unwrap();
-       for i in 0u64..num_docs as u64 {
-           let doc = doc!(value_field => 2u64, value_field => i % 2u64);
+       let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+       for i in 0..num_docs {
+           let mut doc = Document::default();
+           doc.add_u64(value_field, 2);
+           doc.add_u64(value_field, (i % 2) as u64);
            index_writer.add_document(doc);
        }
        assert!(index_writer.commit().is_ok());
@@ -370,10 +391,11 @@ pub mod tests {
            .inverted_index(term_2.field())
            .read_postings(&term_2, IndexRecordOption::Basic)
            .unwrap();
-       assert_eq!(segment_postings.seek(i), i);
+
+       assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
        assert_eq!(segment_postings.doc(), i);
-       assert_eq!(segment_postings.seek(j), j);
+       assert_eq!(segment_postings.skip_next(j), SkipResult::Reached);
        assert_eq!(segment_postings.doc(), j);
    }
 }
@@ -385,16 +407,17 @@ pub mod tests {
        .unwrap();
 
    // check that `skip_next` advances the iterator
+   assert!(segment_postings.advance());
    assert_eq!(segment_postings.doc(), 0);
 
-   assert_eq!(segment_postings.seek(1), 1);
+   assert_eq!(segment_postings.skip_next(1), SkipResult::Reached);
    assert_eq!(segment_postings.doc(), 1);
 
-   assert_eq!(segment_postings.seek(1), 1);
-   assert_eq!(segment_postings.doc(), 1);
+   assert_eq!(segment_postings.skip_next(1), SkipResult::OverStep);
+   assert_eq!(segment_postings.doc(), 2);
 
    // check that going beyond the end is handled
-   assert_eq!(segment_postings.seek(num_docs), TERMINATED);
+   assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
 }
 
 // check that filtering works
@@ -405,7 +428,7 @@ pub mod tests {
        .unwrap();
    for i in 0..num_docs / 2 {
-       assert_eq!(segment_postings.seek(i * 2), i * 2);
+       assert_eq!(segment_postings.skip_next(i * 2), SkipResult::Reached);
        assert_eq!(segment_postings.doc(), i * 2);
    }
@@ -415,19 +438,18 @@ pub mod tests {
        .unwrap();
    for i in 0..num_docs / 2 - 1 {
-       assert!(segment_postings.seek(i * 2 + 1) > (i * 1) * 2);
+       assert_eq!(segment_postings.skip_next(i * 2 + 1), SkipResult::OverStep);
        assert_eq!(segment_postings.doc(), (i + 1) * 2);
    }
 }
 
 // delete some of the documents
 {
-   let mut index_writer = index.writer_for_tests().unwrap();
+   let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    index_writer.delete_term(term_0);
    assert!(index_writer.commit().is_ok());
 }
 let searcher = index.reader().unwrap().searcher();
-assert_eq!(searcher.segment_readers().len(), 1);
 let segment_reader = searcher.segment_reader(0);
 
 // make sure seeking still works
@@ -438,11 +460,11 @@ pub mod tests {
        .unwrap();
    if i % 2 == 0 {
-       assert_eq!(segment_postings.seek(i), i);
+       assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
        assert_eq!(segment_postings.doc(), i);
        assert!(segment_reader.is_deleted(i));
    } else {
-       assert_eq!(segment_postings.seek(i), i);
+       assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
        assert_eq!(segment_postings.doc(), i);
    }
 }
@@ -457,16 +479,12 @@ pub mod tests {
    let mut last = 2; // start from 5 to avoid seeking to 3 twice
    let mut cur = 3;
    loop {
-       let seek = segment_postings.seek(cur);
-       if seek == TERMINATED {
-           break;
-       }
-       assert_eq!(seek, segment_postings.doc());
-       if seek == cur {
-           assert_eq!(segment_postings.doc(), cur);
-       } else {
-           assert_eq!(segment_postings.doc(), cur + 1);
+       match segment_postings.skip_next(cur) {
+           SkipResult::End => break,
+           SkipResult::Reached => assert_eq!(segment_postings.doc(), cur),
+           SkipResult::OverStep => assert_eq!(segment_postings.doc(), cur + 1),
        }
        let next = cur + last;
        last = cur;
        cur = next;
@@ -476,7 +494,7 @@ pub mod tests {
    // delete everything else
    {
-       let mut index_writer = index.writer_for_tests().unwrap();
+       let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        index_writer.delete_term(term_1);
        assert!(index_writer.commit().is_ok());
    }
@@ -519,7 +537,7 @@ pub mod tests {
    let index = Index::create_in_ram(schema);
    let posting_list_size = 1_000_000;
    {
-       let mut index_writer = index.writer_for_tests().unwrap();
+       let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        for _ in 0..posting_list_size {
            let mut doc = Document::default();
            if rng.gen_bool(1f64 / 15f64) {
@@ -552,7 +570,7 @@ pub mod tests {
    }
 
    impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
-       fn advance(&mut self) -> DocId {
+       fn advance(&mut self) -> bool {
            self.0.advance()
        }
@@ -577,26 +595,31 @@ pub mod tests {
    ) {
        for target in targets {
            let mut postings_opt = postings_factory();
-           if target < postings_opt.doc() {
-               continue;
-           }
            let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
-           let skip_result_opt = postings_opt.seek(target);
-           let skip_result_unopt = postings_unopt.seek(target);
+           let skip_result_opt = postings_opt.skip_next(target);
+           let skip_result_unopt = postings_unopt.skip_next(target);
            assert_eq!(
                skip_result_unopt, skip_result_opt,
                "Failed while skipping to {}",
                target
            );
-           assert!(skip_result_opt >= target);
-           assert_eq!(skip_result_opt, postings_opt.doc());
-           if skip_result_opt == TERMINATED {
-               return;
+           match skip_result_opt {
+               SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
+               SkipResult::OverStep => assert!(postings_opt.doc() > target),
+               SkipResult::End => {
+                   return;
+               }
            }
-           while postings_opt.doc() != TERMINATED {
-               assert_eq!(postings_opt.doc(), postings_unopt.doc());
-               assert_eq!(postings_opt.advance(), postings_unopt.advance());
+           while postings_opt.advance() {
+               assert!(postings_unopt.advance());
+               assert_eq!(
+                   postings_opt.doc(),
+                   postings_unopt.doc(),
+                   "Failed while skipping to {}",
+                   target
+               );
           }
+           assert!(!postings_unopt.advance());
        }
    }
 }
@@ -605,7 +628,7 @@ pub mod tests {
 mod bench {
 
    use super::tests::*;
-   use crate::docset::TERMINATED;
+   use crate::docset::SkipResult;
    use crate::query::Intersection;
    use crate::schema::IndexRecordOption;
    use crate::tests;
@@ -623,7 +646,7 @@ mod bench {
            .inverted_index(TERM_A.field())
            .read_postings(&*TERM_A, IndexRecordOption::Basic)
            .unwrap();
-       while segment_postings.advance() != TERMINATED {}
+       while segment_postings.advance() {}
    });
 }
@@ -655,7 +678,7 @@ mod bench {
        segment_postings_c,
        segment_postings_d,
    ]);
-   while intersection.advance() != TERMINATED {}
+   while intersection.advance() {}
 });
 }
@@ -671,10 +694,11 @@ mod bench {
        .unwrap();
 
    let mut existing_docs = Vec::new();
+   segment_postings.advance();
    for doc in &docs {
        if *doc >= segment_postings.doc() {
            existing_docs.push(*doc);
-           if segment_postings.seek(*doc) == TERMINATED {
+           if segment_postings.skip_next(*doc) == SkipResult::End {
                break;
            }
        }
@@ -686,7 +710,7 @@ mod bench {
        .read_postings(&*TERM_A, IndexRecordOption::Basic)
        .unwrap();
    for doc in &existing_docs {
-       if segment_postings.seek(*doc) == TERMINATED {
+       if segment_postings.skip_next(*doc) == SkipResult::End {
            break;
        }
    }
@@ -725,9 +749,8 @@ mod bench {
        .read_postings(&*TERM_A, IndexRecordOption::Basic)
        .unwrap();
    let mut s = 0u32;
-   while segment_postings.doc() != TERMINATED {
+   while segment_postings.advance() {
        s += (segment_postings.doc() & n) % 1024;
-       segment_postings.advance();
    }
    s
 });

View File

@@ -1,6 +1,5 @@
 use super::stacker::{Addr, MemoryArena, TermHashMap};
-use crate::fieldnorm::FieldNormReaders;
 use crate::postings::recorder::{
     BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
 };
@@ -105,7 +104,6 @@ impl MultiFieldPostingsWriter {
        doc: DocId,
        field: Field,
        token_stream: &mut dyn TokenStream,
-       term_buffer: &mut Term,
    ) -> u32 {
        let postings_writer =
            self.per_field_postings_writers[field.field_id() as usize].deref_mut();
@@ -115,7 +113,6 @@ impl MultiFieldPostingsWriter {
            field,
            token_stream,
            &mut self.heap,
-           term_buffer,
        )
    }
@@ -131,7 +128,6 @@ impl MultiFieldPostingsWriter {
    pub fn serialize(
        &self,
        serializer: &mut InvertedIndexSerializer,
-       fieldnorm_readers: FieldNormReaders,
    ) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
        let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
            self.term_index.iter().collect();
@@ -165,12 +161,8 @@ impl MultiFieldPostingsWriter {
        }
 
        let postings_writer = &self.per_field_postings_writers[field.field_id() as usize];
-       let fieldnorm_reader = fieldnorm_readers.get_field(field);
-       let mut field_serializer = serializer.new_field(
-           field,
-           postings_writer.total_num_tokens(),
-           fieldnorm_reader,
-       )?;
+       let mut field_serializer =
+           serializer.new_field(field, postings_writer.total_num_tokens())?;
        postings_writer.serialize(
            &term_offsets[start..stop],
            &mut field_serializer,
@@ -222,22 +214,21 @@ pub trait PostingsWriter {
        field: Field,
        token_stream: &mut dyn TokenStream,
        heap: &mut MemoryArena,
-       term_buffer: &mut Term,
    ) -> u32 {
-       term_buffer.set_field(field);
+       let mut term = Term::for_field(field);
        let mut sink = |token: &Token| {
            // We skip all tokens with a len greater than u16.
-           if token.text.len() > MAX_TOKEN_LEN {
-               return;
+           if token.text.len() <= MAX_TOKEN_LEN {
+               term.set_text(token.text.as_str());
+               self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+           } else {
+               info!(
+                   "A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
+                    MAX_TOKEN_LEN in the documentation for more information.",
+                   token.text.len(),
+                   MAX_TOKEN_LEN
+               );
           }
-           term_buffer.set_text(token.text.as_str());
-           self.subscribe(
-               term_index,
-               doc_id,
-               token.position as u32,
-               &term_buffer,
-               heap,
-           );
        };
        token_stream.process(&mut sink)
    }
@@ -306,8 +297,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
        let mut buffer_lender = BufferLender::default();
        for &(term_bytes, addr, _) in term_addrs {
            let recorder: Rec = termdict_heap.read(addr);
-           let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
-           serializer.new_term(&term_bytes[4..], term_doc_freq)?;
+           serializer.new_term(&term_bytes[4..])?;
            recorder.serialize(&mut buffer_lender, serializer, heap)?;
            serializer.close_term()?;
        }

View File

@@ -75,10 +75,6 @@ pub(crate) trait Recorder: Copy + 'static {
        serializer: &mut FieldSerializer<'_>,
        heap: &MemoryArena,
    ) -> io::Result<()>;
-
-   /// Returns the number of document containing this term.
-   ///
-   /// Returns `None` if not available.
-   fn term_doc_freq(&self) -> Option<u32>;
 }
 
 /// Only records the doc ids
@@ -117,16 +113,11 @@ impl Recorder for NothingRecorder {
    ) -> io::Result<()> {
        let buffer = buffer_lender.lend_u8();
        self.stack.read_to_end(heap, buffer);
-       // TODO avoid reading twice.
        for doc in VInt32Reader::new(&buffer[..]) {
            serializer.write_doc(doc as u32, 0u32, &[][..])?;
        }
        Ok(())
    }
-
-   fn term_doc_freq(&self) -> Option<u32> {
-       None
-   }
 }
 
 /// Recorder encoding document ids, and term frequencies
@@ -135,7 +126,6 @@ pub struct TermFrequencyRecorder {
    stack: ExpUnrolledLinkedList,
    current_doc: DocId,
    current_tf: u32,
-   term_doc_freq: u32,
 }
 
 impl Recorder for TermFrequencyRecorder {
@@ -144,7 +134,6 @@ impl Recorder for TermFrequencyRecorder {
            stack: ExpUnrolledLinkedList::new(),
            current_doc: u32::max_value(),
            current_tf: 0u32,
-           term_doc_freq: 0u32,
        }
    }
@@ -153,7 +142,6 @@ impl Recorder for TermFrequencyRecorder {
    }
 
    fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
-       self.term_doc_freq += 1;
        self.current_doc = doc;
        let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
    }
@@ -184,10 +172,6 @@ impl Recorder for TermFrequencyRecorder {
        Ok(())
    }
-
-   fn term_doc_freq(&self) -> Option<u32> {
-       Some(self.term_doc_freq)
-   }
 }
 
 /// Recorder encoding term frequencies as well as positions.
@@ -195,14 +179,12 @@ impl Recorder for TermFrequencyRecorder {
 pub struct TFAndPositionRecorder {
    stack: ExpUnrolledLinkedList,
    current_doc: DocId,
-   term_doc_freq: u32,
 }
 
 impl Recorder for TFAndPositionRecorder {
    fn new() -> Self {
        TFAndPositionRecorder {
            stack: ExpUnrolledLinkedList::new(),
            current_doc: u32::max_value(),
-           term_doc_freq: 0u32,
        }
    }
@@ -212,7 +194,6 @@ impl Recorder for TFAndPositionRecorder {
    fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
        self.current_doc = doc;
-       self.term_doc_freq += 1u32;
        let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
    }
@@ -252,10 +233,6 @@ impl Recorder for TFAndPositionRecorder {
        }
        Ok(())
    }
-
-   fn term_doc_freq(&self) -> Option<u32> {
-       Some(self.term_doc_freq)
-   }
 }
 
 #[cfg(test)]

View File

@@ -1,70 +1,81 @@
use crate::common::BitSet;
use crate::common::HasLen; use crate::common::HasLen;
use crate::common::{BinarySerializable, VInt};
use crate::docset::DocSet; use crate::docset::{DocSet, SkipResult};
use crate::positions::PositionReader; use crate::positions::PositionReader;
use crate::postings::compression::{compressed_block_size, AlignedBuffer};
use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::serializer::PostingsSerializer; use crate::postings::serializer::PostingsSerializer;
use crate::postings::BlockSearcher; use crate::postings::BlockSearcher;
use crate::postings::FreqReadingOption;
use crate::postings::Postings; use crate::postings::Postings;
use crate::postings::SkipReader;
use crate::postings::USE_SKIP_INFO_LIMIT;
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::{DocId, TERMINATED}; use crate::DocId;
use owned_read::OwnedRead;
use std::cmp::Ordering;
use tantivy_fst::Streamer;
use crate::directory::ReadOnlySource; struct PositionComputer {
use crate::fastfield::DeleteBitSet; // store the amount of position int
use crate::postings::BlockSegmentPostings; // before reading positions.
//
// if none, position are already loaded in
// the positions vec.
position_to_skip: usize,
position_reader: PositionReader,
}
impl PositionComputer {
pub fn new(position_reader: PositionReader) -> PositionComputer {
PositionComputer {
position_to_skip: 0,
position_reader,
}
}
pub fn add_skip(&mut self, num_skip: usize) {
self.position_to_skip += num_skip;
}
// Positions can only be read once.
pub fn positions_with_offset(&mut self, offset: u32, output: &mut [u32]) {
self.position_reader.skip(self.position_to_skip);
self.position_to_skip = 0;
self.position_reader.read(output);
let mut cum = offset;
for output_mut in output.iter_mut() {
cum += *output_mut;
*output_mut = cum;
}
}
}
/// `SegmentPostings` represents the inverted list or postings associated to /// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`. /// a term in a `Segment`.
/// ///
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
/// Positions on the other hand, are optionally entirely decoded upfront. /// Positions on the other hand, are optionally entirely decoded upfront.
#[derive(Clone)]
pub struct SegmentPostings { pub struct SegmentPostings {
pub(crate) block_cursor: BlockSegmentPostings, block_cursor: BlockSegmentPostings,
cur: usize, cur: usize,
position_reader: Option<PositionReader>, position_computer: Option<PositionComputer>,
block_searcher: BlockSearcher, block_searcher: BlockSearcher,
} }
impl SegmentPostings { impl SegmentPostings {
/// Returns an empty segment postings object /// Returns an empty segment postings object
pub fn empty() -> Self { pub fn empty() -> Self {
let empty_block_cursor = BlockSegmentPostings::empty();
SegmentPostings { SegmentPostings {
block_cursor: BlockSegmentPostings::empty(), block_cursor: empty_block_cursor,
cur: 0, cur: COMPRESSION_BLOCK_SIZE,
position_reader: None, position_computer: None,
block_searcher: BlockSearcher::default(), block_searcher: BlockSearcher::default(),
} }
} }
/// Compute the number of non-deleted documents.
///
/// This method will clone and scan through the posting lists.
/// (this is a rather expensive operation).
pub fn doc_freq_given_deletes(&self, delete_bitset: &DeleteBitSet) -> u32 {
let mut docset = self.clone();
let mut doc_freq = 0;
loop {
let doc = docset.doc();
if doc == TERMINATED {
return doc_freq;
}
if delete_bitset.is_alive(doc) {
doc_freq += 1u32;
}
docset.advance();
}
}
/// Returns the overall number of documents in the block postings.
/// It does not take in account whether documents are deleted or not.
pub fn doc_freq(&self) -> u32 {
self.block_cursor.doc_freq()
}
/// Creates a segment postings object with the given documents /// Creates a segment postings object with the given documents
/// and no frequency encoded. /// and no frequency encoded.
/// ///
@@ -76,9 +87,7 @@ impl SegmentPostings {
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings { pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
{ {
let mut postings_serializer = let mut postings_serializer = PostingsSerializer::new(&mut buffer, false, false);
PostingsSerializer::new(&mut buffer, 0.0, false, false, None);
postings_serializer.new_term(docs.len() as u32);
for &doc in docs { for &doc in docs {
postings_serializer.write_doc(doc, 1u32); postings_serializer.write_doc(doc, 1u32);
} }
@@ -88,58 +97,15 @@ impl SegmentPostings {
} }
let block_segment_postings = BlockSegmentPostings::from_data( let block_segment_postings = BlockSegmentPostings::from_data(
docs.len() as u32, docs.len() as u32,
ReadOnlySource::from(buffer), OwnedRead::new(buffer),
IndexRecordOption::Basic, IndexRecordOption::Basic,
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
SegmentPostings::from_block_postings(block_segment_postings, None) SegmentPostings::from_block_postings(block_segment_postings, None)
} }
}
/// Helper functions to create `SegmentPostings` for tests. impl SegmentPostings {
#[cfg(test)]
pub fn create_from_docs_and_tfs(
doc_and_tfs: &[(u32, u32)],
fieldnorms: Option<&[u32]>,
) -> SegmentPostings {
use crate::fieldnorm::FieldNormReader;
use crate::Score;
let mut buffer: Vec<u8> = Vec::new();
let fieldnorm_reader = fieldnorms.map(FieldNormReader::for_test);
let average_field_norm = fieldnorms
.map(|fieldnorms| {
if fieldnorms.len() == 0 {
return 0.0;
}
let total_num_tokens: u64 = fieldnorms
.iter()
.map(|&fieldnorm| fieldnorm as u64)
.sum::<u64>();
total_num_tokens as Score / fieldnorms.len() as Score
})
.unwrap_or(0.0);
let mut postings_serializer = PostingsSerializer::new(
&mut buffer,
average_field_norm,
true,
false,
fieldnorm_reader,
);
postings_serializer.new_term(doc_and_tfs.len() as u32);
for &(doc, tf) in doc_and_tfs {
postings_serializer.write_doc(doc, tf);
}
postings_serializer
.close_term(doc_and_tfs.len() as u32)
.unwrap();
let block_segment_postings = BlockSegmentPostings::from_data(
doc_and_tfs.len() as u32,
ReadOnlySource::from(buffer),
IndexRecordOption::WithFreqs,
IndexRecordOption::WithFreqs,
);
SegmentPostings::from_block_postings(block_segment_postings, None)
}
/// Reads a Segment postings from an &[u8] /// Reads a Segment postings from an &[u8]
/// ///
/// * `len` - number of documents in the posting list. /// * `len` - number of documents in the posting list.
@@ -148,12 +114,12 @@ impl SegmentPostings {
/// frequencies and/or positions /// frequencies and/or positions
pub(crate) fn from_block_postings( pub(crate) fn from_block_postings(
segment_block_postings: BlockSegmentPostings, segment_block_postings: BlockSegmentPostings,
position_reader: Option<PositionReader>, positions_stream_opt: Option<PositionReader>,
) -> SegmentPostings { ) -> SegmentPostings {
SegmentPostings { SegmentPostings {
block_cursor: segment_block_postings, block_cursor: segment_block_postings,
cur: 0, // cursor within the block cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
position_reader, position_computer: positions_stream_opt.map(PositionComputer::new),
block_searcher: BlockSearcher::default(), block_searcher: BlockSearcher::default(),
} }
} }
@@ -163,60 +129,139 @@ impl DocSet for SegmentPostings {
// goes to the next element. // goes to the next element.
// next needs to be called a first time to point to the correct element. // next needs to be called a first time to point to the correct element.
#[inline] #[inline]
fn advance(&mut self) -> DocId { fn advance(&mut self) -> bool {
debug_assert!(self.block_cursor.block_is_loaded()); if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
if self.cur == COMPRESSION_BLOCK_SIZE - 1 { let term_freq = self.term_freq() as usize;
self.cur = 0; if let Some(position_computer) = self.position_computer.as_mut() {
self.block_cursor.advance(); position_computer.add_skip(term_freq);
} else { }
self.cur += 1;
} }
self.doc() self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
true
} }
fn seek(&mut self, target: DocId) -> DocId { fn skip_next(&mut self, target: DocId) -> SkipResult {
debug_assert!(self.doc() <= target); if !self.advance() {
if self.doc() >= target { return SkipResult::End;
return self.doc(); }
match self.doc().cmp(&target) {
Ordering::Equal => {
return SkipResult::Reached;
}
Ordering::Greater => {
return SkipResult::OverStep;
}
_ => {
// ...
}
} }
self.block_cursor.seek(target); // In the following, thanks to the call to advance above,
// we know that the positions are not loaded and we need
// to skip every doc_freq we cross.
// At this point we are on the block that might contain our document. // skip blocks until one that might contain the target
let output = self.block_cursor.docs_aligned(); // check if we need to go to the next block
self.cur = self.block_searcher.search_in_block(&output, target); let mut sum_freqs_skipped: u32 = 0;
if !self
.block_cursor
.docs()
.last()
.map(|doc| *doc >= target)
.unwrap_or(false)
// there should always be at least one document in the block
// since advance returned.
{
// we are not in the right block.
//
// First compute all of the freqs skipped from the current block.
if self.position_computer.is_some() {
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
match self.block_cursor.skip_to(target) {
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
sum_freqs_skipped += block_skip_freqs;
}
BlockSegmentPostingsSkipResult::Terminated => {
return SkipResult::End;
}
}
} else if self.block_cursor.skip_to(target)
== BlockSegmentPostingsSkipResult::Terminated
{
// no positions needed. no need to sum freqs.
return SkipResult::End;
}
self.cur = 0;
}
// The last block, if not full, is padded with the value TERMINATED, let cur = self.cur;
// so that we are guaranteed to have at least one doc in the block (a real one or the padding)
// that is greater or equal to the target. // we're in the right block now, start with an exponential search
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE); let (output, len) = self.block_cursor.docs_aligned();
let new_cur = self
.block_searcher
.search_in_block(&output, len, cur, target);
if let Some(position_computer) = self.position_computer.as_mut() {
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
position_computer.add_skip(sum_freqs_skipped as usize);
}
self.cur = new_cur;
// `doc` is now the first element >= `target` // `doc` is now the first element >= `target`
let doc = output.0[new_cur];
// If all docs are smaller than the target, the current block should be incomplete and padded
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first occurrence of TERMINATED.
let doc = output.0[self.cur];
debug_assert!(doc >= target); debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc()); if doc == target {
doc SkipResult::Reached
} else {
SkipResult::OverStep
}
} }
/// Return the current document's `DocId`. /// Return the current document's `DocId`.
///
/// # Panics
///
/// Panics if called before `advance` has been called at least once.
#[inline] #[inline]
fn doc(&self) -> DocId { fn doc(&self) -> DocId {
self.block_cursor.doc(self.cur) let docs = self.block_cursor.docs();
debug_assert!(
self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
);
docs[self.cur]
} }
fn size_hint(&self) -> u32 { fn size_hint(&self) -> u32 {
self.len() as u32 self.len() as u32
} }
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
// finish the current block
if self.advance() {
for &doc in &self.block_cursor.docs()[self.cur..] {
bitset.insert(doc);
}
// ... iterate through the remaining blocks.
while self.block_cursor.advance() {
for &doc in self.block_cursor.docs() {
bitset.insert(doc);
}
}
}
}
} }
impl HasLen for SegmentPostings { impl HasLen for SegmentPostings {
fn len(&self) -> usize { fn len(&self) -> usize {
self.block_cursor.doc_freq() as usize self.block_cursor.doc_freq()
} }
} }
@@ -245,63 +290,515 @@ impl Postings for SegmentPostings {
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) { fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
let term_freq = self.term_freq() as usize; let term_freq = self.term_freq() as usize;
if let Some(position_reader) = self.position_reader.as_mut() { if let Some(position_comp) = self.position_computer.as_mut() {
let read_offset = self.block_cursor.position_offset()
+ (self.block_cursor.freqs()[..self.cur]
.iter()
.cloned()
.sum::<u32>() as u64);
output.resize(term_freq, 0u32); output.resize(term_freq, 0u32);
position_reader.read(read_offset, &mut output[..]); position_comp.positions_with_offset(offset, &mut output[..]);
let mut cum = offset;
for output_mut in output.iter_mut() {
cum += *output_mut;
*output_mut = cum;
}
} else { } else {
output.clear(); output.clear();
} }
} }
} }
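As a usage sketch (assuming `Postings` and `SegmentPostings` are publicly re-exported under `tantivy::postings`), reading the positions of the current document looks like this; the buffer is reused across documents to avoid allocations:

```rust
use tantivy::postings::{Postings, SegmentPostings};

// Fill `output` with the absolute positions of the current document.
// A non-zero `offset` would shift every position; if positions were not
// indexed for this field, `output` is simply cleared.
fn current_doc_positions(postings: &mut SegmentPostings, output: &mut Vec<u32>) {
    postings.positions_with_offset(0u32, output);
}
```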
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings {
doc_decoder: BlockDecoder,
freq_decoder: BlockDecoder,
freq_reading_option: FreqReadingOption,
doc_freq: usize,
doc_offset: DocId,
num_vint_docs: usize,
remaining_data: OwnedRead,
skip_reader: SkipReader,
}
fn split_into_skips_and_postings(
doc_freq: u32,
mut data: OwnedRead,
) -> (Option<OwnedRead>, OwnedRead) {
if doc_freq >= USE_SKIP_INFO_LIMIT {
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
let mut postings_data = data.clone();
postings_data.advance(skip_len);
data.clip(skip_len);
(Some(data), postings_data)
} else {
(None, data)
}
}
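From `split_into_skips_and_postings`, the per-term postings bytes follow the layout sketched below (our reconstruction, not an authoritative format spec):

```rust
// Layout of a term's postings data when doc_freq >= USE_SKIP_INFO_LIMIT:
//
//   +----------------+------------------------+---------------------------+
//   | VInt(skip_len) | skip data (skip_len B) | bitpacked + VInt doc data |
//   +----------------+------------------------+---------------------------+
//
// When doc_freq < USE_SKIP_INFO_LIMIT, the VInt header and the skip
// section are omitted and the bytes start directly with the doc data.
```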
#[derive(Debug, Eq, PartialEq)]
pub enum BlockSegmentPostingsSkipResult {
Terminated,
Success(u32), //< number of term freqs to skip
}
impl BlockSegmentPostings {
pub(crate) fn from_data(
doc_freq: u32,
data: OwnedRead,
record_option: IndexRecordOption,
requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
let freq_reading_option = match (record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq,
};
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, record_option),
None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
};
let doc_freq = doc_freq as usize;
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
BlockSegmentPostings {
num_vint_docs,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option,
doc_offset: 0,
doc_freq,
remaining_data: postings_data,
skip_reader,
}
}
// Resets the block segment postings to another position
// in the postings file.
//
// This is useful for enumerating through a list of terms,
// and consuming the associated posting lists while avoiding
// reallocating a `BlockSegmentPostings`.
//
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedRead) {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
let num_vint_docs = (doc_freq as usize) & (COMPRESSION_BLOCK_SIZE - 1);
self.num_vint_docs = num_vint_docs;
self.remaining_data = postings_data;
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data);
} else {
self.skip_reader.reset(OwnedRead::new(&[][..]))
}
self.doc_offset = 0;
self.doc_freq = doc_freq as usize;
}
/// Returns the document frequency associated with this block postings object.
///
/// This `doc_freq` is simply the sum of the lengths of all of the blocks,
/// and it does not take into account deleted documents.
pub fn doc_freq(&self) -> usize {
self.doc_freq
}
/// Returns the array of docs in the current block.
///
/// Before the first call to `.advance()`, the block
/// returned by `.docs()` is empty.
#[inline]
pub fn docs(&self) -> &[DocId] {
self.doc_decoder.output_array()
}
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
self.doc_decoder.output_aligned()
}
/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
self.doc_decoder.output(idx)
}
/// Return the array of `term freq` in the block.
#[inline]
pub fn freqs(&self) -> &[u32] {
self.freq_decoder.output_array()
}
/// Return the frequency at index `idx` of the block.
#[inline]
pub fn freq(&self, idx: usize) -> u32 {
self.freq_decoder.output(idx)
}
/// Returns the length of the current block.
///
/// All blocks have a length of `NUM_DOCS_PER_BLOCK`,
/// except the last block, which may have a length
/// anywhere between 1 and `NUM_DOCS_PER_BLOCK - 1`.
#[inline]
fn block_len(&self) -> usize {
self.doc_decoder.output_len
}
/// Positions the cursor on a block that may contain `doc_id`.
/// Always advances the current block.
///
/// Returns true if a block that has an element greater or equal to the target is found.
/// Returning true does not guarantee that the smallest element of the block is smaller
/// than the target. It only guarantees that the last element is greater or equal.
///
/// Returns false iff all of the remaining documents are smaller than
/// `doc_id`. In that case, all of these documents are consumed.
///
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
let mut skip_freqs = 0u32;
while self.skip_reader.advance() {
if self.skip_reader.doc() >= target_doc {
// the last document of the current block is larger
// than the target.
//
// We found our block!
let num_bits = self.skip_reader.doc_num_bits();
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
num_bits,
);
self.remaining_data.advance(num_consumed_bytes);
let tf_num_bits = self.skip_reader.tf_num_bits();
match self.freq_reading_option {
FreqReadingOption::NoFreq => {}
FreqReadingOption::SkipFreq => {
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self
.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
self.remaining_data.advance(num_consumed_bytes);
}
}
self.doc_offset = self.skip_reader.doc();
return BlockSegmentPostingsSkipResult::Success(skip_freqs);
} else {
skip_freqs += self.skip_reader.tf_sum();
let advance_len = self.skip_reader.total_block_len();
self.doc_offset = self.skip_reader.doc();
self.remaining_data.advance(advance_len);
}
}
// we are now on the last, incomplete, variable-length encoded block.
if self.num_vint_docs > 0 {
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
match self.freq_reading_option {
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
FreqReadingOption::ReadFreq => {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
}
}
self.num_vint_docs = 0;
return self
.docs()
.last()
.map(|last_doc| {
if *last_doc >= target_doc {
BlockSegmentPostingsSkipResult::Success(skip_freqs)
} else {
BlockSegmentPostingsSkipResult::Terminated
}
})
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
}
BlockSegmentPostingsSkipResult::Terminated
}
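A hedged usage sketch of `skip_to`, mirroring `test_block_segment_postings_skip2` further down: position the cursor on the candidate block, then scan its decoded docs.

```rust
// On Success, the current block's last doc is >= target, so a linear scan
// of the decoded block finds the first doc at or above the target.
// On Terminated, every remaining doc was smaller and the list is consumed.
fn find_first_at_or_above(
    block_postings: &mut BlockSegmentPostings,
    target: DocId,
) -> Option<DocId> {
    match block_postings.skip_to(target) {
        BlockSegmentPostingsSkipResult::Success(_skipped_tfs) => {
            block_postings.docs().iter().cloned().find(|&doc| doc >= target)
        }
        BlockSegmentPostingsSkipResult::Terminated => None,
    }
}
```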
/// Advance to the next block.
///
/// Returns false iff there are no remaining blocks.
pub fn advance(&mut self) -> bool {
if self.skip_reader.advance() {
let num_bits = self.skip_reader.doc_num_bits();
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
num_bits,
);
self.remaining_data.advance(num_consumed_bytes);
let tf_num_bits = self.skip_reader.tf_num_bits();
match self.freq_reading_option {
FreqReadingOption::NoFreq => {}
FreqReadingOption::SkipFreq => {
let num_bytes_to_skip = compressed_block_size(tf_num_bits);
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self
.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
self.remaining_data.advance(num_consumed_bytes);
}
}
// it will be used as the next offset.
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
true
} else if self.num_vint_docs > 0 {
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
match self.freq_reading_option {
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
FreqReadingOption::ReadFreq => {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
}
}
self.num_vint_docs = 0;
true
} else {
false
}
}
/// Returns an empty `BlockSegmentPostings` object.
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
num_vint_docs: 0,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,
doc_offset: 0,
doc_freq: 0,
remaining_data: OwnedRead::new(vec![]),
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
}
}
}
impl<'b> Streamer<'b> for BlockSegmentPostings {
type Item = &'b [DocId];
fn next(&'b mut self) -> Option<&'b [DocId]> {
if self.advance() {
Some(self.docs())
} else {
None
}
}
}
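Thanks to this `Streamer` impl, consuming a posting list block by block is a simple `while let` loop (same pattern as `test_block_segment_postings` below):

```rust
use tantivy_fst::Streamer;

// Count the documents of a posting list one decoded block at a time.
fn count_docs(mut block_postings: BlockSegmentPostings) -> usize {
    let mut count = 0;
    while let Some(block) = block_postings.next() {
        count += block.len();
    }
    count
}
```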
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
use super::SegmentPostings; use super::SegmentPostings;
use crate::common::HasLen; use crate::common::HasLen;
use crate::core::Index;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::DocSet;
use crate::fastfield::DeleteBitSet;
use crate::postings::postings::Postings; use crate::postings::postings::Postings;
use crate::schema::IndexRecordOption;
use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::INDEXED;
use crate::DocId;
use crate::SkipResult;
use tantivy_fst::Streamer;
#[test] #[test]
fn test_empty_segment_postings() { fn test_empty_segment_postings() {
let mut postings = SegmentPostings::empty(); let mut postings = SegmentPostings::empty();
assert_eq!(postings.advance(), TERMINATED); assert!(!postings.advance());
assert_eq!(postings.advance(), TERMINATED); assert!(!postings.advance());
assert_eq!(postings.len(), 0); assert_eq!(postings.len(), 0);
} }
#[test] #[test]
fn test_empty_postings_doc_returns_terminated() { #[should_panic(expected = "Have you forgotten to call `.advance()`")]
let mut postings = SegmentPostings::empty(); fn test_panic_if_doc_called_before_advance() {
assert_eq!(postings.doc(), TERMINATED); SegmentPostings::empty().doc();
assert_eq!(postings.advance(), TERMINATED);
} }
#[test] #[test]
fn test_empty_postings_doc_term_freq_returns_0() { #[should_panic(expected = "Have you forgotten to call `.advance()`")]
let postings = SegmentPostings::empty(); fn test_panic_if_freq_called_before_advance() {
assert_eq!(postings.term_freq(), 1); SegmentPostings::empty().term_freq();
} }
#[test] #[test]
fn test_doc_freq() { fn test_empty_block_segment_postings() {
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); let mut postings = BlockSegmentPostings::empty();
assert_eq!(docs.doc_freq(), 3); assert!(!postings.advance());
let delete_bitset = DeleteBitSet::for_test(&[2], 12); assert_eq!(postings.doc_freq(), 0);
assert_eq!(docs.doc_freq_given_deletes(&delete_bitset), 2); }
let all_deleted = DeleteBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0); #[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
// checking that the `doc_freq` is correct
assert_eq!(block_segments.doc_freq(), 100_000);
while let Some(block) = block_segments.next() {
for (i, doc) in block.iter().cloned().enumerate() {
assert_eq!(offset + (i as u32), doc);
}
offset += block.len() as u32;
}
}
#[test]
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
index_writer.add_document(doc!(int_field=>0u64));
last_doc = doc + 1;
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
}
#[test]
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(&[3]);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(&[3]);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
);
}
#[test]
fn test_block_segment_postings_skip2() {
let mut docs = vec![0];
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
);
let docs = block_postings.docs();
assert!(docs[0] <= i);
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
}
assert_eq!(
block_postings.skip_to(100_000),
BlockSegmentPostingsSkipResult::Terminated
);
assert_eq!(
block_postings.skip_to(101_000),
BlockSegmentPostingsSkipResult::Terminated
);
}
#[test]
fn test_reset_block_segment_postings() {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// create two postings lists, one containing even numbers,
// the other containing odd numbers.
for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64);
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let mut block_segments;
{
let term = Term::from_field_u64(int_field, 0u64);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
block_segments = inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
}
assert!(block_segments.advance());
assert_eq!(block_segments.docs(), &[0, 2, 4]);
{
let term = Term::from_field_u64(int_field, 1u64);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
}
assert!(block_segments.advance());
assert_eq!(block_segments.docs(), &[1, 3, 5]);
} }
} }


@@ -3,16 +3,14 @@ use crate::common::{BinarySerializable, VInt};
use crate::common::{CompositeWrite, CountingWriter}; use crate::common::{CompositeWrite, CountingWriter};
use crate::core::Segment; use crate::core::Segment;
use crate::directory::WritePtr; use crate::directory::WritePtr;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionSerializer; use crate::positions::PositionSerializer;
use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE}; use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::skip::SkipSerializer; use crate::postings::skip::SkipSerializer;
use crate::query::BM25Weight; use crate::postings::USE_SKIP_INFO_LIMIT;
use crate::schema::Schema; use crate::schema::Schema;
use crate::schema::{Field, FieldEntry, FieldType}; use crate::schema::{Field, FieldEntry, FieldType};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal}; use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{DocId, Score}; use crate::DocId;
use std::cmp::Ordering;
use std::io::{self, Write}; use std::io::{self, Write};
/// `InvertedIndexSerializer` is in charge of serializing /// `InvertedIndexSerializer` is in charge of serializing
@@ -92,22 +90,20 @@ impl InvertedIndexSerializer {
&mut self, &mut self,
field: Field, field: Field,
total_num_tokens: u64, total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'_>> { ) -> io::Result<FieldSerializer<'_>> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field); let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field); let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field); let postings_write = self.postings_write.for_field(field);
total_num_tokens.serialize(postings_write)?;
let positions_write = self.positions_write.for_field(field); let positions_write = self.positions_write.for_field(field);
let positionsidx_write = self.positionsidx_write.for_field(field); let positionsidx_write = self.positionsidx_write.for_field(field);
let field_type: FieldType = (*field_entry.field_type()).clone(); let field_type: FieldType = (*field_entry.field_type()).clone();
FieldSerializer::create( FieldSerializer::create(
&field_type, &field_type,
total_num_tokens,
term_dictionary_write, term_dictionary_write,
postings_write, postings_write,
positions_write, positions_write,
positionsidx_write, positionsidx_write,
fieldnorm_reader,
) )
} }
@@ -135,14 +131,11 @@ pub struct FieldSerializer<'a> {
impl<'a> FieldSerializer<'a> { impl<'a> FieldSerializer<'a> {
fn create( fn create(
field_type: &FieldType, field_type: &FieldType,
total_num_tokens: u64,
term_dictionary_write: &'a mut CountingWriter<WritePtr>, term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>, postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>, positions_write: &'a mut CountingWriter<WritePtr>,
positionsidx_write: &'a mut CountingWriter<WritePtr>, positionsidx_write: &'a mut CountingWriter<WritePtr>,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'a>> { ) -> io::Result<FieldSerializer<'a>> {
total_num_tokens.serialize(postings_write)?;
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type { let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => { FieldType::Str(ref text_options) => {
if let Some(text_indexing_options) = text_options.get_indexing_options() { if let Some(text_indexing_options) = text_options.get_indexing_options() {
@@ -155,17 +148,8 @@ impl<'a> FieldSerializer<'a> {
_ => (false, false), _ => (false, false),
}; };
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?; let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
let average_fieldnorm = fieldnorm_reader let postings_serializer =
.as_ref() PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
.map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score))
.unwrap_or(0.0);
let postings_serializer = PostingsSerializer::new(
postings_write,
average_fieldnorm,
term_freq_enabled,
position_enabled,
fieldnorm_reader,
);
let positions_serializer_opt = if position_enabled { let positions_serializer_opt = if position_enabled {
Some(PositionSerializer::new(positions_write, positionsidx_write)) Some(PositionSerializer::new(positions_write, positionsidx_write))
} else { } else {
@@ -198,20 +182,18 @@ impl<'a> FieldSerializer<'a> {
/// Starts the postings for a new term. /// Starts the postings for a new term.
/// * term - the term. It needs to come after the previous term according /// * term - the term. It needs to come after the previous term according
/// to the lexicographical order. /// to the lexicographical order.
/// * term_doc_freq - the number of documents containing the term. /// * doc_freq - the number of documents containing the term.
pub fn new_term(&mut self, term: &[u8], term_doc_freq: u32) -> io::Result<TermOrdinal> { pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
assert!( assert!(
!self.term_open, !self.term_open,
"Called new_term, while the previous term was not closed." "Called new_term, while the previous term was not closed."
); );
self.term_open = true; self.term_open = true;
self.postings_serializer.clear(); self.postings_serializer.clear();
self.current_term_info = self.current_term_info(); self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)?; self.term_dictionary_builder.insert_key(term)?;
let term_ordinal = self.num_terms; let term_ordinal = self.num_terms;
self.num_terms += 1; self.num_terms += 1;
self.postings_serializer.new_term(term_doc_freq);
Ok(term_ordinal) Ok(term_ordinal)
} }
@@ -325,27 +307,14 @@ pub struct PostingsSerializer<W: Write> {
termfreq_enabled: bool, termfreq_enabled: bool,
termfreq_sum_enabled: bool, termfreq_sum_enabled: bool,
fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<BM25Weight>,
num_docs: u32, // Number of docs in the segment
avg_fieldnorm: Score, // Average number of term in the field for that segment.
// this value is used to compute the block wand information.
} }
impl<W: Write> PostingsSerializer<W> { impl<W: Write> PostingsSerializer<W> {
pub fn new( pub fn new(
write: W, write: W,
avg_fieldnorm: Score,
termfreq_enabled: bool, termfreq_enabled: bool,
termfreq_sum_enabled: bool, termfreq_sum_enabled: bool,
fieldnorm_reader: Option<FieldNormReader>,
) -> PostingsSerializer<W> { ) -> PostingsSerializer<W> {
let num_docs = fieldnorm_reader
.as_ref()
.map(|fieldnorm_reader| fieldnorm_reader.num_docs())
.unwrap_or(0u32);
PostingsSerializer { PostingsSerializer {
output_write: CountingWriter::wrap(write), output_write: CountingWriter::wrap(write),
@@ -358,23 +327,6 @@ impl<W: Write> PostingsSerializer<W> {
last_doc_id_encoded: 0u32, last_doc_id_encoded: 0u32,
termfreq_enabled, termfreq_enabled,
termfreq_sum_enabled, termfreq_sum_enabled,
fieldnorm_reader,
bm25_weight: None,
num_docs,
avg_fieldnorm,
}
}
pub fn new_term(&mut self, term_doc_freq: u32) {
if self.termfreq_enabled && self.num_docs > 0 {
let bm25_weight = BM25Weight::for_one_term(
term_doc_freq as u64,
self.num_docs as u64,
self.avg_fieldnorm,
);
self.bm25_weight = Some(bm25_weight);
} }
} }
@@ -391,6 +343,7 @@ impl<W: Write> PostingsSerializer<W> {
self.postings_write.extend(block_encoded); self.postings_write.extend(block_encoded);
} }
if self.termfreq_enabled { if self.termfreq_enabled {
// encode the term_freqs
let (num_bits, block_encoded): (u8, &[u8]) = self let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder .block_encoder
.compress_block_unsorted(&self.block.term_freqs()); .compress_block_unsorted(&self.block.term_freqs());
@@ -400,31 +353,6 @@ impl<W: Write> PostingsSerializer<W> {
let sum_freq = self.block.term_freqs().iter().cloned().sum(); let sum_freq = self.block.term_freqs().iter().cloned().sum();
self.skip_write.write_total_term_freq(sum_freq); self.skip_write.write_total_term_freq(sum_freq);
} }
let mut blockwand_params = (0u8, 0u32);
if let Some(bm25_weight) = self.bm25_weight.as_ref() {
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
let docs = self.block.doc_ids().iter().cloned();
let term_freqs = self.block.term_freqs().iter().cloned();
let fieldnorms = docs.map(|doc| fieldnorm_reader.fieldnorm_id(doc));
blockwand_params = fieldnorms
.zip(term_freqs)
.max_by(
|(left_fieldnorm_id, left_term_freq),
(right_fieldnorm_id, right_term_freq)| {
let left_score =
bm25_weight.tf_factor(*left_fieldnorm_id, *left_term_freq);
let right_score =
bm25_weight.tf_factor(*right_fieldnorm_id, *right_term_freq);
left_score
.partial_cmp(&right_score)
.unwrap_or(Ordering::Equal)
},
)
.unwrap();
}
}
let (fieldnorm_id, term_freq) = blockwand_params;
self.skip_write.write_blockwand_max(fieldnorm_id, term_freq);
} }
self.block.clear(); self.block.clear();
} }
@@ -463,7 +391,7 @@ impl<W: Write> PostingsSerializer<W> {
} }
self.block.clear(); self.block.clear();
} }
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { if doc_freq >= USE_SKIP_INFO_LIMIT {
let skip_data = self.skip_write.data(); let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?; VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
self.output_write.write_all(skip_data)?; self.output_write.write_all(skip_data)?;
@@ -473,7 +401,6 @@ impl<W: Write> PostingsSerializer<W> {
} }
self.skip_write.clear(); self.skip_write.clear();
self.postings_write.clear(); self.postings_write.clear();
self.bm25_weight = None;
Ok(()) Ok(())
} }
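Putting the serializer together, the write path for a single term is `new_term` → `write_doc`* → `close_term`, as `create_from_docs` does earlier in this diff. A sketch against the left-hand (newer) signature; the argument values are placeholders taken from that helper:

```rust
// Serialize a tiny posting list (no freqs, no positions) into `buffer`.
// `0.0` (average fieldnorm) and `None` (fieldnorm reader) mirror the
// arguments used by create_from_docs.
let mut buffer: Vec<u8> = Vec::new();
{
    let mut serializer = PostingsSerializer::new(&mut buffer, 0.0, false, false, None);
    serializer.new_term(3u32); // doc_freq of the term
    for &doc in &[2u32, 17, 32] {
        serializer.write_doc(doc, 1u32); // (doc id, term frequency)
    }
    serializer.close_term(3u32).expect("writing to a Vec cannot fail");
}
```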


@@ -1,9 +1,7 @@
use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable, VInt}; use crate::common::BinarySerializable;
use crate::directory::ReadOnlySource; use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
use crate::query::BM25Weight;
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED}; use crate::DocId;
use owned_read::OwnedRead; use owned_read::OwnedRead;
pub struct SkipSerializer { pub struct SkipSerializer {
@@ -41,13 +39,6 @@ impl SkipSerializer {
.expect("Should never fail"); .expect("Should never fail");
} }
pub fn write_blockwand_max(&mut self, fieldnorm_id: u8, term_freq: u32) {
self.buffer.push(fieldnorm_id);
let mut buf = [0u8; 8];
let bytes = serialize_vint_u32(term_freq, &mut buf);
self.buffer.extend_from_slice(bytes);
}
pub fn data(&self) -> &[u8] { pub fn data(&self) -> &[u8] {
&self.buffer[..] &self.buffer[..]
} }
@@ -58,200 +49,81 @@ impl SkipSerializer {
} }
} }
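For reference, producing skip data follows the pattern used by the tests below: one `write_doc` per 128-document block (plus one `write_term_freq` when term frequencies are enabled). A short sketch:

```rust
// Build skip entries for two blocks. The left-hand (newer) format would
// additionally record block-WAND parameters via write_blockwand_max.
let mut skip_serializer = SkipSerializer::new();
skip_serializer.write_doc(1u32, 2u8); // last doc of block 0, doc_num_bits
skip_serializer.write_term_freq(3u8); // tf_num_bits of block 0
skip_serializer.write_doc(5u32, 5u8); // last doc of block 1, doc_num_bits
skip_serializer.write_term_freq(2u8); // tf_num_bits of block 1
let skip_bytes: Vec<u8> = skip_serializer.data().to_owned();
```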
#[derive(Clone)]
pub(crate) struct SkipReader { pub(crate) struct SkipReader {
last_doc_in_block: DocId, doc: DocId,
pub(crate) last_doc_in_previous_block: DocId,
owned_read: OwnedRead, owned_read: OwnedRead,
doc_num_bits: u8,
tf_num_bits: u8,
tf_sum: u32,
skip_info: IndexRecordOption, skip_info: IndexRecordOption,
byte_offset: usize,
remaining_docs: u32, // number of docs remaining, including the
// documents in the current block.
block_info: BlockInfo,
position_offset: u64,
}
#[derive(Clone, Eq, PartialEq, Copy, Debug)]
pub(crate) enum BlockInfo {
BitPacked {
doc_num_bits: u8,
tf_num_bits: u8,
tf_sum: u32,
block_wand_fieldnorm_id: u8,
block_wand_term_freq: u32,
},
VInt {
num_docs: u32,
},
}
impl Default for BlockInfo {
fn default() -> Self {
BlockInfo::VInt { num_docs: 0u32 }
}
} }
impl SkipReader { impl SkipReader {
pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader { pub fn new(data: OwnedRead, skip_info: IndexRecordOption) -> SkipReader {
let mut skip_reader = SkipReader { SkipReader {
last_doc_in_block: if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { doc: 0u32,
0 owned_read: data,
} else {
TERMINATED
},
last_doc_in_previous_block: 0u32,
owned_read: OwnedRead::new(data),
skip_info, skip_info,
block_info: BlockInfo::VInt { num_docs: doc_freq }, doc_num_bits: 0u8,
byte_offset: 0, tf_num_bits: 0u8,
remaining_docs: doc_freq, tf_sum: 0u32,
position_offset: 0u64,
};
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
skip_reader.read_block_info();
}
skip_reader
}
pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) {
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
0
} else {
TERMINATED
};
self.last_doc_in_previous_block = 0u32;
self.owned_read = OwnedRead::new(data);
self.block_info = BlockInfo::VInt { num_docs: doc_freq };
self.byte_offset = 0;
self.remaining_docs = doc_freq;
self.position_offset = 0u64;
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info();
} }
} }
// Returns the block max score for this block if available. pub fn reset(&mut self, data: OwnedRead) {
// self.doc = 0u32;
// The block max score is available for all full bitpacked blocks, self.owned_read = data;
// but not available for the last VInt-encoded incomplete block. self.doc_num_bits = 0u8;
pub fn block_max_score(&self, bm25_weight: &BM25Weight) -> Option<Score> { self.tf_num_bits = 0u8;
match self.block_info { self.tf_sum = 0u32;
BlockInfo::BitPacked {
block_wand_fieldnorm_id,
block_wand_term_freq,
..
} => Some(bm25_weight.score(block_wand_fieldnorm_id, block_wand_term_freq)),
BlockInfo::VInt { .. } => None,
}
} }
pub(crate) fn last_doc_in_block(&self) -> DocId { pub fn total_block_len(&self) -> usize {
self.last_doc_in_block (self.doc_num_bits + self.tf_num_bits) as usize * COMPRESSION_BLOCK_SIZE / 8
} }
pub fn position_offset(&self) -> u64 { pub fn doc(&self) -> DocId {
self.position_offset self.doc
} }
pub fn byte_offset(&self) -> usize { pub fn doc_num_bits(&self) -> u8 {
self.byte_offset self.doc_num_bits
} }
fn read_block_info(&mut self) { /// Number of bits used to encode term frequencies
let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted");
self.last_doc_in_block += doc_delta as DocId;
let doc_num_bits = self.owned_read.get(0);
match self.skip_info {
IndexRecordOption::Basic => {
self.owned_read.advance(1);
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits: 0,
tf_sum: 0,
block_wand_fieldnorm_id: 0,
block_wand_term_freq: 0,
};
}
IndexRecordOption::WithFreqs => {
let tf_num_bits = self.owned_read.get(1);
let block_wand_fieldnorm_id = self.owned_read.get(2);
let data = &self.owned_read.as_ref()[3..];
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(data);
self.owned_read.advance(3 + num_bytes);
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
tf_sum: 0,
block_wand_fieldnorm_id,
block_wand_term_freq,
};
}
IndexRecordOption::WithFreqsAndPositions => {
let tf_num_bits = self.owned_read.get(1);
self.owned_read.advance(2);
let tf_sum = u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum");
let block_wand_fieldnorm_id = self.owned_read.get(0);
self.owned_read.advance(1);
let block_wand_term_freq =
VInt::deserialize_u64(&mut self.owned_read).unwrap() as u32;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
tf_sum,
block_wand_fieldnorm_id,
block_wand_term_freq,
};
}
}
}
pub fn block_info(&self) -> BlockInfo {
self.block_info
}
/// Advance the skip reader to the block that may contain the target.
/// ///
/// If the target is larger than all documents, the skip_reader /// 0 if term frequencies are not enabled.
/// then advances to the last VInt block. pub fn tf_num_bits(&self) -> u8 {
pub fn seek(&mut self, target: DocId) -> bool { self.tf_num_bits
if self.last_doc_in_block() >= target {
return false;
}
loop {
self.advance();
if self.last_doc_in_block() >= target {
return true;
}
}
} }
pub fn advance(&mut self) { pub fn tf_sum(&self) -> u32 {
match self.block_info { self.tf_sum
BlockInfo::BitPacked { }
doc_num_bits,
tf_num_bits, pub fn advance(&mut self) -> bool {
tf_sum, if self.owned_read.as_ref().is_empty() {
.. false
} => {
self.remaining_docs -= COMPRESSION_BLOCK_SIZE as u32;
self.byte_offset += compressed_block_size(doc_num_bits + tf_num_bits);
self.position_offset += tf_sum as u64;
}
BlockInfo::VInt { num_docs } => {
debug_assert_eq!(num_docs, self.remaining_docs);
self.remaining_docs = 0;
self.byte_offset = std::usize::MAX;
}
}
self.last_doc_in_previous_block = self.last_doc_in_block;
if self.remaining_docs >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info();
} else { } else {
self.last_doc_in_block = TERMINATED; let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted");
self.block_info = BlockInfo::VInt { self.doc += doc_delta as DocId;
num_docs: self.remaining_docs, self.doc_num_bits = self.owned_read.get(0);
}; match self.skip_info {
IndexRecordOption::Basic => {
self.owned_read.advance(1);
}
IndexRecordOption::WithFreqs => {
self.tf_num_bits = self.owned_read.get(1);
self.owned_read.advance(2);
}
IndexRecordOption::WithFreqsAndPositions => {
self.tf_num_bits = self.owned_read.get(1);
self.owned_read.advance(2);
self.tf_sum =
u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum");
}
}
true
} }
} }
} }
@@ -259,11 +131,9 @@ impl SkipReader {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::BlockInfo;
use super::IndexRecordOption; use super::IndexRecordOption;
use super::{SkipReader, SkipSerializer}; use super::{SkipReader, SkipSerializer};
use crate::directory::ReadOnlySource; use owned_read::OwnedRead;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
#[test] #[test]
fn test_skip_with_freq() { fn test_skip_with_freq() {
@@ -271,47 +141,20 @@ mod tests {
let mut skip_serializer = SkipSerializer::new(); let mut skip_serializer = SkipSerializer::new();
skip_serializer.write_doc(1u32, 2u8); skip_serializer.write_doc(1u32, 2u8);
skip_serializer.write_term_freq(3u8); skip_serializer.write_term_freq(3u8);
skip_serializer.write_blockwand_max(13u8, 3u32);
skip_serializer.write_doc(5u32, 5u8); skip_serializer.write_doc(5u32, 5u8);
skip_serializer.write_term_freq(2u8); skip_serializer.write_term_freq(2u8);
skip_serializer.write_blockwand_max(8u8, 2u32);
skip_serializer.data().to_owned() skip_serializer.data().to_owned()
}; };
let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32; let mut skip_reader = SkipReader::new(OwnedRead::new(buf), IndexRecordOption::WithFreqs);
let mut skip_reader = SkipReader::new( assert!(skip_reader.advance());
ReadOnlySource::new(buf), assert_eq!(skip_reader.doc(), 1u32);
doc_freq, assert_eq!(skip_reader.doc_num_bits(), 2u8);
IndexRecordOption::WithFreqs, assert_eq!(skip_reader.tf_num_bits(), 3u8);
); assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!(skip_reader.doc(), 5u32);
assert_eq!( assert_eq!(skip_reader.doc_num_bits(), 5u8);
skip_reader.block_info, assert_eq!(skip_reader.tf_num_bits(), 2u8);
BlockInfo::BitPacked { assert!(!skip_reader.advance());
doc_num_bits: 2u8,
tf_num_bits: 3u8,
tf_sum: 0,
block_wand_fieldnorm_id: 13,
block_wand_term_freq: 3
}
);
skip_reader.advance();
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!(
skip_reader.block_info(),
BlockInfo::BitPacked {
doc_num_bits: 5u8,
tf_num_bits: 2u8,
tf_sum: 0,
block_wand_fieldnorm_id: 8,
block_wand_term_freq: 2
}
);
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 3u32 });
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 0u32 });
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 0u32 });
} }
#[test] #[test]
@@ -322,68 +165,13 @@ mod tests {
skip_serializer.write_doc(5u32, 5u8); skip_serializer.write_doc(5u32, 5u8);
skip_serializer.data().to_owned() skip_serializer.data().to_owned()
}; };
let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32; let mut skip_reader = SkipReader::new(OwnedRead::new(buf), IndexRecordOption::Basic);
let mut skip_reader = SkipReader::new( assert!(skip_reader.advance());
ReadOnlySource::from(buf), assert_eq!(skip_reader.doc(), 1u32);
doc_freq, assert_eq!(skip_reader.doc_num_bits(), 2u8);
IndexRecordOption::Basic, assert!(skip_reader.advance());
); assert_eq!(skip_reader.doc(), 5u32);
assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!(skip_reader.doc_num_bits(), 5u8);
assert_eq!( assert!(!skip_reader.advance());
skip_reader.block_info(),
BlockInfo::BitPacked {
doc_num_bits: 2u8,
tf_num_bits: 0,
tf_sum: 0u32,
block_wand_fieldnorm_id: 0,
block_wand_term_freq: 0
}
);
skip_reader.advance();
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!(
skip_reader.block_info(),
BlockInfo::BitPacked {
doc_num_bits: 5u8,
tf_num_bits: 0,
tf_sum: 0u32,
block_wand_fieldnorm_id: 0,
block_wand_term_freq: 0
}
);
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 3u32 });
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 0u32 });
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 0u32 });
}
#[test]
fn test_skip_multiple_of_block_size() {
let buf = {
let mut skip_serializer = SkipSerializer::new();
skip_serializer.write_doc(1u32, 2u8);
skip_serializer.data().to_owned()
};
let doc_freq = COMPRESSION_BLOCK_SIZE as u32;
let mut skip_reader = SkipReader::new(
ReadOnlySource::from(buf),
doc_freq,
IndexRecordOption::Basic,
);
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
BlockInfo::BitPacked {
doc_num_bits: 2u8,
tf_num_bits: 0,
tf_sum: 0u32,
block_wand_fieldnorm_id: 0,
block_wand_term_freq: 0
}
);
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt { num_docs: 0u32 });
} }
} }


@@ -1,4 +1,6 @@
use murmurhash32::murmurhash2; use murmurhash32;
use self::murmurhash32::murmurhash2;
use super::{Addr, MemoryArena}; use super::{Addr, MemoryArena};
use crate::postings::stacker::memory_arena::store; use crate::postings::stacker::memory_arena::store;


@@ -1,7 +1,6 @@
use crate::core::Searcher; use crate::core::Searcher;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::DocSet;
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight}; use crate::query::{Explanation, Query, Scorer, Weight};
use crate::DocId; use crate::DocId;
@@ -9,7 +8,7 @@ use crate::Score;
/// Query that matches all of the documents. /// Query that matches all of the documents.
/// ///
/// All of the documents get the score 1.0. /// All of the documents get the score 1f32.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct AllQuery; pub struct AllQuery;
@@ -23,36 +22,55 @@ impl Query for AllQuery {
pub struct AllWeight; pub struct AllWeight;
impl Weight for AllWeight { impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader) -> crate::Result<Box<dyn Scorer>> {
let all_scorer = AllScorer { Ok(Box::new(AllScorer {
state: State::NotStarted,
doc: 0u32, doc: 0u32,
max_doc: reader.max_doc(), max_doc: reader.max_doc(),
}; }))
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
if doc >= reader.max_doc() { if doc >= reader.max_doc() {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
} }
Ok(Explanation::new("AllQuery", 1.0)) Ok(Explanation::new("AllQuery", 1f32))
} }
} }
enum State {
NotStarted,
Started,
Finished,
}
/// Scorer associated to the `AllQuery` query. /// Scorer associated to the `AllQuery` query.
pub struct AllScorer { pub struct AllScorer {
state: State,
doc: DocId, doc: DocId,
max_doc: DocId, max_doc: DocId,
} }
impl DocSet for AllScorer { impl DocSet for AllScorer {
fn advance(&mut self) -> DocId { fn advance(&mut self) -> bool {
if self.doc + 1 >= self.max_doc { match self.state {
self.doc = TERMINATED; State::NotStarted => {
return TERMINATED; self.state = State::Started;
self.doc = 0;
}
State::Started => {
self.doc += 1u32;
}
State::Finished => {
return false;
}
}
if self.doc < self.max_doc {
true
} else {
self.state = State::Finished;
false
} }
self.doc += 1;
self.doc
} }
fn doc(&self) -> DocId { fn doc(&self) -> DocId {
@@ -66,70 +84,49 @@ impl DocSet for AllScorer {
impl Scorer for AllScorer { impl Scorer for AllScorer {
fn score(&mut self) -> Score { fn score(&mut self) -> Score {
1.0 1f32
} }
} }
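As a usage sketch (assuming the standard `Count` collector under `tantivy::collector`), `AllQuery` pairs naturally with collectors that just need every document:

```rust
use tantivy::collector::Count;
use tantivy::query::AllQuery;
use tantivy::{Result, Searcher};

// Count every document in the index.
fn count_all(searcher: &Searcher) -> Result<usize> {
    searcher.search(&AllQuery, &Count)
}
```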
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::AllQuery; use super::AllQuery;
use crate::docset::TERMINATED;
use crate::query::Query; use crate::query::Query;
use crate::schema::{Schema, TEXT}; use crate::schema::{Schema, TEXT};
use crate::Index; use crate::Index;
fn create_test_index() -> Index { #[test]
fn test_all_query() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_text_field("text", TEXT); let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(field=>"aaa")); index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb")); index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index_writer.add_document(doc!(field=>"ccc")); index_writer.add_document(doc!(field=>"ccc"));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index
}
#[test]
fn test_all_query() {
let index = create_test_index();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let weight = AllQuery.weight(&searcher, false).unwrap(); let weight = AllQuery.weight(&searcher, false).unwrap();
{ {
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader, 1.0).unwrap(); let mut scorer = weight.scorer(reader).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.advance(), 1u32); assert!(scorer.advance());
assert_eq!(scorer.doc(), 1u32); assert_eq!(scorer.doc(), 1u32);
assert_eq!(scorer.advance(), TERMINATED); assert!(!scorer.advance());
} }
{ {
let reader = searcher.segment_reader(1); let reader = searcher.segment_reader(1);
let mut scorer = weight.scorer(reader, 1.0).unwrap(); let mut scorer = weight.scorer(reader).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32); assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.advance(), TERMINATED); assert!(!scorer.advance());
}
}
#[test]
fn test_all_query_with_boost() {
let index = create_test_index();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let weight = AllQuery.weight(&searcher, false).unwrap();
let reader = searcher.segment_reader(0);
{
let mut scorer = weight.scorer(reader, 2.0).unwrap();
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 2.0);
}
{
let mut scorer = weight.scorer(reader, 1.5).unwrap();
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.5);
} }
} }
} }


@@ -5,8 +5,9 @@ use crate::query::{BitSetDocSet, Explanation};
use crate::query::{Scorer, Weight}; use crate::query::{Scorer, Weight};
use crate::schema::{Field, IndexRecordOption}; use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer}; use crate::termdict::{TermDictionary, TermStreamer};
use crate::DocId;
use crate::TantivyError; use crate::TantivyError;
use crate::{DocId, Score}; use crate::{Result, SkipResult};
use std::sync::Arc; use std::sync::Arc;
use tantivy_fst::Automaton; use tantivy_fst::Automaton;
@@ -39,9 +40,10 @@ impl<A> Weight for AutomatonWeight<A>
where where
A: Automaton + Send + Sync + 'static, A: Automaton + Send + Sync + 'static,
{ {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader) -> Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc(); let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc); let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field); let inverted_index = reader.inverted_index(self.field);
let term_dict = inverted_index.terms(); let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict); let mut term_stream = self.automaton_stream(term_dict);
@@ -49,26 +51,20 @@ where
let term_info = term_stream.value(); let term_info = term_stream.value();
let mut block_segment_postings = inverted_index let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic); .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
loop { while block_segment_postings.advance() {
let docs = block_segment_postings.docs(); for &doc in block_segment_postings.docs() {
if docs.is_empty() {
break;
}
for &doc in docs {
doc_bitset.insert(doc); doc_bitset.insert(doc);
} }
block_segment_postings.advance();
} }
} }
let doc_bitset = BitSetDocSet::from(doc_bitset); let doc_bitset = BitSetDocSet::from(doc_bitset);
let const_scorer = ConstScorer::new(doc_bitset, boost); Ok(Box::new(ConstScorer::new(doc_bitset)))
Ok(Box::new(const_scorer))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?; let mut scorer = self.scorer(reader)?;
if scorer.seek(doc) == doc { if scorer.skip_next(doc) == SkipResult::Reached {
Ok(Explanation::new("AutomatonScorer", 1.0)) Ok(Explanation::new("AutomatonScorer", 1.0f32))
} else { } else {
Err(TantivyError::InvalidArgument( Err(TantivyError::InvalidArgument(
"Document does not exist".to_string(), "Document does not exist".to_string(),
@@ -76,94 +72,3 @@ where
} }
} }
} }
#[cfg(test)]
mod tests {
use super::AutomatonWeight;
use crate::docset::TERMINATED;
use crate::query::Weight;
use crate::schema::{Schema, STRING};
use crate::Index;
use tantivy_fst::Automaton;
fn create_index() -> Index {
let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title=>"abc"));
index_writer.add_document(doc!(title=>"bcd"));
index_writer.add_document(doc!(title=>"abcd"));
assert!(index_writer.commit().is_ok());
index
}
enum State {
Start,
NotMatching,
AfterA,
}
struct PrefixedByA;
impl Automaton for PrefixedByA {
type State = State;
fn start(&self) -> Self::State {
State::Start
}
fn is_match(&self, state: &Self::State) -> bool {
match *state {
State::AfterA => true,
_ => false,
}
}
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
match *state {
State::Start => {
if byte == b'a' {
State::AfterA
} else {
State::NotMatching
}
}
State::AfterA => State::AfterA,
State::NotMatching => State::NotMatching,
}
}
}
#[test]
fn test_automaton_weight() {
let index = create_index();
let field = index.schema().get_field("title").unwrap();
let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let mut scorer = automaton_weight
.scorer(searcher.segment_reader(0u32), 1.0)
.unwrap();
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.0);
assert_eq!(scorer.advance(), 2u32);
assert_eq!(scorer.doc(), 2u32);
assert_eq!(scorer.score(), 1.0);
assert_eq!(scorer.advance(), TERMINATED);
}
#[test]
fn test_automaton_weight_boost() {
let index = create_index();
let field = index.schema().get_field("title").unwrap();
let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let mut scorer = automaton_weight
.scorer(searcher.segment_reader(0u32), 1.32)
.unwrap();
assert_eq!(scorer.doc(), 0u32);
assert_eq!(scorer.score(), 1.32);
}
}
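
The hunks above straddle the DocSet contract change that recurs through the rest of this diff: on one side `advance()` returns `bool` and must be called once before `doc()` is legal (hence the panic in `EmptyScorer` further down), on the other a freshly created docset already points at its first doc, `advance()` returns the new `DocId`, and the sentinel `TERMINATED` marks exhaustion. A toy sketch of the two traversal styles (`VecDocSet` is illustrative only, not a tantivy type):

```rust
// A toy docset illustrating the sentinel-style contract seen on the left.
const TERMINATED: u32 = u32::MAX;

struct VecDocSet {
    docs: Vec<u32>,
    cursor: usize,
}

impl VecDocSet {
    fn new(docs: Vec<u32>) -> VecDocSet {
        // The docset points at its first doc right away; no initial advance().
        VecDocSet { docs, cursor: 0 }
    }
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn advance(&mut self) -> u32 {
        self.cursor += 1;
        self.doc()
    }
}

fn main() {
    let mut docset = VecDocSet::new(vec![0, 2, 5]);
    let mut doc = docset.doc();
    while doc != TERMINATED {
        println!("doc = {}", doc);
        doc = docset.advance();
    }
    // Under the bool-returning contract on the right, the same traversal reads:
    //     while docset.advance() {
    //         let doc = docset.doc();
    //         ...
    //     }
}
```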


@@ -1,6 +1,7 @@
use crate::common::{BitSet, TinySet}; use crate::common::{BitSet, TinySet};
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, SkipResult};
use crate::DocId; use crate::DocId;
use std::cmp::Ordering;
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`. /// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
/// ///
@@ -32,51 +33,74 @@ impl From<BitSet> for BitSetDocSet {
} else { } else {
docs.tinyset(0) docs.tinyset(0)
}; };
let mut docset = BitSetDocSet { BitSetDocSet {
docs, docs,
cursor_bucket: 0, cursor_bucket: 0,
cursor_tinybitset: first_tiny_bitset, cursor_tinybitset: first_tiny_bitset,
doc: 0u32, doc: 0u32,
}; }
docset.advance();
docset
} }
} }
impl DocSet for BitSetDocSet { impl DocSet for BitSetDocSet {
fn advance(&mut self) -> DocId { fn advance(&mut self) -> bool {
if let Some(lower) = self.cursor_tinybitset.pop_lowest() { if let Some(lower) = self.cursor_tinybitset.pop_lowest() {
self.doc = (self.cursor_bucket as u32 * 64u32) | lower; self.doc = (self.cursor_bucket as u32 * 64u32) | lower;
return self.doc; return true;
} }
if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) { if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) {
self.go_to_bucket(cursor_bucket); self.go_to_bucket(cursor_bucket);
let lower = self.cursor_tinybitset.pop_lowest().unwrap(); let lower = self.cursor_tinybitset.pop_lowest().unwrap();
self.doc = (cursor_bucket * 64u32) | lower; self.doc = (cursor_bucket * 64u32) | lower;
self.doc true
} else { } else {
self.doc = TERMINATED; false
TERMINATED
} }
} }
fn seek(&mut self, target: DocId) -> DocId { fn skip_next(&mut self, target: DocId) -> SkipResult {
if target >= self.docs.max_value() { // skip is required to advance.
self.doc = TERMINATED; if !self.advance() {
return TERMINATED; return SkipResult::End;
} }
let target_bucket = target / 64u32; let target_bucket = target / 64u32;
if target_bucket > self.cursor_bucket {
self.go_to_bucket(target_bucket); // Mask for all of the bits greater or equal
let greater_filter: TinySet = TinySet::range_greater_or_equal(target); // to our target document.
self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter); match target_bucket.cmp(&self.cursor_bucket) {
self.advance() Ordering::Greater => {
} else { self.go_to_bucket(target_bucket);
let mut doc = self.doc(); let greater_filter: TinySet = TinySet::range_greater_or_equal(target);
while doc < target { self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter);
doc = self.advance(); if !self.advance() {
SkipResult::End
} else if self.doc() == target {
SkipResult::Reached
} else {
debug_assert!(self.doc() > target);
SkipResult::OverStep
}
}
Ordering::Equal => loop {
match self.doc().cmp(&target) {
Ordering::Less => {
if !self.advance() {
return SkipResult::End;
}
}
Ordering::Equal => {
return SkipResult::Reached;
}
Ordering::Greater => {
debug_assert!(self.doc() > target);
return SkipResult::OverStep;
}
}
},
Ordering::Less => {
debug_assert!(self.doc() > target);
SkipResult::OverStep
} }
doc
} }
} }
@@ -98,7 +122,7 @@ impl DocSet for BitSetDocSet {
mod tests { mod tests {
use super::BitSetDocSet; use super::BitSetDocSet;
use crate::common::BitSet; use crate::common::BitSet;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, SkipResult};
use crate::DocId; use crate::DocId;
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet { fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
@@ -109,31 +133,19 @@ mod tests {
BitSetDocSet::from(docset) BitSetDocSet::from(docset)
} }
#[test]
fn test_empty() {
let bitset = BitSet::with_max_value(1000);
let mut empty = BitSetDocSet::from(bitset);
assert_eq!(empty.advance(), TERMINATED)
}
#[test]
fn test_seek_terminated() {
let bitset = BitSet::with_max_value(1000);
let mut empty = BitSetDocSet::from(bitset);
assert_eq!(empty.seek(TERMINATED), TERMINATED)
}
fn test_go_through_sequential(docs: &[DocId]) { fn test_go_through_sequential(docs: &[DocId]) {
let mut docset = create_docbitset(docs, 1_000u32); let mut docset = create_docbitset(docs, 1_000u32);
for &doc in docs { for &doc in docs {
assert!(docset.advance());
assert_eq!(doc, docset.doc()); assert_eq!(doc, docset.doc());
docset.advance();
} }
assert_eq!(docset.advance(), TERMINATED); assert!(!docset.advance());
assert!(!docset.advance());
} }
#[test] #[test]
fn test_docbitset_sequential() { fn test_docbitset_sequential() {
test_go_through_sequential(&[]);
test_go_through_sequential(&[1, 2, 3]); test_go_through_sequential(&[1, 2, 3]);
test_go_through_sequential(&[1, 2, 3, 4, 5, 63, 64, 65]); test_go_through_sequential(&[1, 2, 3, 4, 5, 63, 64, 65]);
test_go_through_sequential(&[63, 64, 65]); test_go_through_sequential(&[63, 64, 65]);
@@ -144,64 +156,64 @@ mod tests {
fn test_docbitset_skip() { fn test_docbitset_skip() {
{ {
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000); let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
assert_eq!(docset.seek(7), 7); assert_eq!(docset.skip_next(7), SkipResult::Reached);
assert_eq!(docset.doc(), 7); assert_eq!(docset.doc(), 7);
assert_eq!(docset.advance(), 5112); assert!(docset.advance());
assert_eq!(docset.doc(), 5112); assert_eq!(docset.doc(), 5112);
assert_eq!(docset.advance(), TERMINATED); assert!(!docset.advance());
} }
{ {
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000); let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
assert_eq!(docset.seek(3), 5); assert_eq!(docset.skip_next(3), SkipResult::OverStep);
assert_eq!(docset.doc(), 5); assert_eq!(docset.doc(), 5);
assert_eq!(docset.advance(), 6); assert!(docset.advance());
} }
{ {
let mut docset = create_docbitset(&[5112], 10_000); let mut docset = create_docbitset(&[5112], 10_000);
assert_eq!(docset.seek(5112), 5112); assert_eq!(docset.skip_next(5112), SkipResult::Reached);
assert_eq!(docset.doc(), 5112); assert_eq!(docset.doc(), 5112);
assert_eq!(docset.advance(), TERMINATED); assert!(!docset.advance());
} }
{ {
let mut docset = create_docbitset(&[5112], 10_000); let mut docset = create_docbitset(&[5112], 10_000);
assert_eq!(docset.seek(5113), TERMINATED); assert_eq!(docset.skip_next(5113), SkipResult::End);
assert_eq!(docset.advance(), TERMINATED); assert!(!docset.advance());
} }
{ {
let mut docset = create_docbitset(&[5112], 10_000); let mut docset = create_docbitset(&[5112], 10_000);
assert_eq!(docset.seek(5111), 5112); assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
assert_eq!(docset.doc(), 5112); assert_eq!(docset.doc(), 5112);
assert_eq!(docset.advance(), TERMINATED); assert!(!docset.advance());
} }
{ {
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000); let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
assert_eq!(docset.seek(5112), 5112); assert_eq!(docset.skip_next(5112), SkipResult::Reached);
assert_eq!(docset.doc(), 5112); assert_eq!(docset.doc(), 5112);
assert_eq!(docset.advance(), 5500); assert!(docset.advance());
assert_eq!(docset.doc(), 5500); assert_eq!(docset.doc(), 5500);
assert_eq!(docset.advance(), 6666); assert!(docset.advance());
assert_eq!(docset.doc(), 6666); assert_eq!(docset.doc(), 6666);
assert_eq!(docset.advance(), TERMINATED); assert!(!docset.advance());
} }
{ {
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000); let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
assert_eq!(docset.seek(5111), 5112); assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
assert_eq!(docset.doc(), 5112); assert_eq!(docset.doc(), 5112);
assert_eq!(docset.advance(), 5500); assert!(docset.advance());
assert_eq!(docset.doc(), 5500); assert_eq!(docset.doc(), 5500);
assert_eq!(docset.advance(), 6666); assert!(docset.advance());
assert_eq!(docset.doc(), 6666); assert_eq!(docset.doc(), 6666);
assert_eq!(docset.advance(), TERMINATED); assert!(!docset.advance());
} }
{ {
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5513, 6666], 10_000); let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5513, 6666], 10_000);
assert_eq!(docset.seek(5111), 5112); assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
assert_eq!(docset.doc(), 5112); assert_eq!(docset.doc(), 5112);
assert_eq!(docset.advance(), 5513); assert!(docset.advance());
assert_eq!(docset.doc(), 5513); assert_eq!(docset.doc(), 5513);
assert_eq!(docset.advance(), 6666); assert!(docset.advance());
assert_eq!(docset.doc(), 6666); assert_eq!(docset.doc(), 6666);
assert_eq!(docset.advance(), TERMINATED); assert!(!docset.advance());
} }
} }
} }
@@ -211,7 +223,6 @@ mod bench {
use super::BitSet; use super::BitSet;
use super::BitSetDocSet; use super::BitSetDocSet;
use crate::docset::TERMINATED;
use crate::test; use crate::test;
use crate::tests; use crate::tests;
use crate::DocSet; use crate::DocSet;
@@ -246,7 +257,7 @@ mod bench {
} }
b.iter(|| { b.iter(|| {
let mut docset = BitSetDocSet::from(bitset.clone()); let mut docset = BitSetDocSet::from(bitset.clone());
while docset.advance() != TERMINATED {} while docset.advance() {}
}); });
} }
} }
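
Both versions of `BitSetDocSet` above lean on the same bucket arithmetic: the bitset is stored as 64-bit words, `cursor_bucket` names the current word, a doc is reassembled as `bucket * 64 | lowest_set_bit`, and a seek masks off the bits below the target via `TinySet::range_greater_or_equal`. A self-contained sketch of that arithmetic on plain `u64` words (illustrative only, not tantivy's `TinySet`):

```rust
// Find the lowest doc >= target in a bitset stored as 64-bit buckets.
fn lowest_doc_at_or_above(words: &[u64], target: u32) -> Option<u32> {
    let mut bucket = (target / 64) as usize;
    // Mask off the bits below `target` in the first candidate bucket,
    // mirroring TinySet::range_greater_or_equal.
    let mut word = *words.get(bucket)? & (!0u64 << (target % 64));
    loop {
        if word != 0 {
            // Reassemble the doc exactly as advance() does: bucket * 64 | offset.
            return Some(bucket as u32 * 64 + word.trailing_zeros());
        }
        bucket += 1;
        word = *words.get(bucket)?;
    }
}

fn main() {
    let mut words = vec![0u64; 16]; // covers DocIds 0..1024
    for &doc in &[1u32, 5, 6, 7, 300] {
        words[(doc / 64) as usize] |= 1u64 << (doc % 64);
    }
    assert_eq!(lowest_doc_at_or_above(&words, 3), Some(5)); // overstep
    assert_eq!(lowest_doc_at_or_above(&words, 300), Some(300)); // reached
    assert_eq!(lowest_doc_at_or_above(&words, 301), None); // end of docset
}
```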


@@ -3,24 +3,21 @@ use crate::query::Explanation;
use crate::Score; use crate::Score;
use crate::Searcher; use crate::Searcher;
use crate::Term; use crate::Term;
use serde::Deserialize;
use serde::Serialize;
const K1: Score = 1.2; const K1: f32 = 1.2;
const B: Score = 0.75; const B: f32 = 0.75;
fn idf(doc_freq: u64, doc_count: u64) -> Score { fn idf(doc_freq: u64, doc_count: u64) -> f32 {
assert!(doc_count >= doc_freq, "{} >= {}", doc_count, doc_freq); let x = ((doc_count - doc_freq) as f32 + 0.5) / (doc_freq as f32 + 0.5);
let x = ((doc_count - doc_freq) as Score + 0.5) / (doc_freq as Score + 0.5); (1f32 + x).ln()
(1.0 + x).ln()
} }
fn cached_tf_component(fieldnorm: u32, average_fieldnorm: Score) -> Score { fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32) -> f32 {
K1 * (1.0 - B + B * fieldnorm as Score / average_fieldnorm) K1 * (1f32 - B + B * fieldnorm as f32 / average_fieldnorm)
} }
fn compute_tf_cache(average_fieldnorm: Score) -> [Score; 256] { fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] {
let mut cache: [Score; 256] = [0.0; 256]; let mut cache = [0f32; 256];
for (fieldnorm_id, cache_mut) in cache.iter_mut().enumerate() { for (fieldnorm_id, cache_mut) in cache.iter_mut().enumerate() {
let fieldnorm = FieldNormReader::id_to_fieldnorm(fieldnorm_id as u8); let fieldnorm = FieldNormReader::id_to_fieldnorm(fieldnorm_id as u8);
*cache_mut = cached_tf_component(fieldnorm, average_fieldnorm); *cache_mut = cached_tf_component(fieldnorm, average_fieldnorm);
@@ -28,30 +25,15 @@ fn compute_tf_cache(average_fieldnorm: Score) -> [Score; 256] {
cache cache
} }
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct BM25Params {
pub idf: Score,
pub avg_fieldnorm: Score,
}
#[derive(Clone)] #[derive(Clone)]
pub struct BM25Weight { pub struct BM25Weight {
idf_explain: Explanation, idf_explain: Explanation,
weight: Score, weight: f32,
cache: [Score; 256], cache: [f32; 256],
average_fieldnorm: Score, average_fieldnorm: f32,
} }
impl BM25Weight { impl BM25Weight {
pub fn boost_by(&self, boost: Score) -> BM25Weight {
BM25Weight {
idf_explain: self.idf_explain.clone(),
weight: self.weight * boost,
cache: self.cache,
average_fieldnorm: self.average_fieldnorm,
}
}
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight { pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight {
assert!(!terms.is_empty(), "BM25 requires at least one term"); assert!(!terms.is_empty(), "BM25 requires at least one term");
let field = terms[0].field(); let field = terms[0].field();
@@ -70,11 +52,19 @@ impl BM25Weight {
total_num_tokens += inverted_index.total_num_tokens(); total_num_tokens += inverted_index.total_num_tokens();
total_num_docs += u64::from(segment_reader.max_doc()); total_num_docs += u64::from(segment_reader.max_doc());
} }
let average_fieldnorm = total_num_tokens as Score / total_num_docs as Score; let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;
let mut idf_explain: Explanation;
if terms.len() == 1 { if terms.len() == 1 {
let term_doc_freq = searcher.doc_freq(&terms[0]); let term_doc_freq = searcher.doc_freq(&terms[0]);
BM25Weight::for_one_term(term_doc_freq, total_num_docs, average_fieldnorm) let idf = idf(term_doc_freq, total_num_docs);
idf_explain =
Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", idf);
idf_explain.add_const(
"n, number of docs containing this term",
term_doc_freq as f32,
);
idf_explain.add_const("N, total number of docs", total_num_docs as f32);
} else { } else {
let idf = terms let idf = terms
.iter() .iter()
@@ -82,30 +72,14 @@ impl BM25Weight {
let term_doc_freq = searcher.doc_freq(term); let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs) idf(term_doc_freq, total_num_docs)
}) })
.sum::<Score>(); .sum::<f32>();
let idf_explain = Explanation::new("idf", idf); idf_explain = Explanation::new("idf", idf);
BM25Weight::new(idf_explain, average_fieldnorm)
} }
BM25Weight::new(idf_explain, average_fieldnorm)
} }
pub fn for_one_term( fn new(idf_explain: Explanation, average_fieldnorm: f32) -> BM25Weight {
term_doc_freq: u64, let weight = idf_explain.value() * (1f32 + K1);
total_num_docs: u64,
avg_fieldnorm: Score,
) -> BM25Weight {
let idf = idf(term_doc_freq, total_num_docs);
let mut idf_explain =
Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", idf);
idf_explain.add_const(
"n, number of docs containing this term",
term_doc_freq as Score,
);
idf_explain.add_const("N, total number of docs", total_num_docs as Score);
BM25Weight::new(idf_explain, avg_fieldnorm)
}
fn new(idf_explain: Explanation, average_fieldnorm: Score) -> BM25Weight {
let weight = idf_explain.value() * (1.0 + K1);
BM25Weight { BM25Weight {
idf_explain, idf_explain,
weight, weight,
@@ -116,27 +90,19 @@ impl BM25Weight {
#[inline(always)] #[inline(always)]
pub fn score(&self, fieldnorm_id: u8, term_freq: u32) -> Score { pub fn score(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
self.weight * self.tf_factor(fieldnorm_id, term_freq)
}
pub fn max_score(&self) -> Score {
self.score(255u8, 2_013_265_944)
}
#[inline(always)]
pub(crate) fn tf_factor(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
let term_freq = term_freq as Score;
let norm = self.cache[fieldnorm_id as usize]; let norm = self.cache[fieldnorm_id as usize];
term_freq / (term_freq + norm) let term_freq = term_freq as f32;
self.weight * term_freq / (term_freq + norm)
} }
pub fn explain(&self, fieldnorm_id: u8, term_freq: u32) -> Explanation { pub fn explain(&self, fieldnorm_id: u8, term_freq: u32) -> Explanation {
// The explain format is directly copied from Lucene's. // The explain format is directly copied from Lucene's.
// (So, Kudos to Lucene) // (So, Kudos to Lucene)
let score = self.score(fieldnorm_id, term_freq); let score = self.score(fieldnorm_id, term_freq);
let norm = self.cache[fieldnorm_id as usize]; let norm = self.cache[fieldnorm_id as usize];
let term_freq = term_freq as Score; let term_freq = term_freq as f32;
let right_factor = term_freq / (term_freq + norm); let right_factor = term_freq / (term_freq + norm);
let mut tf_explanation = Explanation::new( let mut tf_explanation = Explanation::new(
@@ -149,12 +115,12 @@ impl BM25Weight {
tf_explanation.add_const("b, length normalization parameter", B); tf_explanation.add_const("b, length normalization parameter", B);
tf_explanation.add_const( tf_explanation.add_const(
"dl, length of field", "dl, length of field",
FieldNormReader::id_to_fieldnorm(fieldnorm_id) as Score, FieldNormReader::id_to_fieldnorm(fieldnorm_id) as f32,
); );
tf_explanation.add_const("avgdl, average length of field", self.average_fieldnorm); tf_explanation.add_const("avgdl, average length of field", self.average_fieldnorm);
let mut explanation = Explanation::new("TermQuery, product of...", score); let mut explanation = Explanation::new("TermQuery, product of...", score);
explanation.add_detail(Explanation::new("(K1+1)", K1 + 1.0)); explanation.add_detail(Explanation::new("(K1+1)", K1 + 1f32));
explanation.add_detail(self.idf_explain.clone()); explanation.add_detail(self.idf_explain.clone());
explanation.add_detail(tf_explanation); explanation.add_detail(tf_explanation);
explanation explanation
@@ -165,11 +131,10 @@ impl BM25Weight {
mod tests { mod tests {
use super::idf; use super::idf;
use crate::{assert_nearly_equals, Score}; use crate::tests::assert_nearly_equals;
#[test] #[test]
fn test_idf() { fn test_idf() {
let score: Score = 2.0; assert_nearly_equals(idf(1, 2), 0.6931472);
assert_nearly_equals!(idf(1, 2), score.ln());
} }
} }
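
Putting the pieces of the hunks above together: the idf term, the precomputed length norm from `cached_tf_component`, and the final `weight * tf / (tf + norm)` product. A worked sketch with made-up corpus statistics (the constants mirror K1 and B above; nothing below comes from a real index):

```rust
const K1: f32 = 1.2;
const B: f32 = 0.75;

fn idf(doc_freq: u64, doc_count: u64) -> f32 {
    let x = ((doc_count - doc_freq) as f32 + 0.5) / (doc_freq as f32 + 0.5);
    (1f32 + x).ln()
}

fn bm25(doc_freq: u64, doc_count: u64, term_freq: f32, fieldnorm: f32, avg_fieldnorm: f32) -> f32 {
    // weight = idf * (K1 + 1), norm = k1 * (1 - b + b * dl / avgdl),
    // score = weight * tf / (tf + norm), exactly as in the hunks above.
    let weight = idf(doc_freq, doc_count) * (1f32 + K1);
    let norm = K1 * (1f32 - B + B * fieldnorm / avg_fieldnorm);
    weight * term_freq / (term_freq + norm)
}

fn main() {
    // idf(1, 2) = ln(1 + 1.5 / 1.5) = ln(2) ~= 0.6931472, matching test_idf.
    assert!((idf(1, 2) - 0.6931472).abs() < 1e-6);
    // A term occurring once in a 4-token field, in a 1000-doc corpus where
    // 10 docs contain the term and the average field length is 5.0:
    println!("bm25 = {}", bm25(10, 1000, 1.0, 4.0, 5.0));
}
```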


@@ -1,434 +0,0 @@
use crate::query::term_query::TermScorer;
use crate::query::Scorer;
use crate::{DocId, DocSet, Score, TERMINATED};
use std::ops::Deref;
use std::ops::DerefMut;
/// Takes term_scorers sorted by their current doc() and a threshold, and
/// returns `(before_pivot_len, pivot_len, pivot_doc)` defined as follows:
/// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score.
/// - `before_pivot_len` number of term_scorers such that term_scorer.doc() < pivot_doc.
/// - `pivot_len` number of term_scorers such that term_scorer.doc() <= pivot_doc.
///
/// We always have `before_pivot_len` < `pivot_len`.
///
/// None is returned if we establish that no document can exceed the threshold.
fn find_pivot_doc(
term_scorers: &[TermScorerWithMaxScore],
threshold: Score,
) -> Option<(usize, usize, DocId)> {
let mut max_score = 0.0;
let mut before_pivot_len = 0;
let mut pivot_doc = TERMINATED;
while before_pivot_len < term_scorers.len() {
let term_scorer = &term_scorers[before_pivot_len];
max_score += term_scorer.max_score;
if max_score > threshold {
pivot_doc = term_scorer.doc();
break;
}
before_pivot_len += 1;
}
if pivot_doc == TERMINATED {
return None;
}
// Right now before_pivot_len is an ordinal; we want a length.
let mut pivot_len = before_pivot_len + 1;
// Some other term_scorer may be positioned on the same document.
pivot_len += term_scorers[pivot_len..]
.iter()
.take_while(|term_scorer| term_scorer.doc() == pivot_doc)
.count();
Some((before_pivot_len, pivot_len, pivot_doc))
}
// Before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer(
scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_len: usize,
) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
let mut scorer_to_seek = pivot_len - 1;
let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
for scorer_ord in (0..pivot_len - 1).rev() {
let scorer = &scorers[scorer_ord];
if scorer.last_doc_in_block() <= doc_to_seek_after {
doc_to_seek_after = scorer.last_doc_in_block();
scorer_to_seek = scorer_ord;
}
}
for scorer in &scorers[pivot_len..] {
if scorer.doc() <= doc_to_seek_after {
doc_to_seek_after = scorer.doc();
}
}
scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
restore_ordering(scorers, scorer_to_seek);
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
}
// Given a list of term_scorers and an ord, and assuming that `term_scorers`
// is sorted except possibly `term_scorers[ord]`, which may have advanced past
// its rank, bubble up `term_scorers[ord]` in order to restore the ordering.
fn restore_ordering(term_scorers: &mut Vec<TermScorerWithMaxScore>, ord: usize) {
let doc = term_scorers[ord].doc();
for i in ord + 1..term_scorers.len() {
if term_scorers[i].doc() >= doc {
break;
}
term_scorers.swap(i, i - 1);
}
debug_assert!(is_sorted(term_scorers.iter().map(|scorer| scorer.doc())));
}
// Attempts to advance all term_scorers in `&term_scorers[0..before_pivot_len]` to the pivot.
// If this works, returns true.
// If this fails (i.e.: one of the term_scorers does not contain `pivot_doc` and its seek goes
// past the pivot), reorders the term_scorers to ensure the list is still sorted and returns
// `false`. If a term_scorer reaches TERMINATED in the process, it is removed before returning.
fn align_scorers(
term_scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_doc: DocId,
before_pivot_len: usize,
) -> bool {
debug_assert_ne!(pivot_doc, TERMINATED);
for i in (0..before_pivot_len).rev() {
let new_doc = term_scorers[i].seek(pivot_doc);
if new_doc != pivot_doc {
if new_doc == TERMINATED {
term_scorers.swap_remove(i);
}
// We went past the pivot.
// We just go through the outer loop mechanic (Note that pivot is
// still a possible candidate).
//
// Termination is still guaranteed since we can only consider the same
// pivot at most term_scorers.len() - 1 times.
restore_ordering(term_scorers, i);
return false;
}
}
true
}
// Assumes term_scorers[..pivot_len] are all positioned on the same doc (pivot_doc).
// Advances term_scorers[..pivot_len], removes the scorers that reach TERMINATED,
// and restores the ordering of term_scorers.
fn advance_all_scorers_on_pivot(term_scorers: &mut Vec<TermScorerWithMaxScore>, pivot_len: usize) {
for term_scorer in &mut term_scorers[..pivot_len] {
term_scorer.advance();
}
// TODO use drain_filter when available.
let mut i = 0;
while i != term_scorers.len() {
if term_scorers[i].doc() == TERMINATED {
term_scorers.swap_remove(i);
} else {
i += 1;
}
}
term_scorers.sort_by_key(|scorer| scorer.doc());
}
pub fn block_wand(
mut scorers: Vec<TermScorer>,
mut threshold: Score,
callback: &mut dyn FnMut(u32, Score) -> Score,
) {
let mut scorers: Vec<TermScorerWithMaxScore> = scorers
.iter_mut()
.map(TermScorerWithMaxScore::from)
.collect();
scorers.sort_by_key(|scorer| scorer.doc());
// At this point we need to ensure that the scorers are sorted!
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
find_pivot_doc(&scorers[..], threshold)
{
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
debug_assert_ne!(pivot_doc, TERMINATED);
debug_assert!(before_pivot_len < pivot_len);
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.shallow_seek(pivot_doc);
scorer.block_max_score()
})
.sum();
// Beware: after a shallow seek, the skip readers can be ahead of
// the segment posting lists.
//
// `block_segment_postings.load_block()` needs to be called separately.
if block_max_score_upperbound <= threshold {
// Block max condition was not reached
// We could get away with simply advancing the scorers to `pivot_doc + 1`, but
// that would be inefficient. The optimization requires proper explanation and
// was isolated in a different function.
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
continue;
}
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
}
}
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
#[cfg(test)]
mod tests {
use crate::query::score_combiner::SumCombiner;
use crate::query::term_query::TermScorer;
use crate::query::Union;
use crate::query::{BM25Weight, Scorer};
use crate::{DocId, DocSet, Score, TERMINATED};
use proptest::prelude::*;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::iter;
struct Float(Score);
impl Eq for Float {}
impl PartialEq for Float {
fn eq(&self, other: &Self) -> bool {
self.cmp(&other) == Ordering::Equal
}
}
impl PartialOrd for Float {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Float {
fn cmp(&self, other: &Self) -> Ordering {
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)
}
}
fn nearly_equals(left: Score, right: Score) -> bool {
(left - right).abs() < 0.000001 * (left + right).abs()
}
fn compute_checkpoints_for_each_pruning(
term_scorers: Vec<TermScorer>,
n: usize,
) -> Vec<(DocId, Score)> {
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
let mut limit: Score = 0.0;
super::block_wand(term_scorers, Score::MIN, &mut |doc, score| {
heap.push(Float(score));
if heap.len() > n {
heap.pop().unwrap();
}
if heap.len() == n {
limit = heap.peek().unwrap().0;
}
if !nearly_equals(score, limit) {
checkpoints.push((doc, score));
}
return limit;
});
checkpoints
}
fn compute_checkpoints_manual(term_scorers: Vec<TermScorer>, n: usize) -> Vec<(DocId, Score)> {
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
let mut scorer: Union<TermScorer, SumCombiner> = Union::from(term_scorers);
let mut limit = Score::MIN;
loop {
if scorer.doc() == TERMINATED {
break;
}
let doc = scorer.doc();
let score = scorer.score();
if score > limit {
heap.push(Float(score));
if heap.len() > n {
heap.pop().unwrap();
}
if heap.len() == n {
limit = heap.peek().unwrap().0;
}
if !nearly_equals(score, limit) {
checkpoints.push((doc, score));
}
}
scorer.advance();
}
checkpoints
}
const MAX_TERM_FREQ: u32 = 100u32;
fn posting_list(max_doc: u32) -> BoxedStrategy<Vec<(DocId, u32)>> {
(1..max_doc + 1)
.prop_flat_map(move |doc_freq| {
(
proptest::bits::bitset::sampled(doc_freq as usize, 0..max_doc as usize),
proptest::collection::vec(1u32..MAX_TERM_FREQ, doc_freq as usize),
)
})
.prop_map(|(docset, term_freqs)| {
docset
.iter()
.map(|doc| doc as u32)
.zip(term_freqs.iter().cloned())
.collect::<Vec<_>>()
})
.boxed()
}
fn gen_term_scorers(num_scorers: usize) -> BoxedStrategy<(Vec<Vec<(DocId, u32)>>, Vec<u32>)> {
(1u32..100u32)
.prop_flat_map(move |max_doc: u32| {
(
proptest::collection::vec(posting_list(max_doc), num_scorers),
proptest::collection::vec(2u32..10u32 * MAX_TERM_FREQ, max_doc as usize),
)
})
.boxed()
}
fn test_block_wand_aux(posting_lists: &[Vec<(DocId, u32)>], fieldnorms: &[u32]) {
// We virtually repeat all docs 64 times in order to emulate blocks of 2 documents
// and surface bugs more easily.
const REPEAT: usize = 64;
let fieldnorms_expanded = fieldnorms
.iter()
.cloned()
.flat_map(|fieldnorm| iter::repeat(fieldnorm).take(REPEAT))
.collect::<Vec<u32>>();
let postings_lists_expanded: Vec<Vec<(DocId, u32)>> = posting_lists
.iter()
.map(|posting_list| {
posting_list
.into_iter()
.cloned()
.flat_map(|(doc, term_freq)| {
(0 as u32..REPEAT as u32).map(move |offset| {
(
doc * (REPEAT as u32) + offset,
if offset == 0 { term_freq } else { 1 },
)
})
})
.collect::<Vec<(DocId, u32)>>()
})
.collect::<Vec<_>>();
let total_fieldnorms: u64 = fieldnorms_expanded
.iter()
.cloned()
.map(|fieldnorm| fieldnorm as u64)
.sum();
let average_fieldnorm = (total_fieldnorms as Score) / (fieldnorms_expanded.len() as Score);
let max_doc = fieldnorms_expanded.len();
let term_scorers: Vec<TermScorer> = postings_lists_expanded
.iter()
.map(|postings| {
let bm25_weight = BM25Weight::for_one_term(
postings.len() as u64,
max_doc as u64,
average_fieldnorm,
);
TermScorer::create_for_test(postings, &fieldnorms_expanded[..], bm25_weight)
})
.collect();
for top_k in 1..4 {
let checkpoints_for_each_pruning =
compute_checkpoints_for_each_pruning(term_scorers.clone(), top_k);
let checkpoints_manual = compute_checkpoints_manual(term_scorers.clone(), top_k);
assert_eq!(checkpoints_for_each_pruning.len(), checkpoints_manual.len());
for (&(left_doc, left_score), &(right_doc, right_score)) in checkpoints_for_each_pruning
.iter()
.zip(checkpoints_manual.iter())
{
assert_eq!(left_doc, right_doc);
assert!(nearly_equals(left_score, right_score));
}
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_block_wand_two_term_scorers((posting_lists, fieldnorms) in gen_term_scorers(2)) {
test_block_wand_aux(&posting_lists[..], &fieldnorms[..]);
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_block_wand_three_term_scorers((posting_lists, fieldnorms) in gen_term_scorers(3)) {
test_block_wand_aux(&posting_lists[..], &fieldnorms[..]);
}
}
}
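
The pivot selection at the heart of the `block_wand` listing above can be shown on plain data: scorers stay sorted by their current doc, and the pivot is the first doc at which the running sum of per-scorer max-score bounds can beat the threshold. A minimal sketch, with `(doc, max_score)` pairs standing in for `TermScorerWithMaxScore`:

```rust
// Scorers are assumed sorted by their current doc, as in block_wand.
fn find_pivot(scorers: &[(u32, f32)], threshold: f32) -> Option<(usize, u32)> {
    let mut max_score = 0.0f32;
    for (before_pivot_len, &(doc, max_score_bound)) in scorers.iter().enumerate() {
        max_score += max_score_bound;
        if max_score > threshold {
            // Everything before this scorer lies strictly below the pivot doc.
            return Some((before_pivot_len, doc));
        }
    }
    None // no remaining document can exceed the threshold
}

fn main() {
    let scorers = [(3u32, 0.4f32), (7, 0.9), (20, 1.5)];
    // The sum of bounds first exceeds 1.0 at the scorer positioned on doc 7.
    assert_eq!(find_pivot(&scorers, 1.0), Some((1, 7)));
    // 0.4 + 0.9 + 1.5 = 2.8 <= 3.0, so no doc can make it: prune everything.
    assert_eq!(find_pivot(&scorers, 3.0), None);
}
```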


@@ -1,9 +1,7 @@
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner}; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use crate::query::term_query::TermScorer; use crate::query::term_query::TermScorer;
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
use crate::query::EmptyScorer; use crate::query::EmptyScorer;
use crate::query::Exclude; use crate::query::Exclude;
use crate::query::Occur; use crate::query::Occur;
@@ -12,21 +10,16 @@ use crate::query::Scorer;
use crate::query::Union; use crate::query::Union;
use crate::query::Weight; use crate::query::Weight;
use crate::query::{intersect_scorers, Explanation}; use crate::query::{intersect_scorers, Explanation};
use crate::{DocId, Score}; use crate::{DocId, SkipResult};
use std::collections::HashMap; use std::collections::HashMap;
enum SpecializedScorer { fn scorer_union<TScoreCombiner>(scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer>
TermUnion(Vec<TermScorer>),
Other(Box<dyn Scorer>),
}
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<dyn Scorer>>) -> SpecializedScorer
where where
TScoreCombiner: ScoreCombiner, TScoreCombiner: ScoreCombiner,
{ {
assert!(!scorers.is_empty()); assert!(!scorers.is_empty());
if scorers.len() == 1 { if scorers.len() == 1 {
return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehand return scorers.into_iter().next().unwrap(); //< we checked the size beforehand
} }
{ {
@@ -36,30 +29,14 @@ where
.into_iter() .into_iter()
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap())) .map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
.collect(); .collect();
if scorers let scorer: Box<dyn Scorer> =
.iter() Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
.all(|scorer| scorer.freq_reading_option() == FreqReadingOption::ReadFreq) return scorer;
{
// Block wand is only available iff we read frequencies.
return SpecializedScorer::TermUnion(scorers);
} else {
return SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(
scorers,
)));
}
} }
} }
SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(scorers)))
}
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(scorer: SpecializedScorer) -> Box<dyn Scorer> { let scorer: Box<dyn Scorer> = Box::new(Union::<_, TScoreCombiner>::from(scorers));
match scorer { scorer
SpecializedScorer::TermUnion(term_scorers) => {
let union_scorer = Union::<TermScorer, TScoreCombiner>::from(term_scorers);
Box::new(union_scorer)
}
SpecializedScorer::Other(scorer) => scorer,
}
} }
pub struct BooleanWeight { pub struct BooleanWeight {
@@ -78,11 +55,10 @@ impl BooleanWeight {
fn per_occur_scorers( fn per_occur_scorers(
&self, &self,
reader: &SegmentReader, reader: &SegmentReader,
boost: Score,
) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> { ) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> {
let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new(); let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new();
for &(ref occur, ref subweight) in &self.weights { for &(ref occur, ref subweight) in &self.weights {
let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?; let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader)?;
per_occur_scorers per_occur_scorers
.entry(*occur) .entry(*occur)
.or_insert_with(Vec::new) .or_insert_with(Vec::new)
@@ -94,52 +70,41 @@ impl BooleanWeight {
fn complex_scorer<TScoreCombiner: ScoreCombiner>( fn complex_scorer<TScoreCombiner: ScoreCombiner>(
&self, &self,
reader: &SegmentReader, reader: &SegmentReader,
boost: Score, ) -> crate::Result<Box<dyn Scorer>> {
) -> crate::Result<SpecializedScorer> { let mut per_occur_scorers = self.per_occur_scorers(reader)?;
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
let should_scorer_opt: Option<SpecializedScorer> = per_occur_scorers let should_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::Should) .remove(&Occur::Should)
.map(scorer_union::<TScoreCombiner>); .map(scorer_union::<TScoreCombiner>);
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot) .remove(&Occur::MustNot)
.map(scorer_union::<DoNothingCombiner>) .map(scorer_union::<TScoreCombiner>);
.map(into_box_scorer::<DoNothingCombiner>);
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::Must) .remove(&Occur::Must)
.map(intersect_scorers); .map(intersect_scorers);
let positive_scorer: SpecializedScorer = match (should_scorer_opt, must_scorer_opt) { let positive_scorer: Box<dyn Scorer> = match (should_scorer_opt, must_scorer_opt) {
(Some(should_scorer), Some(must_scorer)) => { (Some(should_scorer), Some(must_scorer)) => {
if self.scoring_enabled { if self.scoring_enabled {
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::< Box::new(RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
Box<dyn Scorer>,
Box<dyn Scorer>,
TScoreCombiner,
>::new(
must_scorer, must_scorer,
into_box_scorer::<TScoreCombiner>(should_scorer), should_scorer,
))) ))
} else { } else {
SpecializedScorer::Other(must_scorer) must_scorer
} }
} }
(None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer), (None, Some(must_scorer)) => must_scorer,
(Some(should_scorer), None) => should_scorer, (Some(should_scorer), None) => should_scorer,
(None, None) => { (None, None) => {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); return Ok(Box::new(EmptyScorer));
} }
}; };
if let Some(exclude_scorer) = exclude_scorer_opt { if let Some(exclude_scorer) = exclude_scorer_opt {
let positive_scorer_boxed: Box<dyn Scorer> = Ok(Box::new(Exclude::new(positive_scorer, exclude_scorer)))
into_box_scorer::<TScoreCombiner>(positive_scorer);
Ok(SpecializedScorer::Other(Box::new(Exclude::new(
positive_scorer_boxed,
exclude_scorer,
))))
} else { } else {
Ok(positive_scorer) Ok(positive_scorer)
} }
@@ -147,7 +112,7 @@ impl BooleanWeight {
} }
impl Weight for BooleanWeight { impl Weight for BooleanWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader) -> crate::Result<Box<dyn Scorer>> {
if self.weights.is_empty() { if self.weights.is_empty() {
Ok(Box::new(EmptyScorer)) Ok(Box::new(EmptyScorer))
} else if self.weights.len() == 1 { } else if self.weights.len() == 1 {
@@ -155,26 +120,22 @@ impl Weight for BooleanWeight {
if occur == Occur::MustNot { if occur == Occur::MustNot {
Ok(Box::new(EmptyScorer)) Ok(Box::new(EmptyScorer))
} else { } else {
weight.scorer(reader, boost) weight.scorer(reader)
} }
} else if self.scoring_enabled { } else if self.scoring_enabled {
self.complex_scorer::<SumWithCoordsCombiner>(reader, boost) self.complex_scorer::<SumWithCoordsCombiner>(reader)
.map(|specialized_scorer| {
into_box_scorer::<SumWithCoordsCombiner>(specialized_scorer)
})
} else { } else {
self.complex_scorer::<DoNothingCombiner>(reader, boost) self.complex_scorer::<DoNothingCombiner>(reader)
.map(into_box_scorer::<DoNothingCombiner>)
} }
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?; let mut scorer = self.scorer(reader)?;
if scorer.seek(doc) != doc { if scorer.skip_next(doc) != SkipResult::Reached {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
} }
if !self.scoring_enabled { if !self.scoring_enabled {
return Ok(Explanation::new("BooleanQuery with no scoring", 1.0)); return Ok(Explanation::new("BooleanQuery with no scoring", 1f32));
} }
let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score()); let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score());
@@ -187,53 +148,6 @@ impl Weight for BooleanWeight {
} }
Ok(explanation) Ok(explanation)
} }
fn for_each(
&self,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let scorer = self.complex_scorer::<SumWithCoordsCombiner>(reader, 1.0)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer =
Union::<TermScorer, SumWithCoordsCombiner>::from(term_scorers);
for_each_scorer(&mut union_scorer, callback);
}
SpecializedScorer::Other(mut scorer) => {
for_each_scorer(scorer.as_mut(), callback);
}
}
Ok(())
}
/// Calls `callback` with all of the `(doc, score)` pairs for which the score
/// exceeds the given threshold.
///
/// This method is useful for the TopDocs collector.
/// For all docsets, the blanket implementation has the benefit
/// of prefiltering (doc, score) pairs, avoiding the
/// virtual dispatch cost.
///
/// More importantly, it makes it possible for scorers to implement
/// important optimizations (e.g. BlockWAND for unions).
fn for_each_pruning(
&self,
threshold: Score,
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let scorer = self.complex_scorer::<SumWithCoordsCombiner>(reader, 1.0)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
super::block_wand(term_scorers, threshold, callback);
}
SpecializedScorer::Other(mut scorer) => {
for_each_pruning_scorer(scorer.as_mut(), threshold, callback);
}
}
Ok(())
}
} }
fn is_positive_occur(occur: Occur) -> bool { fn is_positive_occur(occur: Occur) -> bool {
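
Stripped of scoring, `complex_scorer` above composes docsets per `Occur`: Should scorers are unioned, Must scorers intersected, a Must clause alone determines the matching set (Should then only contributes to scores through `RequiredOptionalScorer`), and MustNot filters the result through `Exclude`. A set-level sketch of that combination logic, with `BTreeSet`s standing in for scorers:

```rust
use std::collections::BTreeSet;

type Docs = BTreeSet<u32>;

fn union(sets: Vec<Docs>) -> Option<Docs> {
    if sets.is_empty() {
        return None;
    }
    Some(sets.into_iter().flatten().collect())
}

fn intersect(sets: Vec<Docs>) -> Option<Docs> {
    sets.into_iter()
        .reduce(|a, b| a.intersection(&b).copied().collect())
}

fn boolean_docs(should: Vec<Docs>, must: Vec<Docs>, must_not: Vec<Docs>) -> Docs {
    let positive = match (union(should), intersect(must)) {
        // With a Must clause present, Should only contributes to scoring,
        // not to the matching docset.
        (_, Some(must_docs)) => must_docs,
        (Some(should_docs), None) => should_docs,
        (None, None) => Docs::new(), // EmptyScorer
    };
    match union(must_not) {
        // Exclude::new(positive, excluded)
        Some(excluded) => positive.difference(&excluded).copied().collect(),
        None => positive,
    }
}

fn main() {
    let s = |docs: &[u32]| docs.iter().copied().collect::<Docs>();
    let matching = boolean_docs(vec![s(&[0, 1])], vec![s(&[1, 2])], vec![s(&[2])]);
    assert_eq!(matching, s(&[1]));
}
```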


@@ -1,17 +1,13 @@
mod block_wand;
mod boolean_query; mod boolean_query;
mod boolean_weight; mod boolean_weight;
pub(crate) use self::block_wand::block_wand;
pub use self::boolean_query::BooleanQuery; pub use self::boolean_query::BooleanQuery;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::collector::TopDocs;
use crate::query::score_combiner::SumWithCoordsCombiner; use crate::query::score_combiner::SumWithCoordsCombiner;
use crate::query::term_query::TermScorer; use crate::query::term_query::TermScorer;
use crate::query::Intersection; use crate::query::Intersection;
@@ -23,7 +19,7 @@ mod tests {
use crate::query::TermQuery; use crate::query::TermQuery;
use crate::schema::*; use crate::schema::*;
use crate::Index; use crate::Index;
use crate::{DocAddress, DocId, Score}; use crate::{DocAddress, DocId};
fn aux_test_helper() -> (Index, Field) { fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -32,13 +28,26 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
index_writer.add_document(doc!(text_field => "a b c")); let doc = doc!(text_field => "a b c");
index_writer.add_document(doc!(text_field => "a c")); index_writer.add_document(doc);
index_writer.add_document(doc!(text_field => "b c")); }
index_writer.add_document(doc!(text_field => "a b c d")); {
index_writer.add_document(doc!(text_field => "d")); let doc = doc!(text_field => "a c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field => "b c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field => "a b c d");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field => "d");
index_writer.add_document(doc);
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -61,7 +70,7 @@ mod tests {
let query = query_parser.parse_query("+a").unwrap(); let query = query_parser.parse_query("+a").unwrap();
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
let weight = query.weight(&searcher, true).unwrap(); let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<TermScorer>()); assert!(scorer.is::<TermScorer>());
} }
@@ -73,13 +82,13 @@ mod tests {
{ {
let query = query_parser.parse_query("+a +b +c").unwrap(); let query = query_parser.parse_query("+a +b +c").unwrap();
let weight = query.weight(&searcher, true).unwrap(); let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<Intersection<TermScorer>>()); assert!(scorer.is::<Intersection<TermScorer>>());
} }
{ {
let query = query_parser.parse_query("+a +(b c)").unwrap(); let query = query_parser.parse_query("+a +(b c)").unwrap();
let weight = query.weight(&searcher, true).unwrap(); let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<Intersection<Box<dyn Scorer>>>()); assert!(scorer.is::<Intersection<Box<dyn Scorer>>>());
} }
} }
@@ -92,7 +101,7 @@ mod tests {
{ {
let query = query_parser.parse_query("+a b").unwrap(); let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, true).unwrap(); let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<RequiredOptionalScorer< assert!(scorer.is::<RequiredOptionalScorer<
Box<dyn Scorer>, Box<dyn Scorer>,
Box<dyn Scorer>, Box<dyn Scorer>,
@@ -102,7 +111,7 @@ mod tests {
{ {
let query = query_parser.parse_query("+a b").unwrap(); let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, false).unwrap(); let weight = query.weight(&searcher, false).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<TermScorer>()); assert!(scorer.is::<TermScorer>());
} }
} }
@@ -133,6 +142,7 @@ mod tests {
.map(|doc| doc.1) .map(|doc| doc.1)
.collect::<Vec<DocId>>() .collect::<Vec<DocId>>()
}; };
{ {
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]); let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
@@ -169,96 +179,6 @@ mod tests {
} }
} }
#[test]
pub fn test_boolean_query_two_excluded() {
let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, text),
IndexRecordOption::Basic,
);
let query: Box<dyn Query> = Box::new(term_query);
query
};
let reader = index.reader().unwrap();
let matching_topdocs = |query: &dyn Query| {
reader
.searcher()
.search(query, &TopDocs::with_limit(3))
.unwrap()
};
let score_doc_4: Score; // score of doc 4 should not be influenced by exclusion
{
let boolean_query_no_excluded =
BooleanQuery::from(vec![(Occur::Must, make_term_query("d"))]);
let topdocs_no_excluded = matching_topdocs(&boolean_query_no_excluded);
assert_eq!(topdocs_no_excluded.len(), 2);
let (top_score, top_doc) = topdocs_no_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(topdocs_no_excluded[1].1, DocAddress(0, 3)); // ignore score of doc 3.
score_doc_4 = top_score;
}
{
let boolean_query_two_excluded = BooleanQuery::from(vec![
(Occur::Must, make_term_query("d")),
(Occur::MustNot, make_term_query("a")),
(Occur::MustNot, make_term_query("b")),
]);
let topdocs_excluded = matching_topdocs(&boolean_query_two_excluded);
assert_eq!(topdocs_excluded.len(), 1);
let (top_score, top_doc) = topdocs_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(top_score, score_doc_4);
}
}
#[test]
pub fn test_boolean_query_with_weight() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c"));
assert!(index_writer.commit().is_ok());
}
let term_a: Box<dyn Query> = Box::new(TermQuery::new(
Term::from_field_text(text_field, "a"),
IndexRecordOption::WithFreqs,
));
let term_b: Box<dyn Query> = Box::new(TermQuery::new(
Term::from_field_text(text_field, "b"),
IndexRecordOption::WithFreqs,
));
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let boolean_query =
BooleanQuery::from(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
let boolean_weight = boolean_query.weight(&searcher, true).unwrap();
{
let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 1.0)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 0.84163445);
}
{
let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 2.0)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 1.6832689);
}
}
#[test] #[test]
pub fn test_intersection_score() { pub fn test_intersection_score() {
let (index, text_field) = aux_test_helper(); let (index, text_field) = aux_test_helper();
@@ -285,9 +205,168 @@ mod tests {
(Occur::Must, make_term_query("a")), (Occur::Must, make_term_query("a")),
(Occur::Must, make_term_query("b")), (Occur::Must, make_term_query("b")),
]); ]);
let scores = score_docs(&boolean_query); assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
assert_nearly_equals!(scores[0], 0.977973);
assert_nearly_equals!(scores[1], 0.84699446);
} }
} }
// motivated by #554
#[test]
fn test_bm25_several_fields() {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(
// tf = 1 0
title => "Законы притяжения Оксана Кулакова",
// tf = 1 0
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
));
index_writer.add_document(doc!(
// tf = 1 0
title => "Любимые русские пироги (Оксана Путан)",
// tf = 2 0
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
));
index_writer.add_document(doc!(
// tf = 1 1
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
// tf = 0 0
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
));
for _ in 0..1_000 {
index_writer.add_document(doc!(
title => "a b d e f g",
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
));
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, text]);
let query = query_parser.parse_query("Оксана Лифенко").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
scorer.advance();
let explanation = query.explain(&searcher, DocAddress(0u32, 0u32)).unwrap();
assert_eq!(
explanation.to_pretty_json(),
r#"{
"value": 12.997711,
"description": "BooleanClause. Sum of ...",
"details": [
{
"value": 12.997711,
"description": "BooleanClause. Sum of ...",
"details": [
{
"value": 6.551476,
"description": "TermQuery, product of...",
"details": [
{
"value": 2.2,
"description": "(K1+1)"
},
{
"value": 5.658984,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
"details": [
{
"value": 3.0,
"description": "n, number of docs containing this term"
},
{
"value": 1003.0,
"description": "N, total number of docs"
}
]
},
{
"value": 0.5262329,
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
"details": [
{
"value": 1.0,
"description": "freq, occurrences of term within document"
},
{
"value": 1.2,
"description": "k1, term saturation parameter"
},
{
"value": 0.75,
"description": "b, length normalization parameter"
},
{
"value": 4.0,
"description": "dl, length of field"
},
{
"value": 5.997009,
"description": "avgdl, average length of field"
}
]
}
]
},
{
"value": 6.446235,
"description": "TermQuery, product of...",
"details": [
{
"value": 2.2,
"description": "(K1+1)"
},
{
"value": 5.9954567,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
"details": [
{
"value": 2.0,
"description": "n, number of docs containing this term"
},
{
"value": 1003.0,
"description": "N, total number of docs"
}
]
},
{
"value": 0.4887212,
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
"details": [
{
"value": 1.0,
"description": "freq, occurrences of term within document"
},
{
"value": 1.2,
"description": "k1, term saturation parameter"
},
{
"value": 0.75,
"description": "b, length normalization parameter"
},
{
"value": 20.0,
"description": "dl, length of field"
},
{
"value": 24.123629,
"description": "avgdl, average length of field"
}
]
}
]
}
]
}
]
}"#
);
}
} }


@@ -1,159 +0,0 @@
use crate::fastfield::DeleteBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
use std::collections::BTreeSet;
use std::fmt;
/// `BoostQuery` is a wrapper over a query used to boost its score.
///
/// The document set matched by the `BoostQuery` is strictly the same as the underlying query.
/// The score of each document, is the score of the underlying query multiplied by the `boost`
/// factor.
pub struct BoostQuery {
query: Box<dyn Query>,
boost: Score,
}
impl BoostQuery {
/// Builds a boost query.
pub fn new(query: Box<dyn Query>, boost: Score) -> BoostQuery {
BoostQuery { query, boost }
}
}
impl Clone for BoostQuery {
fn clone(&self) -> Self {
BoostQuery {
query: self.query.box_clone(),
boost: self.boost,
}
}
}
impl fmt::Debug for BoostQuery {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Boost(query={:?}, boost={})", self.query, self.boost)
}
}
impl Query for BoostQuery {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
let weight_without_boost = self.query.weight(searcher, scoring_enabled)?;
let boosted_weight = if scoring_enabled {
Box::new(BoostWeight::new(weight_without_boost, self.boost))
} else {
weight_without_boost
};
Ok(boosted_weight)
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
self.query.query_terms(term_set)
}
}
pub(crate) struct BoostWeight {
weight: Box<dyn Weight>,
boost: Score,
}
impl BoostWeight {
pub fn new(weight: Box<dyn Weight>, boost: Score) -> Self {
BoostWeight { weight, boost }
}
}
impl Weight for BoostWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
self.weight.scorer(reader, boost * self.boost)
}
fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
let mut explanation =
Explanation::new(format!("Boost x{} of ...", self.boost), scorer.score());
let underlying_explanation = self.weight.explain(reader, doc)?;
explanation.add_detail(underlying_explanation);
Ok(explanation)
}
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
self.weight.count(reader)
}
}
pub(crate) struct BoostScorer<S: Scorer> {
underlying: S,
boost: Score,
}
impl<S: Scorer> BoostScorer<S> {
pub fn new(underlying: S, boost: Score) -> BoostScorer<S> {
BoostScorer { underlying, boost }
}
}
impl<S: Scorer> DocSet for BoostScorer<S> {
fn advance(&mut self) -> DocId {
self.underlying.advance()
}
fn seek(&mut self, target: DocId) -> DocId {
self.underlying.seek(target)
}
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
self.underlying.fill_buffer(buffer)
}
fn doc(&self) -> u32 {
self.underlying.doc()
}
fn size_hint(&self) -> u32 {
self.underlying.size_hint()
}
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
self.underlying.count(delete_bitset)
}
fn count_including_deleted(&mut self) -> u32 {
self.underlying.count_including_deleted()
}
}
impl<S: Scorer> Scorer for BoostScorer<S> {
fn score(&mut self) -> Score {
self.underlying.score() * self.boost
}
}
#[cfg(test)]
mod tests {
use super::BoostQuery;
use crate::query::{AllQuery, Query};
use crate::schema::Schema;
use crate::{DocAddress, Document, Index};
#[test]
fn test_boost_query_explain() {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::new());
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = BoostQuery::new(Box::new(AllQuery), 0.2);
let explanation = query.explain(&searcher, DocAddress(0, 0u32)).unwrap();
assert_eq!(
explanation.to_pretty_json(),
"{\n \"value\": 0.2,\n \"description\": \"Boost x0.2 of ...\",\n \"details\": [\n {\n \"value\": 1.0,\n \"description\": \"AllQuery\"\n }\n ]\n}"
)
}
}
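
`BoostWeight::scorer` above just forwards `boost * self.boost` to the wrapped weight, so nested boosts compose multiplicatively down the weight tree. A toy sketch of that delegation pattern (the types below are illustrative, not tantivy's):

```rust
struct Leaf;
struct Boost<W> {
    inner: W,
    boost: f32,
}

trait ToyWeight {
    // Analogue of Weight::scorer(reader, boost): returns the effective factor.
    fn scorer(&self, boost: f32) -> f32;
}

impl ToyWeight for Leaf {
    fn scorer(&self, boost: f32) -> f32 {
        boost
    }
}

impl<W: ToyWeight> ToyWeight for Boost<W> {
    fn scorer(&self, boost: f32) -> f32 {
        // Exactly the forwarding done by BoostWeight: boost * self.boost.
        self.inner.scorer(boost * self.boost)
    }
}

fn main() {
    // Boost(0.5) wrapping Boost(4.0) wrapping a leaf: effective factor 2.0.
    let weight = Boost {
        boost: 0.5,
        inner: Boost { boost: 4.0, inner: Leaf },
    };
    assert_eq!(weight.scorer(1.0), 2.0);
}
```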


@@ -1,5 +1,4 @@
 use super::Scorer;
-use crate::docset::TERMINATED;
 use crate::query::explanation::does_not_match;
 use crate::query::Weight;
 use crate::query::{Explanation, Query};
@@ -34,7 +33,7 @@ impl Query for EmptyQuery {
 /// It is useful for tests and handling edge cases.
 pub struct EmptyWeight;
 impl Weight for EmptyWeight {
-    fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result<Box<dyn Scorer>> {
+    fn scorer(&self, _reader: &SegmentReader) -> crate::Result<Box<dyn Scorer>> {
         Ok(Box::new(EmptyScorer))
     }
@@ -49,12 +48,15 @@ impl Weight for EmptyWeight {
 pub struct EmptyScorer;
 impl DocSet for EmptyScorer {
-    fn advance(&mut self) -> DocId {
-        TERMINATED
+    fn advance(&mut self) -> bool {
+        false
     }

     fn doc(&self) -> DocId {
-        TERMINATED
+        panic!(
+            "You may not call .doc() on a scorer \
+             where the last call to advance() did not return true."
+        );
     }

     fn size_hint(&self) -> u32 {
@@ -64,21 +66,24 @@ impl DocSet for EmptyScorer {
 impl Scorer for EmptyScorer {
     fn score(&mut self) -> Score {
-        0.0
+        0f32
     }
 }

 #[cfg(test)]
 mod tests {
-    use crate::docset::TERMINATED;
     use crate::query::EmptyScorer;
     use crate::DocSet;

     #[test]
     fn test_empty_scorer() {
         let mut empty_scorer = EmptyScorer;
-        assert_eq!(empty_scorer.doc(), TERMINATED);
-        assert_eq!(empty_scorer.advance(), TERMINATED);
-        assert_eq!(empty_scorer.doc(), TERMINATED);
+        assert!(!empty_scorer.advance());
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_empty_scorer_panic_on_doc_call() {
+        EmptyScorer.doc();
     }
 }
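
On this branch, `advance()` returns a `bool` and `doc()` is only valid once `advance()` has returned `true`. A generic driving loop under that cursor contract looks like the sketch below; `EmptyScorer` is the one docset guaranteed never to reach `doc()`:

```rust
use tantivy::query::EmptyScorer;
use tantivy::{DocId, DocSet};

// Collect every doc id a DocSet yields under the advance()/doc()
// cursor contract used on this branch.
fn collect_docs<D: DocSet>(docset: &mut D) -> Vec<DocId> {
    let mut docs = Vec::new();
    while docset.advance() {
        docs.push(docset.doc());
    }
    docs
}

fn main() {
    // EmptyScorer::advance() is always false, so doc() is never called
    // and its panic path is never hit.
    assert!(collect_docs(&mut EmptyScorer).is_empty());
}
```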

View File

@@ -1,11 +1,12 @@
-use crate::docset::{DocSet, TERMINATED};
+use crate::docset::{DocSet, SkipResult};
 use crate::query::Scorer;
 use crate::DocId;
 use crate::Score;

-#[inline(always)]
-fn is_within<TDocSetExclude: DocSet>(docset: &mut TDocSetExclude, doc: DocId) -> bool {
-    docset.doc() <= doc && docset.seek(doc) == doc
+#[derive(Clone, Copy, Debug)]
+enum State {
+    ExcludeOne(DocId),
+    Finished,
 }
@@ -14,6 +15,29 @@ fn is_within<TDocSetExclude: DocSet>(docset: &mut TDocSetExclude, doc: DocId) ->
 /// Filters a given `DocSet` by removing the docs from a given `DocSet`.
 pub struct Exclude<TDocSet, TDocSetExclude> {
     underlying_docset: TDocSet,
     excluding_docset: TDocSetExclude,
+    excluding_state: State,
+}
+
+impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
+where
+    TDocSetExclude: DocSet,
+{
+    /// Creates a new `ExcludeScorer`
+    pub fn new(
+        underlying_docset: TDocSet,
+        mut excluding_docset: TDocSetExclude,
+    ) -> Exclude<TDocSet, TDocSetExclude> {
+        let state = if excluding_docset.advance() {
+            State::ExcludeOne(excluding_docset.doc())
+        } else {
+            State::Finished
+        };
+        Exclude {
+            underlying_docset,
+            excluding_docset,
+            excluding_state: state,
+        }
+    }
 }

 impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
@@ -21,21 +45,33 @@ where
     TDocSet: DocSet,
     TDocSetExclude: DocSet,
 {
-    /// Creates a new `ExcludeScorer`
-    pub fn new(
-        mut underlying_docset: TDocSet,
-        mut excluding_docset: TDocSetExclude,
-    ) -> Exclude<TDocSet, TDocSetExclude> {
-        while underlying_docset.doc() != TERMINATED {
-            let target = underlying_docset.doc();
-            if !is_within(&mut excluding_docset, target) {
-                break;
-            }
-            underlying_docset.advance();
-        }
-        Exclude {
-            underlying_docset,
-            excluding_docset,
-        }
-    }
+    /// Returns true iff the doc is not removed.
+    ///
+    /// The method has to be called with non strictly
+    /// increasing `doc`.
+    fn accept(&mut self) -> bool {
+        let doc = self.underlying_docset.doc();
+        match self.excluding_state {
+            State::ExcludeOne(excluded_doc) => {
+                if doc == excluded_doc {
+                    return false;
+                }
+                if excluded_doc > doc {
+                    return true;
+                }
+                match self.excluding_docset.skip_next(doc) {
+                    SkipResult::OverStep => {
+                        self.excluding_state = State::ExcludeOne(self.excluding_docset.doc());
+                        true
+                    }
+                    SkipResult::End => {
+                        self.excluding_state = State::Finished;
+                        true
+                    }
+                    SkipResult::Reached => false,
+                }
+            }
+            State::Finished => true,
+        }
+    }
 }
@@ -45,27 +81,27 @@ where
     TDocSet: DocSet,
     TDocSetExclude: DocSet,
 {
-    fn advance(&mut self) -> DocId {
-        loop {
-            let candidate = self.underlying_docset.advance();
-            if candidate == TERMINATED {
-                return TERMINATED;
-            }
-            if !is_within(&mut self.excluding_docset, candidate) {
-                return candidate;
-            }
-        }
+    fn advance(&mut self) -> bool {
+        while self.underlying_docset.advance() {
+            if self.accept() {
+                return true;
+            }
+        }
+        false
     }

-    fn seek(&mut self, target: DocId) -> DocId {
-        let candidate = self.underlying_docset.seek(target);
-        if candidate == TERMINATED {
-            return TERMINATED;
-        }
-        if !is_within(&mut self.excluding_docset, candidate) {
-            return candidate;
-        }
-        self.advance()
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        let underlying_skip_result = self.underlying_docset.skip_next(target);
+        if underlying_skip_result == SkipResult::End {
+            return SkipResult::End;
+        }
+        if self.accept() {
+            underlying_skip_result
+        } else if self.advance() {
+            SkipResult::OverStep
+        } else {
+            SkipResult::End
+        }
     }

     fn doc(&self) -> DocId {
@@ -105,9 +141,8 @@ mod tests {
             VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
         );
         let mut els = vec![];
-        while exclude_scorer.doc() != TERMINATED {
+        while exclude_scorer.advance() {
             els.push(exclude_scorer.doc());
-            exclude_scorer.advance();
         }
         assert_eq!(els, vec![5, 8, 15]);
     }
@@ -121,7 +156,7 @@ mod tests {
                 VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
             ))
         },
-        vec![5, 8, 10, 15, 24],
+        vec![1, 2, 5, 8, 10, 15, 24],
     );
 }
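
A hand-driven sketch of the rewritten `Exclude`, mirroring the test above. `VecDocSet` is the crate-internal test helper, so the paths assume we are inside the tantivy crate, and the underlying docset here is an assumed input chosen so that excluding {1, 2, 3, 10, 16, 24} leaves exactly [5, 8, 15]:

```rust
// Crate-internal sketch (VecDocSet and Exclude::new as used in the tests).
use crate::query::{Exclude, VecDocSet};
use crate::DocSet;

#[test]
fn exclude_docset_by_hand() {
    // The underlying docs are assumed; the excluding set matches the test above.
    let mut excluded = Exclude::new(
        VecDocSet::from(vec![1, 2, 5, 8, 10, 15, 24]),
        VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
    );
    let mut docs = Vec::new();
    // accept() is consulted once per advance(), with non-decreasing doc ids.
    while excluded.advance() {
        docs.push(excluded.doc());
    }
    assert_eq!(docs, vec![5, 8, 15]);
}
```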

View File

@@ -1,6 +1,4 @@
-use crate::{DocId, Score, TantivyError};
-use serde::Serialize;
-use std::fmt;
+use crate::{DocId, TantivyError};

 pub(crate) fn does_not_match(doc: DocId) -> TantivyError {
     TantivyError::InvalidArgument(format!("Document #({}) does not match", doc))
@@ -13,21 +11,15 @@ pub(crate) fn does_not_match(doc: DocId) -> TantivyError {
 /// representation of this tree when debugging a given score.
 #[derive(Clone, Serialize)]
 pub struct Explanation {
-    value: Score,
+    value: f32,
     description: String,
     #[serde(skip_serializing_if = "Vec::is_empty")]
     details: Vec<Explanation>,
 }

-impl fmt::Debug for Explanation {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "Explanation({})", self.to_pretty_json())
-    }
-}
-
 impl Explanation {
     /// Creates a new explanation object.
-    pub fn new<T: ToString>(description: T, value: Score) -> Explanation {
+    pub fn new<T: ToString>(description: T, value: f32) -> Explanation {
         Explanation {
             value,
             description: description.to_string(),
@@ -36,7 +28,7 @@ impl Explanation {
     }

     /// Returns the value associated to the current node.
-    pub fn value(&self) -> Score {
+    pub fn value(&self) -> f32 {
         self.value
     }
@@ -48,7 +40,7 @@ impl Explanation {
     }

     /// Shortcut for `self.details.push(Explanation::new(name, value));`
-    pub fn add_const<T: ToString>(&mut self, name: T, value: Score) {
+    pub fn add_const<T: ToString>(&mut self, name: T, value: f32) {
         self.details.push(Explanation::new(name, value));
     }

View File

@@ -2,40 +2,14 @@ use crate::query::{AutomatonWeight, Query, Weight};
 use crate::schema::Term;
 use crate::Searcher;
 use crate::TantivyError::InvalidArgument;
-use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
+use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
 use once_cell::sync::Lazy;
 use std::collections::HashMap;
 use std::ops::Range;
-use tantivy_fst::Automaton;
-
-pub(crate) struct DFAWrapper(pub DFA);
-
-impl Automaton for DFAWrapper {
-    type State = u32;
-
-    fn start(&self) -> Self::State {
-        self.0.initial_state()
-    }
-
-    fn is_match(&self, state: &Self::State) -> bool {
-        match self.0.distance(*state) {
-            Distance::Exact(_) => true,
-            Distance::AtLeast(_) => false,
-        }
-    }
-
-    fn can_match(&self, state: &u32) -> bool {
-        *state != levenshtein_automata::SINK_STATE
-    }
-
-    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
-        self.0.transition(*state, byte)
-    }
-}

 /// A range of Levenshtein distances that we will build DFAs for our terms
 /// The computation is exponential, so best keep it to low single digits
-const VALID_LEVENSHTEIN_DISTANCE_RANGE: Range<u8> = 0..3;
+const VALID_LEVENSHTEIN_DISTANCE_RANGE: Range<u8> = (0..3);

 static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Lazy::new(|| {
     let mut lev_builder_cache = HashMap::new();
@@ -117,7 +91,7 @@ impl FuzzyTermQuery {
         }
     }

-    /// Creates a new Fuzzy Query of the Term prefix
+    /// Creates a new Fuzzy Query that treats transpositions as cost one rather than two
     pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
         FuzzyTermQuery {
             term,
@@ -127,20 +101,13 @@
         }
     }

-    fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DFAWrapper>> {
+    fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DFA>> {
         // LEV_BUILDER is a HashMap, whose `get` method returns an Option
         match LEV_BUILDER.get(&(self.distance, false)) {
             // Unwrap the option and build the Ok(AutomatonWeight)
             Some(automaton_builder) => {
-                let automaton = if self.prefix {
-                    automaton_builder.build_prefix_dfa(self.term.text())
-                } else {
-                    automaton_builder.build_dfa(self.term.text())
-                };
-                Ok(AutomatonWeight::new(
-                    self.term.field(),
-                    DFAWrapper(automaton),
-                ))
+                let automaton = automaton_builder.build_dfa(self.term.text());
+                Ok(AutomatonWeight::new(self.term.field(), automaton))
             }
             None => Err(InvalidArgument(format!(
                 "Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
@@ -163,10 +130,10 @@ impl Query for FuzzyTermQuery {
 #[cfg(test)]
 mod test {
     use super::FuzzyTermQuery;
-    use crate::assert_nearly_equals;
     use crate::collector::TopDocs;
     use crate::schema::Schema;
     use crate::schema::TEXT;
+    use crate::tests::assert_nearly_equals;
     use crate::Index;
     use crate::Term;
@@ -177,7 +144,7 @@
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests().unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
             index_writer.add_document(doc!(
                 country_field => "japan",
             ));
@@ -188,8 +155,6 @@
         }
         let reader = index.reader().unwrap();
         let searcher = reader.searcher();
-
-        // passes because Levenshtein distance is 1 (substitute 'o' with 'a')
         {
             let term = Term::from_field_text(country_field, "japon");
@@ -199,31 +164,7 @@
                 .unwrap();
             assert_eq!(top_docs.len(), 1, "Expected only 1 document");
             let (score, _) = top_docs[0];
-            assert_nearly_equals!(1.0, score);
-        }
-
-        // fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
-        {
-            let term = Term::from_field_text(country_field, "jap");
-            let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
-            let top_docs = searcher
-                .search(&fuzzy_query, &TopDocs::with_limit(2))
-                .unwrap();
-            assert_eq!(top_docs.len(), 0, "Expected no document");
-        }
-
-        // passes because prefix Levenshtein distance is 0
-        {
-            let term = Term::from_field_text(country_field, "jap");
-            let fuzzy_query = FuzzyTermQuery::new_prefix(term, 1, true);
-            let top_docs = searcher
-                .search(&fuzzy_query, &TopDocs::with_limit(2))
-                .unwrap();
-            assert_eq!(top_docs.len(), 1, "Expected only 1 document");
-            let (score, _) = top_docs[0];
-            assert_nearly_equals!(1.0, score);
+            assert_nearly_equals(1f32, score);
         }
     }
 }
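
Assembled from the test above, a self-contained sketch of issuing a `FuzzyTermQuery` on this branch (the field name and the writer's memory budget are illustrative):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let country_field = schema_builder.add_text_field("country", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
    writer.add_document(doc!(country_field => "japan"));
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // "japon" is one substitution away from "japan", so distance 1 matches.
    let term = Term::from_field_text(country_field, "japon");
    let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
    let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
    assert_eq!(top_docs.len(), 1);
    Ok(())
}
```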

View File

@@ -1,4 +1,4 @@
-use crate::docset::{DocSet, TERMINATED};
+use crate::docset::{DocSet, SkipResult};
 use crate::query::term_query::TermScorer;
 use crate::query::EmptyScorer;
 use crate::query::Scorer;
@@ -20,14 +20,12 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
     if scorers.len() == 1 {
         return scorers.pop().unwrap();
     }
-    scorers.sort_by_key(|scorer| scorer.size_hint());
-    let doc = go_to_first_doc(&mut scorers[..]);
-    if doc == TERMINATED {
-        return Box::new(EmptyScorer);
-    }
     // We know that we have at least 2 elements.
-    let left = scorers.remove(0);
-    let right = scorers.remove(0);
+    let num_docsets = scorers.len();
+    scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
+    let left = scorers.pop().unwrap();
+    let right = scorers.pop().unwrap();
+    scorers.reverse();
     let all_term_scorers = [&left, &right]
         .iter()
         .all(|&scorer| scorer.is::<TermScorer>());
@@ -36,12 +34,14 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>) -> Box<dyn Scorer> {
             left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
             right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
             others: scorers,
+            num_docsets,
         });
     }
     Box::new(Intersection {
         left,
         right,
         others: scorers,
+        num_docsets,
     })
 }
@@ -50,35 +50,22 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
     left: TDocSet,
     right: TDocSet,
     others: Vec<TOtherDocSet>,
+    num_docsets: usize,
 }
-
-fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
-    assert!(!docsets.is_empty());
-    let mut candidate = docsets.iter().map(TDocSet::doc).max().unwrap();
-    'outer: loop {
-        for docset in docsets.iter_mut() {
-            let seek_doc = docset.seek(candidate);
-            if seek_doc > candidate {
-                candidate = docset.doc();
-                continue 'outer;
-            }
-        }
-        return candidate;
-    }
-}

 impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
     pub(crate) fn new(mut docsets: Vec<TDocSet>) -> Intersection<TDocSet, TDocSet> {
         let num_docsets = docsets.len();
         assert!(num_docsets >= 2);
-        docsets.sort_by_key(|docset| docset.size_hint());
-        go_to_first_doc(&mut docsets);
-        let left = docsets.remove(0);
-        let right = docsets.remove(0);
+        docsets.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
+        let left = docsets.pop().unwrap();
+        let right = docsets.pop().unwrap();
+        docsets.reverse();
         Intersection {
             left,
             right,
             others: docsets,
+            num_docsets,
         }
     }
 }
@@ -93,49 +80,128 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
     }
 }

+impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet> {
+    pub(crate) fn docset_mut(&mut self, ord: usize) -> &mut dyn DocSet {
+        match ord {
+            0 => &mut self.left,
+            1 => &mut self.right,
+            n => &mut self.others[n - 2],
+        }
+    }
+}
+
 impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOtherDocSet> {
-    fn advance(&mut self) -> DocId {
+    fn advance(&mut self) -> bool {
         let (left, right) = (&mut self.left, &mut self.right);
-        let mut candidate = left.advance();
+        if !left.advance() {
+            return false;
+        }
+        let mut candidate = left.doc();
+        let mut other_candidate_ord: usize = usize::max_value();
         'outer: loop {
             // In the first part we look for a document in the intersection
             // of the two rarest `DocSet` in the intersection.
             loop {
-                let right_doc = right.seek(candidate);
-                candidate = left.seek(right_doc);
-                if candidate == right_doc {
-                    break;
+                match right.skip_next(candidate) {
+                    SkipResult::Reached => {
+                        break;
+                    }
+                    SkipResult::OverStep => {
+                        candidate = right.doc();
+                        other_candidate_ord = usize::max_value();
+                    }
+                    SkipResult::End => {
+                        return false;
+                    }
+                }
+                match left.skip_next(candidate) {
+                    SkipResult::Reached => {
+                        break;
+                    }
+                    SkipResult::OverStep => {
+                        candidate = left.doc();
+                        other_candidate_ord = usize::max_value();
+                    }
+                    SkipResult::End => {
+                        return false;
+                    }
                 }
             }
-            debug_assert_eq!(left.doc(), right.doc());

             // test the remaining scorers;
-            for docset in self.others.iter_mut() {
-                let seek_doc = docset.seek(candidate);
-                if seek_doc > candidate {
-                    candidate = left.seek(seek_doc);
-                    continue 'outer;
+            for (ord, docset) in self.others.iter_mut().enumerate() {
+                if ord == other_candidate_ord {
+                    continue;
+                }
+                // `candidate_ord` is already at the
+                // right position.
+                //
+                // Calling `skip_next` would advance this docset
+                // and miss it.
+                match docset.skip_next(candidate) {
+                    SkipResult::Reached => {}
+                    SkipResult::OverStep => {
+                        // this is not in the intersection,
+                        // let's update our candidate.
+                        candidate = docset.doc();
+                        match left.skip_next(candidate) {
+                            SkipResult::Reached => {
+                                other_candidate_ord = ord;
+                            }
+                            SkipResult::OverStep => {
+                                candidate = left.doc();
+                                other_candidate_ord = usize::max_value();
+                            }
+                            SkipResult::End => {
+                                return false;
+                            }
+                        }
+                        continue 'outer;
+                    }
+                    SkipResult::End => {
+                        return false;
+                    }
                 }
             }
-            debug_assert_eq!(candidate, self.left.doc());
-            debug_assert_eq!(candidate, self.right.doc());
-            debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
-            return candidate;
+            return true;
         }
     }

-    fn seek(&mut self, target: DocId) -> DocId {
-        self.left.seek(target);
-        let mut docsets: Vec<&mut dyn DocSet> = vec![&mut self.left, &mut self.right];
-        for docset in &mut self.others {
-            docsets.push(docset);
-        }
-        let doc = go_to_first_doc(&mut docsets[..]);
-        debug_assert!(docsets.iter().all(|docset| docset.doc() == doc));
-        debug_assert!(doc >= target);
-        doc
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        // We optimize skipping by skipping every single member
+        // of the intersection to target.
+        let mut current_target: DocId = target;
+        let mut current_ord = self.num_docsets;
+        'outer: loop {
+            for ord in 0..self.num_docsets {
+                let docset = self.docset_mut(ord);
+                if ord == current_ord {
+                    continue;
+                }
+                match docset.skip_next(current_target) {
+                    SkipResult::End => {
+                        return SkipResult::End;
+                    }
+                    SkipResult::OverStep => {
+                        // update the target
+                        // for the remaining members of the intersection.
+                        current_target = docset.doc();
+                        current_ord = ord;
+                        continue 'outer;
+                    }
+                    SkipResult::Reached => {}
+                }
+            }
+            if target == current_target {
+                return SkipResult::Reached;
+            } else {
+                assert!(current_target > target);
+                return SkipResult::OverStep;
+            }
+        }
     }

     fn doc(&self) -> DocId {
@@ -162,7 +228,7 @@
 #[cfg(test)]
 mod tests {
     use super::Intersection;
-    use crate::docset::{DocSet, TERMINATED};
+    use crate::docset::{DocSet, SkipResult};
     use crate::postings::tests::test_skip_against_unoptimized;
     use crate::query::VecDocSet;
@@ -172,18 +238,20 @@
         let left = VecDocSet::from(vec![1, 3, 9]);
         let right = VecDocSet::from(vec![3, 4, 9, 18]);
         let mut intersection = Intersection::new(vec![left, right]);
+        assert!(intersection.advance());
         assert_eq!(intersection.doc(), 3);
-        assert_eq!(intersection.advance(), 9);
+        assert!(intersection.advance());
         assert_eq!(intersection.doc(), 9);
-        assert_eq!(intersection.advance(), TERMINATED);
+        assert!(!intersection.advance());
     }
     {
         let a = VecDocSet::from(vec![1, 3, 9]);
         let b = VecDocSet::from(vec![3, 4, 9, 18]);
         let c = VecDocSet::from(vec![1, 5, 9, 111]);
         let mut intersection = Intersection::new(vec![a, b, c]);
+        assert!(intersection.advance());
         assert_eq!(intersection.doc(), 9);
-        assert_eq!(intersection.advance(), TERMINATED);
+        assert!(!intersection.advance());
     }
 }
@@ -192,8 +260,8 @@
     let left = VecDocSet::from(vec![0]);
     let right = VecDocSet::from(vec![0]);
     let mut intersection = Intersection::new(vec![left, right]);
+    assert!(intersection.advance());
     assert_eq!(intersection.doc(), 0);
-    assert_eq!(intersection.advance(), TERMINATED);
 }

 #[test]
@@ -201,7 +269,7 @@
     let left = VecDocSet::from(vec![0, 1, 2, 4]);
     let right = VecDocSet::from(vec![2, 5]);
     let mut intersection = Intersection::new(vec![left, right]);
-    assert_eq!(intersection.seek(2), 2);
+    assert_eq!(intersection.skip_next(2), SkipResult::Reached);
     assert_eq!(intersection.doc(), 2);
 }
@@ -244,7 +312,7 @@
     let a = VecDocSet::from(vec![1, 3]);
     let b = VecDocSet::from(vec![1, 4]);
     let c = VecDocSet::from(vec![3, 9]);
-    let intersection = Intersection::new(vec![a, b, c]);
-    assert_eq!(intersection.doc(), TERMINATED);
+    let mut intersection = Intersection::new(vec![a, b, c]);
+    assert!(!intersection.advance());
 }
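
Both versions of `advance`/`skip_next` above follow the classic leapfrog pattern: hold a candidate doc, skip every docset to it, and raise the candidate whenever one overshoots. A toy standalone version over sorted slices (plain Rust, not the tantivy API) makes the loop easier to follow:

```rust
// Toy leapfrog intersection over sorted, deduplicated doc id lists.
fn leapfrog_intersection(lists: &[&[u32]]) -> Vec<u32> {
    assert!(!lists.is_empty());
    let mut out = Vec::new();
    let mut cursors = vec![0usize; lists.len()];
    let mut candidate = 0u32;
    'outer: loop {
        for (i, list) in lists.iter().enumerate() {
            // Seek list i to the first element >= candidate.
            while cursors[i] < list.len() && list[cursors[i]] < candidate {
                cursors[i] += 1;
            }
            match list.get(cursors[i]) {
                // One list is exhausted: the intersection is done.
                None => break 'outer,
                // Overshoot: raise the candidate and re-check every list.
                Some(&doc) if doc > candidate => {
                    candidate = doc;
                    continue 'outer;
                }
                // Reached: this list agrees, keep checking the others.
                Some(_) => {}
            }
        }
        out.push(candidate); // every list agreed on the candidate
        candidate += 1;      // look for the next intersection doc
    }
    out
}

fn main() {
    // Same fixtures as the tests above.
    assert_eq!(leapfrog_intersection(&[&[1, 3, 9], &[3, 4, 9, 18]]), vec![3, 9]);
    assert_eq!(
        leapfrog_intersection(&[&[1, 3, 9], &[3, 4, 9, 18], &[1, 5, 9, 111]]),
        vec![9]
    );
}
```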

View File

@@ -1,11 +1,12 @@
-/*! Query Module */
+/*!
+Query
+*/
 mod all_query;
 mod automaton_weight;
 mod bitset;
 mod bm25;
 mod boolean_query;
-mod boost_query;
 mod empty_query;
 mod exclude;
 mod explanation;
@@ -26,7 +27,6 @@ mod weight;
 mod vec_docset;

 pub(crate) mod score_combiner;
-pub(crate) use self::bm25::BM25Weight;

 pub use self::intersection::Intersection;
 pub use self::union::Union;
@@ -37,12 +37,9 @@ pub use self::all_query::{AllQuery, AllScorer, AllWeight};
 pub use self::automaton_weight::AutomatonWeight;
 pub use self::bitset::BitSetDocSet;
 pub use self::boolean_query::BooleanQuery;
-pub use self::boost_query::BoostQuery;
 pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
 pub use self::exclude::Exclude;
 pub use self::explanation::Explanation;
-#[cfg(test)]
-pub(crate) use self::fuzzy_query::DFAWrapper;
 pub use self::fuzzy_query::FuzzyTermQuery;
 pub use self::intersection::intersect_scorers;
 pub use self::phrase_query::PhraseQuery;

View File

@@ -7,24 +7,24 @@ pub use self::phrase_scorer::PhraseScorer;
 pub use self::phrase_weight::PhraseWeight;

 #[cfg(test)]
-pub mod tests {
+mod tests {
     use super::*;
-    use crate::assert_nearly_equals;
     use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
     use crate::core::Index;
-    use crate::query::Weight;
+    use crate::error::TantivyError;
     use crate::schema::{Schema, Term, TEXT};
+    use crate::tests::assert_nearly_equals;
     use crate::DocId;
-    use crate::{DocAddress, TERMINATED};
+    use crate::{DocAddress, DocSet};

-    pub fn create_index(texts: &[&'static str]) -> Index {
+    fn create_index(texts: &[&'static str]) -> Index {
         let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests().unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
             for &text in texts {
                 let doc = doc!(text_field=>text);
                 index_writer.add_document(doc);
@@ -61,30 +61,13 @@ pub mod tests {
             .map(|docaddr| docaddr.1)
             .collect::<Vec<_>>()
     };
-    assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
     assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
+    assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
     assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
     assert!(test_query(vec!["g", "ewrwer"]).is_empty());
     assert!(test_query(vec!["g", "a"]).is_empty());
 }

-#[test]
-pub fn test_phrase_query_simple() -> crate::Result<()> {
-    let index = create_index(&["a b b d c g c", "a b a b c"]);
-    let text_field = index.schema().get_field("text").unwrap();
-    let searcher = index.reader()?.searcher();
-    let terms: Vec<Term> = vec!["a", "b", "c"]
-        .iter()
-        .map(|text| Term::from_field_text(text_field, text))
-        .collect();
-    let phrase_query = PhraseQuery::new(terms);
-    let phrase_weight = phrase_query.phrase_weight(&searcher, false)?;
-    let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0)?;
-    assert_eq!(phrase_scorer.doc(), 1);
-    assert_eq!(phrase_scorer.advance(), TERMINATED);
-    Ok(())
-}
-
 #[test]
 pub fn test_phrase_query_no_score() {
     let index = create_index(&[
@@ -119,6 +102,30 @@ pub mod tests {
     assert!(test_query(vec!["g", "a"]).is_empty());
 }

+#[test]
+pub fn test_phrase_count() {
+    let index = create_index(&["a c", "a a b d a b c", " a b"]);
+    let schema = index.schema();
+    let text_field = schema.get_field("text").unwrap();
+    let searcher = index.reader().unwrap().searcher();
+    let phrase_query = PhraseQuery::new(vec![
+        Term::from_field_text(text_field, "a"),
+        Term::from_field_text(text_field, "b"),
+    ]);
+    let phrase_weight = phrase_query.phrase_weight(&searcher, true).unwrap();
+    let mut phrase_scorer = phrase_weight
+        .phrase_scorer(searcher.segment_reader(0u32))
+        .unwrap()
+        .unwrap();
+    assert!(phrase_scorer.advance());
+    assert_eq!(phrase_scorer.doc(), 1);
+    assert_eq!(phrase_scorer.phrase_count(), 2);
+    assert!(phrase_scorer.advance());
+    assert_eq!(phrase_scorer.doc(), 2);
+    assert_eq!(phrase_scorer.phrase_count(), 1);
+    assert!(!phrase_scorer.advance());
+}
+
 #[test]
 pub fn test_phrase_query_no_positions() {
     let mut schema_builder = Schema::builder();
@@ -135,7 +142,7 @@
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
     {
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(text_field=>"a b c"));
         assert!(index_writer.commit().is_ok());
     }
@@ -144,16 +151,21 @@
         Term::from_field_text(text_field, "a"),
         Term::from_field_text(text_field, "b"),
     ]);
-    let search_result = searcher
+    match searcher
         .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
-        .map(|_| ());
-    assert!(matches!(
-        search_result,
-        Err(crate::TantivyError::SchemaError(msg))
-        if msg == "Applied phrase query on field \"text\", which does not have positions \
-        indexed"
-    ));
+        .map(|_| ())
+        .unwrap_err()
+    {
+        TantivyError::SchemaError(ref msg) => {
+            assert_eq!(
+                "Applied phrase query on field \"text\", which does not have positions indexed",
+                msg.as_str()
+            );
+        }
+        _ => {
+            panic!("Should have returned an error");
+        }
+    }
 }
@@ -175,8 +187,8 @@
         .to_vec()
     };
     let scores = test_query(vec!["a", "b"]);
-    assert_nearly_equals!(scores[0], 0.40618482);
-    assert_nearly_equals!(scores[1], 0.46844664);
+    assert_nearly_equals(scores[0], 0.40618482);
+    assert_nearly_equals(scores[1], 0.46844664);
 }

 #[test] // motivated by #234
@@ -186,7 +198,7 @@
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
     {
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(text_field=>"b"));
         index_writer.add_document(doc!(text_field=>"a b"));
         index_writer.add_document(doc!(text_field=>"b a"));
@@ -217,7 +229,7 @@
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
     {
-        let mut index_writer = index.writer_for_tests().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
         assert!(index_writer.commit().is_ok());
     }

View File

@@ -1,9 +1,9 @@
-use crate::docset::{DocSet, TERMINATED};
+use crate::docset::{DocSet, SkipResult};
 use crate::fieldnorm::FieldNormReader;
 use crate::postings::Postings;
 use crate::query::bm25::BM25Weight;
 use crate::query::{Intersection, Scorer};
-use crate::{DocId, Score};
+use crate::DocId;
 use std::cmp::Ordering;

 struct PostingsWithOffset<TPostings> {
@@ -25,12 +25,12 @@ impl<TPostings: Postings> PostingsWithOffset<TPostings> {
 }

 impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
-    fn advance(&mut self) -> DocId {
+    fn advance(&mut self) -> bool {
         self.postings.advance()
     }

-    fn seek(&mut self, target: DocId) -> DocId {
-        self.postings.seek(target)
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        self.postings.skip_next(target)
     }

     fn doc(&self) -> DocId {
@@ -149,7 +149,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
                 PostingsWithOffset::new(postings, (max_offset - offset) as u32)
             })
             .collect::<Vec<_>>();
-        let mut scorer = PhraseScorer {
+        PhraseScorer {
             intersection_docset: Intersection::new(postings_with_offsets),
             num_terms: num_docsets,
             left: Vec::with_capacity(100),
@@ -158,11 +158,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
             similarity_weight,
             fieldnorm_reader,
             score_needed,
-        };
-        if scorer.doc() != TERMINATED && !scorer.phrase_match() {
-            scorer.advance();
         }
-        scorer
     }

     pub fn phrase_count(&self) -> u32 {
@@ -229,22 +225,31 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
 }

 impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
-    fn advance(&mut self) -> DocId {
-        loop {
-            let doc = self.intersection_docset.advance();
-            if doc == TERMINATED || self.phrase_match() {
-                return doc;
+    fn advance(&mut self) -> bool {
+        while self.intersection_docset.advance() {
+            if self.phrase_match() {
+                return true;
             }
         }
+        false
     }

-    fn seek(&mut self, target: DocId) -> DocId {
-        debug_assert!(target >= self.doc());
-        let doc = self.intersection_docset.seek(target);
-        if doc == TERMINATED || self.phrase_match() {
-            return doc;
-        }
-        self.advance()
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        if self.intersection_docset.skip_next(target) == SkipResult::End {
+            return SkipResult::End;
+        }
+        if self.phrase_match() {
+            if self.doc() == target {
+                return SkipResult::Reached;
+            } else {
+                return SkipResult::OverStep;
+            }
+        }
+        if self.advance() {
+            SkipResult::OverStep
+        } else {
+            SkipResult::End
+        }
     }

     fn doc(&self) -> DocId {
@@ -257,7 +262,7 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
 }

 impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
-    fn score(&mut self) -> Score {
+    fn score(&mut self) -> f32 {
         let doc = self.doc();
         let fieldnorm_id = self.fieldnorm_reader.fieldnorm_id(doc);
         self.similarity_weight
@@ -267,6 +272,7 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
 #[cfg(test)]
 mod tests {
     use super::{intersection, intersection_count};
+
     fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {
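
The `phrase_match` machinery reduces to intersecting per-term position lists after shifting each by its offset in the phrase, and `phrase_count` is the size of that intersection. A toy version of the counting kernel, in the spirit of the `intersection_count` helper the tests reference:

```rust
use std::cmp::Ordering;

// Count common values in two sorted position lists; with term positions
// shifted by their offset in the phrase, each common value is one
// occurrence of the whole phrase.
fn intersection_count(left: &[u32], right: &[u32]) -> usize {
    let (mut i, mut j, mut count) = (0, 0, 0);
    while i < left.len() && j < right.len() {
        match left[i].cmp(&right[j]) {
            Ordering::Less => i += 1,
            Ordering::Greater => j += 1,
            Ordering::Equal => {
                count += 1;
                i += 1;
                j += 1;
            }
        }
    }
    count
}

fn main() {
    // "a b" in "a a b d a b c": "a" occurs at [0, 1, 4] and "b" at [2, 5];
    // shifting "b" left by one gives [1, 4], so the phrase occurs twice,
    // matching phrase_count() == 2 in the test above.
    assert_eq!(intersection_count(&[0, 1, 4], &[1, 4]), 2);
}
```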

View File

@@ -9,8 +9,8 @@ use crate::query::Weight;
 use crate::query::{EmptyScorer, Explanation};
 use crate::schema::IndexRecordOption;
 use crate::schema::Term;
-use crate::Score;
 use crate::{DocId, DocSet};
+use crate::{Result, SkipResult};

 pub struct PhraseWeight {
     phrase_terms: Vec<(usize, Term)>,
@@ -32,18 +32,17 @@ impl PhraseWeight {
     }
 }

-    fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
+    fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader {
         let field = self.phrase_terms[0].1.field();
         reader.get_fieldnorms_reader(field)
     }

-    fn phrase_scorer(
+    pub fn phrase_scorer(
         &self,
         reader: &SegmentReader,
-        boost: Score,
-    ) -> crate::Result<Option<PhraseScorer<SegmentPostings>>> {
-        let similarity_weight = self.similarity_weight.boost_by(boost);
-        let fieldnorm_reader = self.fieldnorm_reader(reader)?;
+    ) -> Result<Option<PhraseScorer<SegmentPostings>>> {
+        let similarity_weight = self.similarity_weight.clone();
+        let fieldnorm_reader = self.fieldnorm_reader(reader);
         if reader.has_deletes() {
             let mut term_postings_list = Vec::new();
             for &(offset, ref term) in &self.phrase_terms {
@@ -85,24 +84,24 @@ impl PhraseWeight {
 }

 impl Weight for PhraseWeight {
-    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
-        if let Some(scorer) = self.phrase_scorer(reader, boost)? {
+    fn scorer(&self, reader: &SegmentReader) -> Result<Box<dyn Scorer>> {
+        if let Some(scorer) = self.phrase_scorer(reader)? {
             Ok(Box::new(scorer))
         } else {
             Ok(Box::new(EmptyScorer))
         }
     }

-    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
-        let scorer_opt = self.phrase_scorer(reader, 1.0)?;
+    fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
+        let scorer_opt = self.phrase_scorer(reader)?;
         if scorer_opt.is_none() {
             return Err(does_not_match(doc));
         }
         let mut scorer = scorer_opt.unwrap();
-        if scorer.seek(doc) != doc {
+        if scorer.skip_next(doc) != SkipResult::Reached {
             return Err(does_not_match(doc));
         }
-        let fieldnorm_reader = self.fieldnorm_reader(reader)?;
+        let fieldnorm_reader = self.fieldnorm_reader(reader);
         let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
         let phrase_count = scorer.phrase_count();
         let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
@@ -110,34 +109,3 @@ impl Weight for PhraseWeight {
         Ok(explanation)
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::super::tests::create_index;
-    use crate::docset::TERMINATED;
-    use crate::query::PhraseQuery;
-    use crate::{DocSet, Term};
-
-    #[test]
-    pub fn test_phrase_count() {
-        let index = create_index(&["a c", "a a b d a b c", " a b"]);
-        let schema = index.schema();
-        let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader().unwrap().searcher();
-        let phrase_query = PhraseQuery::new(vec![
-            Term::from_field_text(text_field, "a"),
-            Term::from_field_text(text_field, "b"),
-        ]);
-        let phrase_weight = phrase_query.phrase_weight(&searcher, true).unwrap();
-        let mut phrase_scorer = phrase_weight
-            .phrase_scorer(searcher.segment_reader(0u32), 1.0)
-            .unwrap()
-            .unwrap();
-        assert_eq!(phrase_scorer.doc(), 1);
-        assert_eq!(phrase_scorer.phrase_count(), 2);
-        assert_eq!(phrase_scorer.advance(), 2);
-        assert_eq!(phrase_scorer.doc(), 2);
-        assert_eq!(phrase_scorer.phrase_count(), 1);
-        assert_eq!(phrase_scorer.advance(), TERMINATED);
-    }
-}

View File

@@ -40,7 +40,7 @@ use std::fmt;
 ///
 /// When implementing a new type of `Query`, it is normal to implement a
 /// dedicated `Query`, `Weight` and `Scorer`.
-pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
+pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
     /// Create the weight associated to a query.
     ///
     /// If scoring is not required, setting `scoring_enabled` to `false`

View File

@@ -2,7 +2,6 @@ use crate::query::Occur;
 use crate::schema::Field;
 use crate::schema::Term;
 use crate::schema::Type;
-use crate::Score;
 use std::fmt;
 use std::ops::Bound;
@@ -22,17 +21,6 @@
 pub enum LogicalAST {
     Clause(Vec<(Occur, LogicalAST)>),
     Leaf(Box<LogicalLiteral>),
-    Boost(Box<LogicalAST>, Score),
-}
-
-impl LogicalAST {
-    pub fn boost(self, boost: Score) -> LogicalAST {
-        if (boost - 1.0).abs() < Score::EPSILON {
-            self
-        } else {
-            LogicalAST::Boost(Box::new(self), boost)
-        }
-    }
 }

 fn occur_letter(occur: Occur) -> &'static str {
@@ -59,7 +47,6 @@ impl fmt::Debug for LogicalAST {
             }
             Ok(())
         }
-        LogicalAST::Boost(ref ast, boost) => write!(formatter, "{:?}^{}", ast, boost),
         LogicalAST::Leaf(ref literal) => write!(formatter, "{:?}", literal),
     }
 }

Some files were not shown because too many files have changed in this diff.