Compare commits


28 Commits

Author SHA1 Message Date
Paul Masurel
1c81b8171f Switch to slog
Closes #111
2020-09-30 19:55:54 +09:00
Paul Masurel
838c476733 Hirevo move to thiserror (#889)
* Migrated from `failure` to `thiserror`

* Refactoring

Co-authored-by: Nicolas Polomack <nicolas@polomack.eu>
2020-09-30 16:34:10 +09:00
Paul Masurel
5f574348d1 Syntactic change. 2020-09-26 21:33:00 +09:00
Paul Masurel
19a02b2c30 Merge tag '0.13.1'
0.13.1 was published as a hotfix to accommodate tantivy-py.
2020-09-19 21:20:27 +09:00
Paul Masurel
c339b05789 Bumped version and edited changelog 2020-09-19 21:13:19 +09:00
Paul Masurel
2d3c657f9d Added Send Sync to collectors. 2020-09-19 21:04:44 +09:00
Paul Masurel
07f9b828ae Added Send and Sync to the Query trait. 2020-09-19 21:04:29 +09:00
Paul Masurel
70bae7ce4c Removing Term Vec allocation (#881) 2020-09-08 23:11:00 +09:00
Paul Masurel
ac2a7273e6 Re-added comment to Score. 2020-09-08 21:41:34 +09:00
Paul Masurel
4ce9517a82 fix unit test for bench. remove scoref64 feature. fixed test for lz4 feature. 2020-09-08 07:35:00 +09:00
Paul Masurel
73024a8af3 Fixing compilation of bench and doctests. 2020-09-08 07:18:43 +09:00
Paul Masurel
e70e605fc3 fix unit test (at least on linux) 2020-09-07 23:35:04 +09:00
Paul Masurel
439d6956a9 Returning Result in some of the API (#880)
* Returning Result in some of the API

* Introducing `.writer_for_test(..)`
2020-09-07 15:52:34 +09:00
Paul Masurel
6530bf0eae Make field types less strict when populating documents. 2020-09-06 10:24:03 +09:00
Paul Masurel
151498cbe7 Creating the tempfile for atomicwrites in the same directory as the MmapDirectory. (#878) 2020-09-05 23:06:29 +09:00
Paul Masurel
3a72b1cb98 Accept dash within field names. (#874)
Accept dash in field names and enforce field names constraint at the
creation of the schema.

Closes #796
2020-09-01 13:38:52 +09:00
Paul Masurel
2737822620 Fixing unit tests. (#868)
There was a unit test failing when notify was sending more
than one event on atomicwrites.

It was observed on MacOS CI.
2020-08-27 16:43:39 +09:00
b8591340
06c12ae221 Filter meta.json from validate_checksum (#872) 2020-08-27 07:54:37 +09:00
Paul Masurel
4e4400af7f Added cargo timing report to .gitignore 2020-08-23 16:15:28 +09:00
Paul Masurel
3f1ecf53ab Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-22 21:30:47 +09:00
Paul Masurel
0b583b8130 Plastic changes 2020-08-22 21:29:12 +09:00
Paul Masurel
31d18dca1c Removing dependency to atomicwrites (#866) 2020-08-21 21:37:05 +09:00
stephenlagree
5e06e7de5a Update basic_search.rs (#865)
Remove duplicated document entry.
2020-08-21 11:23:09 +09:00
Paul Masurel
8af53cbd36 Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-21 08:57:42 +09:00
Paul Masurel
4914076e8f Fixing release build 2020-08-21 08:57:27 +09:00
Paul Masurel
e04f47e922 Using block wand for term queries too. 2020-08-20 15:51:21 +09:00
Paul Masurel
f355695581 Code clean up 2020-08-20 15:42:50 +09:00
Paul Masurel
cbacdf0de8 Edited README. 2020-08-20 14:28:24 +09:00
77 changed files with 1144 additions and 1134 deletions

View File

@@ -1,28 +0,0 @@
-name: Tantivy CI
-on: [push]
-jobs:
-  test:
-    name: Test Suite
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: actions-rs/toolchain@v1
-        with:
-          profile: minimal
-          toolchain: stable
-          override: true
-      - uses: actions-rs/cargo@v1
-        with:
-          command: test
-      - uses: actions-rs/cargo@v1
-        with:
-          command: fmt
-          args: --all -- --check
-      - run: rustup component add clippy
-      - uses: actions-rs/cargo@v1
-        with:
-          command: clippy
-          args: -- -D warnings

View File

@@ -1,66 +0,0 @@
-on: [push]
-name: Code coverage with grcov
-jobs:
-  grcov:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os:
-          - ubuntu-latest
-          #- macOS-latest
-          #- windows-latest
-    steps:
-      - uses: actions/checkout@v2
-      - name: Install toolchain
-        uses: actions-rs/toolchain@v1
-        with:
-          toolchain: nightly
-          override: true
-          profile: minimal
-      - name: Execute tests
-        uses: actions-rs/cargo@v1
-        with:
-          command: test
-          args: --all --lib
-        env:
-          CARGO_INCREMENTAL: 0
-          RUSTFLAGS: "-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests"
-      # Note that `actions-rs/grcov` Action can install `grcov` too,
-      # but can't use faster installation methods yet.
-      # As a temporary experiment `actions-rs/install` Action plugged in here.
-      # Consider **NOT** to copy that into your workflow,
-      # but use `actions-rs/grcov` only
-      - name: Pre-installing grcov
-        uses: actions-rs/install@v0.1
-        with:
-          crate: grcov
-          use-tool-cache: true
-      - name: Gather coverage data
-        id: coverage
-        uses: actions-rs/grcov@v0.1
-        with:
-          coveralls-token: ${{ secrets.COVERALLS_TOKEN }}
-      - name: Coveralls upload
-        uses: coverallsapp/github-action@master
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          parallel: true
-          path-to-lcov: ${{ steps.coverage.outputs.report }}
-  grcov_finalize:
-    runs-on: ubuntu-latest
-    needs: grcov
-    steps:
-      - name: Coveralls finalization
-        uses: coverallsapp/github-action@master
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          parallel-finished: true

.gitignore
View File

@@ -12,3 +12,4 @@ cpp/simdcomp/bitpackingbenchmark
 *.bk
 .idea
 trace.dat
+cargo-timing*
cargo-timing*

View File

@@ -1,3 +1,14 @@
+Tantivy 0.14.0
+=========================
+- Removed dependency on atomicwrites #833. Implemented by @pmasurel upon suggestion and research from @asafigan.
+- Migrated tantivy's error type from the now-deprecated `failure` crate to `thiserror` #760. (@hirevo)
+- Switched to structured logging (via the `slog` crate). (@pmasurel)
+
+Tantivy 0.13.1
+===================
+- Made `Query` and `Collector` `Send + Sync`.
+- Updated misc dependency versions.
+
 Tantivy 0.13.0
 ======================
 Tantivy 0.13 introduce a change in the index format that will require
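
For context, the "structured logging" entry above refers to slog's key/value style, which the hunks further down adopt. A minimal sketch of that style (the logger wiring here is illustrative; only the macro syntax is taken from this change set):

    use slog::{info, o, Discard, Logger};

    fn main() {
        // `Discard` drops all records; a real drain would be slog-term or
        // slog-stdlog (the fallback dependency added in Cargo.toml below).
        let logger = Logger::root(Discard, o!("component" => "example"));
        // Key/value pairs after the `;` are structured fields, not format args.
        info!(logger, "index-open"; "directory" => "/tmp/index");
    }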

View File

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.13.0"
+version = "0.14.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -13,21 +13,21 @@ keywords = ["search", "information", "retrieval"]
 edition = "2018"

 [dependencies]
-base64 = "0.12.0"
-byteorder = "1.0"
-crc32fast = "1.2.0"
-once_cell = "1.0"
-regex ={version = "1.3.0", default-features = false, features = ["std"]}
+base64 = "0.12"
+byteorder = "1"
+crc32fast = "1"
+once_cell = "1"
+regex ={version = "1", default-features = false, features = ["std"]}
 tantivy-fst = "0.3"
 memmap = {version = "0.7", optional=true}
-lz4 = {version="1.20", optional=true}
+lz4 = {version="1", optional=true}
 snap = "1"
-atomicwrites = {version="0.2.2", optional=true}
-tempfile = "3.0"
-log = "0.4"
+tempfile = {version="3", optional=true}
+slog = "2.5"
+slog-stdlog = "4"
-serde = {version="1.0", features=["derive"]}
-serde_json = "1.0"
-num_cpus = "1.2"
+serde = {version="1", features=["derive"]}
+serde_json = "1"
+num_cpus = "1"
 fs2={version="0.4", optional=true}
 levenshtein_automata = "0.2"
 notify = {version="4", optional=true}
@@ -35,20 +35,20 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
 crossbeam = "0.7"
 futures = {version = "0.3", features=["thread-pool"] }
 owning_ref = "0.4"
-stable_deref_trait = "1.0.0"
-rust-stemmers = "1.2"
-downcast-rs = { version="1.0" }
-tantivy-query-grammar = { version="0.13", path="./query-grammar" }
+tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
+stable_deref_trait = "1"
+rust-stemmers = "1"
+downcast-rs = "1"
 bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
 census = "0.4"
-fnv = "1.0.6"
+fnv = "1"
 owned-read = "0.4"
-failure = "0.1"
-htmlescape = "0.3.1"
+thiserror = "1.0"
+htmlescape = "0.3"
 fail = "0.4"
 murmurhash32 = "0.2"
 chrono = "0.4"
-smallvec = "1.0"
+smallvec = "1"
 rayon = "1"

 [target.'cfg(windows)'.dependencies]
@@ -75,12 +75,11 @@ overflow-checks = true

 [features]
 default = ["mmap"]
-mmap = ["atomicwrites", "fs2", "memmap", "notify"]
+mmap = ["fs2", "tempfile", "memmap", "notify"]
 lz4-compression = ["lz4"]
 failpoints = ["fail/failpoints"]
 unstable = [] # useful for benches.
 wasm-bindgen = ["uuid/wasm-bindgen"]
-scoref64 = [] # scores are f64 instead of f32. was introduced to debug blockwand.

 [workspace]
 members = ["query-grammar"]

View File

@@ -34,11 +34,6 @@ Tantivy is, in fact, strongly inspired by Lucene's design.

 The following [benchmark](https://tantivy-search.github.io/bench/) break downs
 performance for different type of queries / collection.
-In general, Tantivy tends to be
-- slower than Lucene on union with a Top-K due to Block-WAND optimization.
-- faster than Lucene on intersection and phrase queries.

 Your mileage WILL vary depending on the nature of queries and their load.

 # Features

View File

@@ -112,18 +112,6 @@ fn main() -> tantivy::Result<()> {
         limbs and branches that arch over the pool"
     ));

-    index_writer.add_document(doc!(
-    title => "Of Mice and Men",
-    body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
-            bank and runs deep and green. The water is warm too, for it has slipped twinkling \
-            over the yellow sands in the sunlight before reaching the narrow pool. On one \
-            side of the river the golden foothill slopes curve up to the strong and rocky \
-            Gabilan Mountains, but on the valley side the water is lined with trees—willows \
-            fresh and green with every spring, carrying in their lower leaf junctures the \
-            debris of the winters flooding; and sycamores with mottled, white, recumbent \
-            limbs and branches that arch over the pool"
-    ));
-
     // Multivalued field just need to be repeated.
     index_writer.add_document(doc!(
     title => "Frankenstein",

View File

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-query-grammar"
-version = "0.13.0"
+version = "0.14.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]

View File

@@ -52,7 +52,7 @@ mod test {
     use crate::Occur;

     #[test]
-    fn test_Occur_compose() {
+    fn test_occur_compose() {
         assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should);
         assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
         assert_eq!(

View File

@@ -9,8 +9,10 @@ use combine::{

 fn field<'a>() -> impl Parser<&'a str, Output = String> {
     (
-        letter(),
-        many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
+        (letter().or(char('_'))),
+        many(satisfy(|c: char| {
+            c.is_alphanumeric() || c == '_' || c == '-'
+        })),
     )
         .skip(char(':'))
         .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
@@ -279,6 +281,8 @@ pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
 #[cfg(test)]
 mod test {

+    type TestParseResult = Result<(), StringStreamError>;
+
     use super::*;
     use combine::parser::Parser;
@@ -296,9 +300,10 @@ mod test {
     }

     #[test]
-    fn test_occur_symbol() {
-        assert_eq!(super::occur_symbol().parse("-"), Ok((Occur::MustNot, "")));
-        assert_eq!(super::occur_symbol().parse("+"), Ok((Occur::Must, "")));
+    fn test_occur_symbol() -> TestParseResult {
+        assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, ""));
+        assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, ""));
+        Ok(())
     }

     #[test]
@@ -410,6 +415,25 @@ mod test {
         assert_eq!(format!("{:?}", ast), "\"abc\"");
     }

+    #[test]
+    fn test_field_name() -> TestParseResult {
+        assert_eq!(
+            super::field().parse("my-field-name:a")?,
+            ("my-field-name".to_string(), "a")
+        );
+        assert_eq!(
+            super::field().parse("my_field_name:a")?,
+            ("my_field_name".to_string(), "a")
+        );
+        assert!(super::field().parse(":a").is_err());
+        assert!(super::field().parse("-my_field:a").is_err());
+        assert_eq!(
+            super::field().parse("_my_field:a")?,
+            ("_my_field".to_string(), "a")
+        );
+        Ok(())
+    }
+
     #[test]
     fn test_range_parser() {
         // testing the range() parser separately
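
For readers skimming the hunks above: the relaxed grammar now accepts a leading letter or underscore followed by any mix of alphanumerics, underscores, and dashes, terminated by `:`. A self-contained sketch of the same parser (assuming a combine version compatible with the one this crate uses):

    use combine::parser::char::{char, letter};
    use combine::{many, satisfy, Parser};

    // Same shape as the `field()` parser in the diff: one head character,
    // then tail characters, terminated by ':'.
    fn field<'a>() -> impl Parser<&'a str, Output = String> {
        (
            letter().or(char('_')),
            many(satisfy(|c: char| c.is_alphanumeric() || c == '_' || c == '-')),
        )
            .skip(char(':'))
            .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
    }

    fn main() {
        let (name, rest) = field().parse("my-field-name:a").unwrap();
        assert_eq!(name, "my-field-name");
        assert_eq!(rest, "a");
        assert!(field().parse("-my_field:a").is_err()); // leading dash rejected
    }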

View File

@@ -46,7 +46,7 @@ pub trait CustomScorer<TScore>: Sync {

 impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
 where
-    TCustomScorer: CustomScorer<TScore>,
+    TCustomScorer: CustomScorer<TScore> + Send + Sync,
     TScore: 'static + PartialOrd + Clone + Send + Sync,
 {
     type Fruit = Vec<(TScore, DocAddress)>;

View File

@@ -472,7 +472,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         let num_facets: usize = 3 * 4 * 5;
         let facets: Vec<Facet> = (0..num_facets)
             .map(|mut n| {
@@ -531,7 +531,7 @@ mod tests {
         let facet_field = schema_builder.add_facet_field("facets");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/subjects/A/a"),
             facet_field => Facet::from_text(&"/subjects/B/a"),
@@ -550,12 +550,12 @@ mod tests {
     }

     #[test]
-    fn test_doc_search_by_facet() {
+    fn test_doc_search_by_facet() -> crate::Result<()> {
         let mut schema_builder = Schema::builder();
         let facet_field = schema_builder.add_facet_field("facet");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests()?;
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/A/A"),
         ));
@@ -568,8 +568,8 @@ mod tests {
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/D/C/A"),
         ));
-        index_writer.commit().unwrap();
-        let reader = index.reader().unwrap();
+        index_writer.commit()?;
+        let reader = index.reader()?;
         let searcher = reader.searcher();
         assert_eq!(searcher.num_docs(), 4);
@@ -586,17 +586,17 @@ mod tests {
         assert_eq!(count_facet("/A/C"), 1);
         assert_eq!(count_facet("/A/C/A"), 1);
         assert_eq!(count_facet("/C/A"), 0);
+        let query_parser = QueryParser::for_index(&index, vec![]);
         {
-            let query_parser = QueryParser::for_index(&index, vec![]);
-            {
-                let query = query_parser.parse_query("facet:/A/B").unwrap();
-                assert_eq!(1, searcher.search(&query, &Count).unwrap());
-            }
-            {
-                let query = query_parser.parse_query("facet:/A").unwrap();
-                assert_eq!(3, searcher.search(&query, &Count).unwrap());
-            }
+            let query = query_parser.parse_query("facet:/A/B")?;
+            assert_eq!(1, searcher.search(&query, &Count).unwrap());
         }
+        {
+            let query = query_parser.parse_query("facet:/A")?;
+            assert_eq!(3, searcher.search(&query, &Count)?);
+        }
+        Ok(())
     }

     #[test]
@@ -631,7 +631,7 @@ mod tests {
             .collect();
         docs[..].shuffle(&mut thread_rng());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         for doc in docs {
             index_writer.add_document(doc);
         }
@@ -684,7 +684,7 @@ mod bench {
         // 40425 docs
         docs[..].shuffle(&mut thread_rng());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         for doc in docs {
             index_writer.add_document(doc);
         }

View File

@@ -89,7 +89,7 @@ mod tests {
         let index = Index::create_in_ram(schema.clone());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             {
                 for i in 0u64..10u64 {
                     index_writer.add_document(doc!(

View File

@@ -133,7 +133,7 @@ impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
 /// The collection logic itself is in the `SegmentCollector`.
 ///
 /// Segments are not guaranteed to be visited in any specific order.
-pub trait Collector: Sync {
+pub trait Collector: Sync + Send {
     /// `Fruit` is the type for the result of our collection.
     /// e.g. `usize` for the `Count` collector.
     type Fruit: Fruit;
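
Adding `Send` to the `Collector` supertraits (together with the matching `Send + Sync` bounds elsewhere in this change set) is what lets collectors cross thread boundaries in the multithreaded executor. A stand-alone illustration of why the bound is required (toy trait, not tantivy's):

    use std::sync::Arc;
    use std::thread;

    // A toy stand-in for a collector shared across segment worker threads.
    trait Collector: Send + Sync {
        fn collect(&self, doc: u32);
    }

    struct Count;
    impl Collector for Count {
        fn collect(&self, _doc: u32) {}
    }

    fn main() {
        let collector: Arc<dyn Collector> = Arc::new(Count);
        let handles: Vec<_> = (0..4)
            .map(|segment_ord| {
                let c = Arc::clone(&collector);
                // Without `Send + Sync` on the trait object, this spawn
                // would not compile.
                thread::spawn(move || c.collect(segment_ord))
            })
            .collect();
        for handle in handles {
            handle.join().unwrap();
        }
    }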

View File

@@ -259,7 +259,7 @@ mod tests {
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(text=>"abc"));
             index_writer.add_document(doc!(text=>"abc abc abc"));
             index_writer.add_document(doc!(text=>"abc abc"));

View File

@@ -38,7 +38,7 @@ use std::fmt;
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);
 ///
-/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
 /// index_writer.add_document(doc!(title => "The Name of the Wind"));
 /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
 /// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -123,7 +123,7 @@ impl TopDocs {
     /// let schema = schema_builder.build();
     /// let index = Index::create_in_ram(schema);
     ///
-    /// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
     /// index_writer.add_document(doc!(title => "The Name of the Wind"));
     /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
    /// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -163,7 +163,7 @@ impl TopDocs {
     /// # let schema = schema_builder.build();
     /// #
     /// # let index = Index::create_in_ram(schema);
-    /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+    /// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
     /// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
     /// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
     /// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
@@ -264,7 +264,7 @@ impl TopDocs {
     /// fn create_index() -> tantivy::Result<Index> {
     ///   let schema = create_schema();
     ///   let index = Index::create_in_ram(schema);
-    ///   let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+    ///   let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
     ///   let product_name = index.schema().get_field("product_name").unwrap();
     ///   let popularity: Field = index.schema().get_field("popularity").unwrap();
     ///   index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
@@ -324,7 +324,7 @@ impl TopDocs {
     where
         TScore: 'static + Send + Sync + Clone + PartialOrd,
         TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
-        TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
+        TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker> + Send + Sync,
     {
         TweakedScoreTopCollector::new(score_tweaker, self.0.into_tscore())
     }
@@ -371,7 +371,7 @@ impl TopDocs {
     /// # fn main() -> tantivy::Result<()> {
     /// #   let schema = create_schema();
     /// #   let index = Index::create_in_ram(schema);
-    /// #   let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+    /// #   let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
     /// #   let product_name = index.schema().get_field("product_name").unwrap();
     /// #
     /// let popularity: Field = index.schema().get_field("popularity").unwrap();
@@ -438,7 +438,7 @@ impl TopDocs {
     where
         TScore: 'static + Send + Sync + Clone + PartialOrd,
         TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
-        TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
+        TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer> + Send + Sync,
     {
         CustomScoreTopCollector::new(custom_score, self.0.into_tscore())
     }
@@ -561,7 +561,7 @@ mod tests {
         let index = Index::create_in_ram(schema);
         {
             // writing the segment
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
             index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
             index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
             index_writer.add_document(doc!(text_field=>"I like Droopy"));
@@ -821,7 +821,7 @@ mod tests {
     ) -> (Index, Box<dyn Query>) {
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
         doc_adder(&mut index_writer);
         index_writer.commit().unwrap();
         let query_parser = QueryParser::for_index(&index, vec![query_field]);

View File

@@ -49,7 +49,7 @@ pub trait ScoreTweaker<TScore>: Sync {

 impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore>
 where
-    TScoreTweaker: ScoreTweaker<TScore>,
+    TScoreTweaker: ScoreTweaker<TScore> + Send + Sync,
     TScore: 'static + PartialOrd + Clone + Send + Sync,
 {
     type Fruit = Vec<(TScore, DocAddress)>;

View File

@@ -1,5 +1,6 @@
 use crossbeam::channel;
 use rayon::{ThreadPool, ThreadPoolBuilder};
+use slog::{error, Logger};

 /// Search executor whether search request are single thread or multithread.
 ///
@@ -43,6 +44,7 @@ impl Executor {
         &self,
         f: F,
         args: AIterator,
+        logger: Logger,
     ) -> crate::Result<Vec<R>> {
         match self {
             Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
@@ -57,7 +59,7 @@ impl Executor {
                         let (idx, arg) = arg_with_idx;
                         let fruit = f(arg);
                         if let Err(err) = fruit_sender.send((idx, fruit)) {
-                            error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
+                            error!(logger, "Failed to send search task. It probably means all search threads have panicked. {:?}", err);
                         }
                     });
                 }
@@ -87,17 +89,21 @@ impl Executor {
 #[cfg(test)]
 mod tests {

+    use slog::{o, Discard, Logger};
+
     use super::Executor;

     #[test]
     #[should_panic(expected = "panic should propagate")]
     fn test_panic_propagates_single_thread() {
+        let logger = Logger::root(Discard, o!());
         let _result: Vec<usize> = Executor::single_thread()
             .map(
                 |_| {
                     panic!("panic should propagate");
                 },
                 vec![0].into_iter(),
+                logger,
             )
             .unwrap();
     }
@@ -105,6 +111,7 @@ mod tests {
     #[test]
     #[should_panic] //< unfortunately the panic message is not propagated
     fn test_panic_propagates_multi_thread() {
+        let logger = Logger::root(Discard, o!());
         let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
             .unwrap()
             .map(
@@ -112,14 +119,16 @@ mod tests {
                     panic!("panic should propagate");
                 },
                 vec![0].into_iter(),
+                logger,
             )
             .unwrap();
     }

     #[test]
     fn test_map_singlethread() {
+        let logger = Logger::root(Discard, o!());
         let result: Vec<usize> = Executor::single_thread()
-            .map(|i| Ok(i * 2), 0..1_000)
+            .map(|i| Ok(i * 2), 0..1_000, logger)
             .unwrap();
         assert_eq!(result.len(), 1_000);
         for i in 0..1_000 {
@@ -129,9 +138,10 @@ mod tests {

     #[test]
     fn test_map_multithread() {
+        let logger = Logger::root(Discard, o!());
         let result: Vec<usize> = Executor::multi_thread(3, "search-test")
             .unwrap()
-            .map(|i| Ok(i * 2), 0..10)
+            .map(|i| Ok(i * 2), 0..10, logger)
             .unwrap();
         assert_eq!(result.len(), 10);
         for i in 0..10 {
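
A self-contained analog of the executor change above: the failure path reports through an explicitly passed slog `Logger` rather than the global `log` macros. The channel plumbing is simplified and hypothetical, not tantivy's actual executor:

    use slog::{error, o, Discard, Logger};
    use std::sync::mpsc;
    use std::thread;

    fn map_with_logger(inputs: Vec<u32>, logger: Logger) -> Vec<u32> {
        let (tx, rx) = mpsc::channel();
        let handles: Vec<_> = inputs
            .into_iter()
            .map(|i| {
                let tx = tx.clone();
                let logger = logger.clone();
                thread::spawn(move || {
                    if let Err(err) = tx.send(i * 2) {
                        // Mirrors the diff: the logger travels with the task.
                        error!(logger, "Failed to send result: {:?}", err);
                    }
                })
            })
            .collect();
        drop(tx);
        for handle in handles {
            handle.join().unwrap();
        }
        let mut out: Vec<u32> = rx.into_iter().collect();
        out.sort();
        out
    }

    fn main() {
        let logger = Logger::root(Discard, o!());
        assert_eq!(map_with_logger(vec![1, 2, 3], logger), vec![2, 4, 6]);
    }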

View File

@@ -21,6 +21,7 @@ use crate::schema::FieldType;
 use crate::schema::Schema;
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};
 use crate::IndexWriter;
+use slog::Logger;
 use std::borrow::BorrowMut;
 use std::collections::HashSet;
 use std::fmt;
@@ -57,7 +58,14 @@ pub struct Index {
 }

 impl Index {
-    /// Examines the director to see if it contains an index
+    pub(crate) fn logger(&self) -> &Logger {
+        self.directory.logger()
+    }
+
+    /// Examines the directory to see if it contains an index.
+    ///
+    /// Effectively, it only checks for the presence of the `meta.json` file.
     pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
         dir.exists(&META_FILEPATH)
     }
@@ -140,16 +148,18 @@ impl Index {
         Index::create(mmap_directory, schema)
     }

-    /// Creates a new index given an implementation of the trait `Directory`
+    /// Creates a new index given an implementation of the trait `Directory`.
+    ///
+    /// If a directory previously existed, it will be erased.
     pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
         let directory = ManagedDirectory::wrap(dir)?;
-        Index::from_directory(directory, schema)
+        Index::new_from_directory(directory, schema)
     }

     /// Create a new index from a directory.
     ///
     /// This will overwrite existing meta.json
-    fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> crate::Result<Index> {
+    fn new_from_directory(mut directory: ManagedDirectory, schema: Schema) -> crate::Result<Index> {
         save_new_metas(schema.clone(), directory.borrow_mut())?;
         let metas = IndexMeta::with_schema(schema);
         Index::create_from_metas(directory, &metas, SegmentMetaInventory::default())
@@ -240,6 +250,8 @@ impl Index {
     /// Open the index using the provided directory
     pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
+        let logger: &Logger = directory.logger();
+        slog::info!(logger, "index-open"; "directory" => format!("{:?}", directory));
         let directory = ManagedDirectory::wrap(directory)?;
         let inventory = SegmentMetaInventory::default();
         let metas = load_metas(&directory, &inventory)?;
@@ -300,6 +312,15 @@ impl Index {
         )
     }

+    /// Helper to create an index writer for tests.
+    ///
+    /// This index writer has a single thread and a heap of 10 MB.
+    /// Using a single thread gives us a deterministic allocation of DocId.
+    #[cfg(test)]
+    pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
+        self.writer_with_num_threads(1, 10_000_000)
+    }
+
     /// Creates a multithreaded writer
     ///
     /// Tantivy will automatically define the number of threads to use.
@@ -502,7 +523,7 @@ mod tests {
         let schema = throw_away_schema();
         let field = schema.get_field("num_likes").unwrap();
         let mut index = Index::create_from_tempdir(schema).unwrap();
-        let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut writer = index.writer_for_tests().unwrap();
         writer.commit().unwrap();
         let reader = index
             .reader_builder()
@@ -539,23 +560,33 @@ mod tests {
             test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
         }
     }

     fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
         let mut reader_index = reader.index();
         let (sender, receiver) = crossbeam::channel::unbounded();
         let _watch_handle = reader_index.directory_mut().watch(Box::new(move || {
             let _ = sender.send(());
         }));
-        let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut writer = index.writer_for_tests().unwrap();
         assert_eq!(reader.searcher().num_docs(), 0);
         writer.add_document(doc!(field=>1u64));
         writer.commit().unwrap();
-        assert!(receiver.recv().is_ok());
-        assert_eq!(reader.searcher().num_docs(), 1);
+        // We need a loop here because it is possible for notify to send more
+        // than one modify event. It was observed on CI on MacOS.
+        loop {
+            assert!(receiver.recv().is_ok());
+            if reader.searcher().num_docs() == 1 {
+                break;
+            }
+        }
         writer.add_document(doc!(field=>2u64));
         writer.commit().unwrap();
-        assert!(receiver.recv().is_ok());
-        assert_eq!(reader.searcher().num_docs(), 2);
+        // ... Same as above
+        loop {
+            assert!(receiver.recv().is_ok());
+            if reader.searcher().num_docs() == 2 {
+                break;
+            }
+        }
     }

     // This test will not pass on windows, because windows
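
The new `writer_for_tests()` helper is `#[cfg(test)]`-only; outside the crate, the equivalent is the explicit call it wraps. A sketch using the public API shown in this diff (the schema contents are illustrative):

    use tantivy::doc;
    use tantivy::schema::{Schema, TEXT};
    use tantivy::Index;

    fn main() -> tantivy::Result<()> {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        // Same parameters as writer_for_tests(): one thread for deterministic
        // DocId allocation, and a 10_000_000-byte heap.
        let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
        index_writer.add_document(doc!(title => "The Name of the Wind"));
        index_writer.commit()?;
        Ok(())
    }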

View File

@@ -3,7 +3,6 @@ use crate::directory::ReadOnlySource;
 use crate::positions::PositionReader;
 use crate::postings::TermInfo;
 use crate::postings::{BlockSegmentPostings, SegmentPostings};
-use crate::schema::FieldType;
 use crate::schema::IndexRecordOption;
 use crate::schema::Term;
 use crate::termdict::TermDictionary;
@@ -54,10 +53,7 @@ impl InvertedIndexReader {
     /// Creates an empty `InvertedIndexReader` object, which
     /// contains no terms at all.
-    pub fn empty(field_type: &FieldType) -> InvertedIndexReader {
-        let record_option = field_type
-            .get_index_record_option()
-            .unwrap_or(IndexRecordOption::Basic);
+    pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
         InvertedIndexReader {
             termdict: TermDictionary::empty(),
             postings_source: ReadOnlySource::empty(),

View File

@@ -143,6 +143,7 @@ impl Searcher {
                 collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader)
             },
             segment_readers.iter().enumerate(),
+            self.index.logger().clone(),
         )?;
         collector.merge_fruits(fruits)
     }

View File

@@ -21,6 +21,12 @@ use std::sync::atomic;
 #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct SegmentId(Uuid);

+impl ToString for SegmentId {
+    fn to_string(&self) -> String {
+        self.short_uuid_string()
+    }
+}
+
 #[cfg(test)]
 static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default());
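
The impl above implements `ToString` directly, delegating to an existing short-form helper. An illustrative stand-in of the pattern (the field and formatting are invented for this sketch):

    struct SegmentId(u32);

    impl SegmentId {
        fn short_uuid_string(&self) -> String {
            format!("{:08x}", self.0)
        }
    }

    // Direct ToString impl, as in the diff; the more common route is to
    // implement Display and get ToString for free.
    impl ToString for SegmentId {
        fn to_string(&self) -> String {
            self.short_uuid_string()
        }
    }

    fn main() {
        assert_eq!(SegmentId(0xdeadbeef).to_string(), "deadbeef");
    }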

View File

@@ -9,14 +9,15 @@ use crate::fastfield::DeleteBitSet;
 use crate::fastfield::FacetReader;
 use crate::fastfield::FastFieldReaders;
 use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
-use crate::schema::Field;
 use crate::schema::FieldType;
 use crate::schema::Schema;
+use crate::schema::{Field, IndexRecordOption};
 use crate::space_usage::SegmentSpaceUsage;
 use crate::store::StoreReader;
 use crate::termdict::TermDictionary;
 use crate::DocId;
 use fail::fail_point;
+use slog::{warn, Logger};
 use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
@@ -53,6 +54,7 @@ pub struct SegmentReader {
     store_source: ReadOnlySource,
     delete_bitset_opt: Option<DeleteBitSet>,
     schema: Schema,
+    logger: Logger,
 }

 impl SegmentReader {
@@ -125,17 +127,15 @@ impl SegmentReader {
     ///
     /// They are simply stored as a fast field, serialized in
     /// the `.fieldnorm` file of the segment.
-    pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
-        if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
-            fieldnorm_reader
-        } else {
+    pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> {
+        self.fieldnorm_readers.get_field(field).ok_or_else(|| {
             let field_name = self.schema.get_field_name(field);
             let err_msg = format!(
                 "Field norm not found for field {:?}. Was it market as indexed during indexing.",
                 field_name
             );
-            panic!(err_msg);
-        }
+            crate::TantivyError::SchemaError(err_msg)
+        })
     }

     /// Accessor to the segment's `StoreReader`.
@@ -202,6 +202,7 @@ impl SegmentReader {
             positions_composite,
             positions_idx_composite,
             schema,
+            logger: segment.index().logger().clone(),
         })
     }

@@ -212,6 +213,11 @@ impl SegmentReader {
     /// The field reader is in charge of iterating through the
     /// term dictionary associated to a specific field,
     /// and opening the posting list associated to any term.
+    ///
+    /// If the field is not marked as indexed, a warning is logged and an empty
+    /// `InvertedIndexReader` is returned.
+    /// Similarly, if the field is marked as indexed but no term has been indexed
+    /// for the given field, an empty `InvertedIndexReader` is returned (but no warning is logged).
     pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
         if let Some(inv_idx_reader) = self
             .inv_idx_reader_cache
@@ -226,21 +232,25 @@ impl SegmentReader {
         let record_option_opt = field_type.get_index_record_option();

         if record_option_opt.is_none() {
-            panic!("Field {:?} does not seem indexed.", field_entry.name());
+            warn!(
+                self.logger,
+                "Field {:?} does not seem indexed.",
+                field_entry.name()
+            );
         }

-        let record_option = record_option_opt.unwrap();
         let postings_source_opt = self.postings_composite.open_read(field);

-        if postings_source_opt.is_none() {
+        if postings_source_opt.is_none() || record_option_opt.is_none() {
             // no documents in the segment contained this field.
             // As a result, no data is associated to the inverted index.
             //
             // Returns an empty inverted index.
-            return Arc::new(InvertedIndexReader::empty(field_type));
+            let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
+            return Arc::new(InvertedIndexReader::empty(record_option));
         }

+        let record_option = record_option_opt.unwrap();
         let postings_source = postings_source_opt.unwrap();

         let termdict_source = self.termdict_composite.open_read(field).expect(
@@ -339,7 +349,7 @@ mod test {
         let name = schema.get_field("name").unwrap();
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(name => "tantivy"));
             index_writer.add_document(doc!(name => "horse"));
             index_writer.add_document(doc!(name => "jockey"));
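
The `get_fieldnorms_reader` change above swaps a panic for the `Option::ok_or_else` idiom, so the error string is built lazily and only on the missing-field path. The generic shape of that refactor, with toy types:

    use std::collections::HashMap;

    fn lookup(map: &HashMap<u32, String>, key: u32) -> Result<&String, String> {
        // ok_or_else defers error construction until it is actually needed.
        map.get(&key)
            .ok_or_else(|| format!("field {:?} not found; was it marked as indexed?", key))
    }

    fn main() {
        let map = HashMap::from([(1, "fieldnorm".to_string())]);
        assert!(lookup(&map, 1).is_ok());
        assert!(lookup(&map, 2).is_err());
    }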

View File

@@ -1,3 +1,5 @@
+use slog::{error, Logger};
+
 use crate::directory::directory_lock::Lock;
 use crate::directory::error::LockError;
 use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
@@ -64,7 +66,10 @@ impl<T: Send + Sync + 'static> From<Box<T>> for DirectoryLock {
 impl Drop for DirectoryLockGuard {
     fn drop(&mut self) {
         if let Err(e) = self.directory.delete(&*self.path) {
-            error!("Failed to remove the lock file. {:?}", e);
+            error!(
+                self.directory.logger(),
+                "Failed to remove the lock file. {:?}", e
+            );
         }
     }
 }
@@ -80,7 +85,7 @@ fn try_acquire_lock(
 ) -> Result<DirectoryLock, TryAcquireLockError> {
     let mut write = directory.open_write(filepath).map_err(|e| match e {
         OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
-        OpenWriteError::IOError(io_error) => TryAcquireLockError::IOError(io_error.into()),
+        OpenWriteError::IOError { io_error, .. } => TryAcquireLockError::IOError(io_error),
     })?;
     write.flush().map_err(TryAcquireLockError::IOError)?;
     Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
@@ -209,6 +214,9 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
     /// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
     /// `OnCommit` `ReloadPolicy` to work properly.
     fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>;
+
+    /// Returns the `slog::Logger` configured for the `Directory`.
+    fn logger(&self) -> &Logger;
 }

 /// DirectoryClone

View File

@@ -1,160 +1,60 @@
 use crate::Version;
-use std::error::Error as StdError;
 use std::fmt;
 use std::io;
 use std::path::PathBuf;

 /// Error while trying to acquire a directory lock.
-#[derive(Debug, Fail)]
+#[derive(Debug, Error)]
 pub enum LockError {
     /// Failed to acquired a lock as it is already held by another
     /// client.
     /// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
     /// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
-    #[fail(
-        display = "Could not acquire lock as it is already held, possibly by a different process."
-    )]
+    #[error("Could not acquire lock as it is already held, possibly by a different process.")]
     LockBusy,
     /// Trying to acquire a lock failed with an `IOError`
-    #[fail(display = "Failed to acquire the lock due to an io:Error.")]
+    #[error("Failed to acquire the lock due to an io:Error.")]
     IOError(io::Error),
 }

-/// General IO error with an optional path to the offending file.
-#[derive(Debug)]
-pub struct IOError {
-    path: Option<PathBuf>,
-    err: io::Error,
-}
-
-impl Into<io::Error> for IOError {
-    fn into(self) -> io::Error {
-        self.err
-    }
-}
-
-impl fmt::Display for IOError {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self.path {
-            Some(ref path) => write!(f, "io error occurred on path '{:?}': '{}'", path, self.err),
-            None => write!(f, "io error occurred: '{}'", self.err),
-        }
-    }
-}
-
-impl StdError for IOError {
-    fn description(&self) -> &str {
-        "io error occurred"
-    }
-    fn cause(&self) -> Option<&dyn StdError> {
-        Some(&self.err)
-    }
-}
-
-impl IOError {
-    pub(crate) fn with_path(path: PathBuf, err: io::Error) -> Self {
-        IOError {
-            path: Some(path),
-            err,
-        }
-    }
-}
-
-impl From<io::Error> for IOError {
-    fn from(err: io::Error) -> IOError {
-        IOError { path: None, err }
-    }
-}
-
 /// Error that may occur when opening a directory
-#[derive(Debug)]
+#[derive(Debug, Error)]
 pub enum OpenDirectoryError {
     /// The underlying directory does not exists.
+    #[error("Directory does not exist: '{0}'.")]
     DoesNotExist(PathBuf),
     /// The path exists but is not a directory.
+    #[error("Path exists but is not a directory: '{0}'.")]
     NotADirectory(PathBuf),
+    /// Failed to create a temp directory.
+    #[error("Failed to create a temporary directory: '{0}'.")]
+    FailedToCreateTempDir(io::Error),
     /// IoError
-    IoError(io::Error),
-}
-
-impl From<io::Error> for OpenDirectoryError {
-    fn from(io_err: io::Error) -> Self {
-        OpenDirectoryError::IoError(io_err)
-    }
-}
-
-impl fmt::Display for OpenDirectoryError {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match *self {
-            OpenDirectoryError::DoesNotExist(ref path) => {
-                write!(f, "the underlying directory '{:?}' does not exist", path)
-            }
-            OpenDirectoryError::NotADirectory(ref path) => {
-                write!(f, "the path '{:?}' exists but is not a directory", path)
-            }
-            OpenDirectoryError::IoError(ref err) => write!(
-                f,
-                "IOError while trying to open/create the directory. {:?}",
-                err
-            ),
-        }
-    }
-}
-
-impl StdError for OpenDirectoryError {
-    fn description(&self) -> &str {
-        "error occurred while opening a directory"
-    }
-    fn cause(&self) -> Option<&dyn StdError> {
-        None
-    }
+    #[error("IOError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
+    IoError {
+        /// underlying io Error.
+        io_error: io::Error,
+        /// directory we tried to open.
+        directory_path: PathBuf,
+    },
 }

 /// Error that may occur when starting to write in a file
-#[derive(Debug)]
+#[derive(Debug, Error)]
 pub enum OpenWriteError {
     /// Our directory is WORM, writing an existing file is forbidden.
     /// Checkout the `Directory` documentation.
+    #[error("File already exists: '{0}'")]
     FileAlreadyExists(PathBuf),
     /// Any kind of IO error that happens when
     /// writing in the underlying IO device.
-    IOError(IOError),
-}
-
-impl From<IOError> for OpenWriteError {
-    fn from(err: IOError) -> OpenWriteError {
-        OpenWriteError::IOError(err)
-    }
-}
-
-impl fmt::Display for OpenWriteError {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match *self {
-            OpenWriteError::FileAlreadyExists(ref path) => {
-                write!(f, "the file '{:?}' already exists", path)
-            }
-            OpenWriteError::IOError(ref err) => write!(
-                f,
-                "an io error occurred while opening a file for writing: '{}'",
-                err
-            ),
-        }
-    }
-}
-
-impl StdError for OpenWriteError {
-    fn description(&self) -> &str {
-        "error occurred while opening a file for writing"
-    }
-    fn cause(&self) -> Option<&dyn StdError> {
-        match *self {
-            OpenWriteError::FileAlreadyExists(_) => None,
-            OpenWriteError::IOError(ref err) => Some(err),
-        }
-    }
+    #[error("IOError '{io_error:?}' while opening file for write: '{filepath}'.")]
+    IOError {
+        /// The underlying `io::Error`.
+        io_error: io::Error,
+        /// File path of the file that tantivy failed to open for write.
+        filepath: PathBuf,
+    },
 }

 /// Type of index incompatibility between the library and the index found on disk
@@ -217,55 +117,41 @@ impl fmt::Debug for Incompatibility {
 }

 /// Error that may occur when accessing a file read
-#[derive(Debug)]
+#[derive(Debug, Error)]
 pub enum OpenReadError {
     /// The file does not exists.
+    #[error("Files does not exists: {0:?}")]
     FileDoesNotExist(PathBuf),
-    /// Any kind of IO error that happens when
-    /// interacting with the underlying IO device.
-    IOError(IOError),
-    /// This library doesn't support the index version found on disk
+    /// Any kind of io::Error.
+    #[error(
+        "IOError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
+    )]
+    IOError {
+        /// The underlying `io::Error`.
+        io_error: io::Error,
+        /// File path of the file that tantivy failed to open for read.
+        filepath: PathBuf,
+    },
+    /// This library does not support the index version found in file footer.
+    #[error("Index version unsupported: {0:?}")]
     IncompatibleIndex(Incompatibility),
 }

-impl From<IOError> for OpenReadError {
-    fn from(err: IOError) -> OpenReadError {
-        OpenReadError::IOError(err)
-    }
-}
-
-impl fmt::Display for OpenReadError {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match *self {
-            OpenReadError::FileDoesNotExist(ref path) => {
-                write!(f, "the file '{:?}' does not exist", path)
-            }
-            OpenReadError::IOError(ref err) => write!(
-                f,
-                "an io error occurred while opening a file for reading: '{}'",
-                err
-            ),
-            OpenReadError::IncompatibleIndex(ref footer) => {
-                write!(f, "Incompatible index format: {:?}", footer)
-            }
-        }
-    }
-}
-
 /// Error that may occur when trying to delete a file
-#[derive(Debug)]
+#[derive(Debug, Error)]
 pub enum DeleteError {
     /// The file does not exists.
+    #[error("File does not exists: '{0}'.")]
    FileDoesNotExist(PathBuf),
     /// Any kind of IO error that happens when
     /// interacting with the underlying IO device.
-    IOError(IOError),
-}
-
-impl From<IOError> for DeleteError {
-    fn from(err: IOError) -> DeleteError {
-        DeleteError::IOError(err)
-    }
-}
+    #[error("The following IO error happened while deleting file '{filepath}': '{io_error:?}'.")]
+    IOError {
+        /// The underlying `io::Error`.
+        io_error: io::Error,
+        /// File path of the file that tantivy failed to delete.
+        filepath: PathBuf,
+    },
+}

 impl From<Incompatibility> for OpenReadError {
@@ -273,29 +159,3 @@ impl From<Incompatibility> for OpenReadError {
         OpenReadError::IncompatibleIndex(incompatibility)
     }
 }
-
-impl fmt::Display for DeleteError {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match *self {
-            DeleteError::FileDoesNotExist(ref path) => {
-                write!(f, "the file '{:?}' does not exist", path)
-            }
-            DeleteError::IOError(ref err) => {
-                write!(f, "an io error occurred while deleting a file: '{}'", err)
-            }
-        }
-    }
-}
-
-impl StdError for DeleteError {
-    fn description(&self) -> &str {
-        "error occurred while deleting a file"
-    }
-    fn cause(&self) -> Option<&dyn StdError> {
-        match *self {
-            DeleteError::FileDoesNotExist(_) => None,
-            DeleteError::IOError(ref err) => Some(err),
-        }
-    }
-}
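
The whole error.rs rewrite above follows one pattern: each hand-written `Display`/`StdError` impl collapses into a `#[derive(Error)]` plus `#[error(...)]` attributes. A minimal self-contained sketch of that pattern (the type name is invented; the attribute strings are modeled on the diff):

    use std::io;
    use std::path::PathBuf;
    use thiserror::Error;

    #[derive(Debug, Error)]
    enum OpenFileError {
        // The #[error(...)] attribute generates the Display impl.
        #[error("File does not exist: '{0:?}'.")]
        DoesNotExist(PathBuf),
        #[error("IOError '{io_error:?}' while opening: '{filepath:?}'.")]
        IOError {
            io_error: io::Error,
            filepath: PathBuf,
        },
    }

    fn main() {
        let err = OpenFileError::DoesNotExist(PathBuf::from("meta.json"));
        // Display and std::error::Error both come from the derive.
        println!("{}", err);
    }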

View File

@@ -271,7 +271,11 @@ mod tests {
         let mut vec = Vec::new();
         let footer_proxy = FooterProxy::new(&mut vec);
         assert!(footer_proxy.terminate().is_ok());
-        assert_eq!(vec.len(), 167);
+        if crate::store::COMPRESSION == "lz4" {
+            assert_eq!(vec.len(), 158);
+        } else {
+            assert_eq!(vec.len(), 167);
+        }
         let footer = Footer::deserialize(&mut &vec[..]).unwrap();
         assert!(matches!(
             footer.versioned_footer,

View File

@@ -1,5 +1,5 @@
-use crate::core::MANAGED_FILEPATH;
-use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
+use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
+use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
 use crate::directory::footer::{Footer, FooterProxy};
 use crate::directory::DirectoryLock;
 use crate::directory::GarbageCollectionResult;
@@ -11,9 +11,9 @@ use crate::error::DataCorruption;
 use crate::Directory;
 use crc32fast::Hasher;
+use slog::{debug, error, info};
 use std::collections::HashSet;
 use std::io;
-use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::result;
 use std::sync::RwLockWriteGuard;
@@ -56,9 +56,9 @@ fn save_managed_paths(
     directory: &mut dyn Directory,
     wlock: &RwLockWriteGuard<'_, MetaInformation>,
 ) -> io::Result<()> {
-    let mut w = serde_json::to_vec(&wlock.managed_paths)?;
-    writeln!(&mut w)?;
-    directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
+    let mut managed_json = serde_json::to_string_pretty(&wlock.managed_paths)?;
+    managed_json.push_str("\n");
+    directory.atomic_write(&MANAGED_FILEPATH, managed_json.as_bytes())?;
     Ok(())
 }
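The managed-paths file is now written as pretty-printed JSON with a trailing newline, which makes it diffable and easy to inspect by hand. A standalone sketch of the same serialization step, assuming any serializable collection of paths:

    use std::collections::HashSet;
    use std::path::PathBuf;

    fn managed_paths_json(paths: &HashSet<PathBuf>) -> serde_json::Result<String> {
        // Pretty-print for readability, then terminate with a newline
        // so the file ends cleanly.
        let mut json = serde_json::to_string_pretty(paths)?;
        json.push('\n');
        Ok(json)
    }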
@@ -86,7 +86,12 @@ impl ManagedDirectory {
             directory: Box::new(directory),
             meta_informations: Arc::default(),
         }),
-        Err(OpenReadError::IOError(e)) => Err(From::from(e)),
+        Err(OpenReadError::IOError { io_error, filepath }) => {
+            Err(crate::TantivyError::OpenReadError(OpenReadError::IOError {
+                io_error,
+                filepath,
+            }))
+        }
         Err(OpenReadError::IncompatibleIndex(incompatibility)) => {
             // For the moment, this should never happen: `meta.json`
             // does not have any footer and cannot detect incompatibility.
@@ -113,7 +118,7 @@ impl ManagedDirectory {
         &mut self,
         get_living_files: L,
     ) -> crate::Result<GarbageCollectionResult> {
-        info!("Garbage collect");
+        info!(self.directory.logger(), "gc"; "stage"=>"start");
         let mut files_to_delete = vec![];
         // It is crucial to get the living files after acquiring the
@@ -148,7 +153,7 @@ impl ManagedDirectory {
             }
         }
         Err(err) => {
-            error!("Failed to acquire lock for GC");
+            error!(self.logger(), "Failed to acquire lock for GC");
             return Err(crate::TantivyError::from(err));
         }
     }
@@ -160,7 +165,7 @@ impl ManagedDirectory {
     for file_to_delete in files_to_delete {
         match self.delete(&file_to_delete) {
             Ok(_) => {
-                info!("Deleted {:?}", file_to_delete);
+                debug!(self.logger(), "deleted-success"; "file"=>format!("{:?}", file_to_delete));
                 deleted_files.push(file_to_delete);
             }
             Err(file_error) => {
@@ -168,12 +173,12 @@ impl ManagedDirectory {
                 DeleteError::FileDoesNotExist(_) => {
                     deleted_files.push(file_to_delete.clone());
                 }
-                DeleteError::IOError(_) => {
+                DeleteError::IOError { .. } => {
                     failed_to_delete_files.push(file_to_delete.clone());
                     if !cfg!(target_os = "windows") {
                         // On windows, delete is expected to fail if the file
                         // is mmapped.
-                        error!("Failed to delete {:?}", file_to_delete);
+                        error!(self.logger(), "delete-file-fail"; "path"=>file_to_delete.to_str().unwrap_or("<invalid-utf8>"));
                     }
                 }
             }
@@ -195,6 +200,10 @@ impl ManagedDirectory {
         save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?;
     }
+    info!(self.directory.logger(), "gc"; "stage"=>"end",
+        "num-sucess-file-deletes"=>deleted_files.len(),
+        "num-failed-file-deletes"=>failed_to_delete_files.len());
     Ok(GarbageCollectionResult {
         deleted_files,
         failed_to_delete_files,
@@ -231,8 +240,11 @@ impl ManagedDirectory {
     /// Verify checksum of a managed file
     pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> {
         let reader = self.directory.open_read(path)?;
-        let (footer, data) = Footer::extract_footer(reader)
-            .map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
+        let (footer, data) =
+            Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IOError {
+                io_error,
+                filepath: path.to_path_buf(),
+            })?;
         let mut hasher = Hasher::new();
         hasher.update(data.as_slice());
         let crc = hasher.finalize();
@@ -245,35 +257,46 @@ impl ManagedDirectory {
     /// List files for which checksum does not match content
     pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
-        let mut hashset = HashSet::new();
-        let managed_paths = self
+        let mut managed_paths = self
             .meta_informations
             .read()
             .expect("Managed directory rlock poisoned in list damaged.")
             .managed_paths
             .clone();
-        for path in managed_paths.into_iter() {
+        managed_paths.remove(*META_FILEPATH);
+        let mut damaged_files = HashSet::new();
+        for path in managed_paths {
             if !self.validate_checksum(&path)? {
-                hashset.insert(path);
+                damaged_files.insert(path);
             }
         }
-        Ok(hashset)
+        Ok(damaged_files)
     }
 }
 impl Directory for ManagedDirectory {
     fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
+        slog::debug!(self.logger(), "open-read"; "path" => path.to_str().unwrap_or("<invalid-utf8>"));
         let read_only_source = self.directory.open_read(path)?;
-        let (footer, reader) = Footer::extract_footer(read_only_source)
-            .map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
+        let (footer, reader) = Footer::extract_footer(read_only_source).map_err(|io_error| {
+            OpenReadError::IOError {
+                io_error,
+                filepath: path.to_path_buf(),
+            }
+        })?;
         footer.is_compatible()?;
         Ok(reader)
     }
     fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
+        slog::debug!(self.logger(), "open-write"; "path" => path.to_str().unwrap_or("<invalid-utf8>"));
         self.register_file_as_managed(path)
-            .map_err(|e| IOError::with_path(path.to_owned(), e))?;
+            .map_err(|io_error| OpenWriteError::IOError {
+                io_error,
+                filepath: path.to_path_buf(),
+            })?;
         Ok(io::BufWriter::new(Box::new(FooterProxy::new(
             self.directory
                 .open_write(path)?
@@ -283,9 +306,11 @@ impl Directory for ManagedDirectory {
         ))))
     }
-    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
+    fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
+        let content_str = std::str::from_utf8(content).unwrap_or("<content-not-utf-8>");
+        slog::debug!(self.logger(), "Atomic write"; "path" => format!("{:?}", path), "content_length"=>content_str);
         self.register_file_as_managed(path)?;
-        self.directory.atomic_write(path, data)
+        self.directory.atomic_write(path, content)
     }
     fn atomic_read(&self, path: &Path) -> result::Result<Vec<u8>, OpenReadError> {
@@ -307,6 +332,10 @@ impl Directory for ManagedDirectory {
     fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
         self.directory.watch(watch_callback)
     }
+    fn logger(&self) -> &slog::Logger {
+        self.directory.logger()
+    }
 }
 impl Clone for ManagedDirectory {
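The logging migration threads a `slog::Logger` through the directory, so garbage collection and file operations emit structured key/value records instead of free-form strings. A minimal sketch of slog's `key => value` syntax, assuming the `slog` and `slog_stdlog` crates used in this diff:

    use slog::{info, o, Drain, Logger};
    use slog_stdlog::StdLog;

    fn example_gc_logging() {
        // Route slog records through the standard `log` crate facade.
        let logger = Logger::root(StdLog.fuse(), o!("component" => "managed-directory"));
        // Structured key=>value pairs, as used by the gc logging above.
        info!(logger, "gc"; "stage" => "start");
        info!(logger, "gc"; "stage" => "end", "num-deleted" => 3usize);
    }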

View File

@@ -1,8 +1,6 @@
 use crate::core::META_FILEPATH;
 use crate::directory::error::LockError;
-use crate::directory::error::{
-    DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError,
-};
+use crate::directory::error::{DeleteError, OpenDirectoryError, OpenReadError, OpenWriteError};
 use crate::directory::read_only_source::BoxedData;
 use crate::directory::AntiCallToken;
 use crate::directory::Directory;
@@ -19,6 +17,8 @@ use notify::RawEvent;
 use notify::RecursiveMode;
 use notify::Watcher;
 use serde::{Deserialize, Serialize};
+use slog::{debug, o, Drain, Logger};
+use slog_stdlog::StdLog;
 use std::collections::HashMap;
 use std::convert::From;
 use std::fmt;
@@ -36,11 +36,6 @@ use std::sync::Weak;
 use std::thread;
 use tempfile::TempDir;
-/// Create a default io error given a string.
-pub(crate) fn make_io_err(msg: String) -> io::Error {
-    io::Error::new(io::ErrorKind::Other, msg)
-}
 /// Returns None iff the file exists, can be read, but is empty (and hence
 /// cannot be mmapped)
 fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
@@ -48,13 +43,17 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
         if e.kind() == io::ErrorKind::NotFound {
             OpenReadError::FileDoesNotExist(full_path.to_owned())
         } else {
-            OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
+            OpenReadError::IOError {
+                io_error: e,
+                filepath: full_path.to_owned(),
+            }
         }
     })?;
-    let meta_data = file
-        .metadata()
-        .map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
+    let meta_data = file.metadata().map_err(|e| OpenReadError::IOError {
+        io_error: e,
+        filepath: full_path.to_owned(),
+    })?;
     if meta_data.len() == 0 {
         // if the file size is 0, it will not be possible
         // to mmap the file, so we return None
@@ -64,7 +63,10 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
     unsafe {
         memmap::Mmap::map(&file)
             .map(Some)
-            .map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
+            .map_err(|e| OpenReadError::IOError {
+                io_error: e,
+                filepath: full_path.to_owned(),
+            })
     }
 }
@@ -144,7 +146,7 @@ struct WatcherWrapper {
 }
 impl WatcherWrapper {
-    pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
+    pub(crate) fn new(path: &Path, logger: Logger) -> Result<Self, OpenDirectoryError> {
         let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
         // We need to initialize the
         let watcher = notify::raw_watcher(tx)
@@ -158,7 +160,8 @@ impl WatcherWrapper {
                 panic!("Unknown error while starting watching directory {:?}", path);
             }
         })?;
-        let watcher_router: Arc<WatchCallbackList> = Default::default();
+        let watcher_router: Arc<WatchCallbackList> =
+            Arc::new(WatchCallbackList::with_logger(logger));
         let watcher_router_clone = watcher_router.clone();
         thread::Builder::new()
             .name("meta-file-watch-thread".to_string())
@@ -183,6 +186,10 @@ impl WatcherWrapper {
                     }
                 }
             }
+        })
+        .map_err(|io_error| OpenDirectoryError::IoError {
+            io_error,
+            directory_path: path.to_path_buf(),
         })?;
         Ok(WatcherWrapper {
             _watcher: Mutex::new(watcher),
@@ -217,15 +224,21 @@ struct MmapDirectoryInner {
     mmap_cache: RwLock<MmapCache>,
     _temp_directory: Option<TempDir>,
     watcher: RwLock<Option<WatcherWrapper>>,
+    logger: Logger,
 }
 impl MmapDirectoryInner {
-    fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
+    fn new(
+        root_path: PathBuf,
+        temp_directory: Option<TempDir>,
+        logger: Logger,
+    ) -> MmapDirectoryInner {
         MmapDirectoryInner {
             root_path,
             mmap_cache: Default::default(),
             _temp_directory: temp_directory,
             watcher: RwLock::new(None),
+            logger,
         }
     }
@@ -237,7 +250,7 @@ impl MmapDirectoryInner {
         // The downside is that we might create a watch wrapper that is not useful.
         let need_initialization = self.watcher.read().unwrap().is_none();
         if need_initialization {
-            let watch_wrapper = WatcherWrapper::new(&self.root_path)?;
+            let watch_wrapper = WatcherWrapper::new(&self.root_path, self.logger.clone())?;
             let mut watch_wlock = self.watcher.write().unwrap();
             // the watcher could have been initialized when we released the lock, and
             // we do not want to lose the watched files that were set.
@@ -260,8 +273,8 @@ impl fmt::Debug for MmapDirectory {
 }
 impl MmapDirectory {
-    fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectory {
-        let inner = MmapDirectoryInner::new(root_path, temp_directory);
+    fn new(root_path: PathBuf, temp_directory: Option<TempDir>, logger: Logger) -> MmapDirectory {
+        let inner = MmapDirectoryInner::new(root_path, temp_directory, logger);
         MmapDirectory {
             inner: Arc::new(inner),
         }
@@ -272,16 +285,19 @@ impl MmapDirectory {
     /// This is mostly useful to test the MmapDirectory itself.
     /// For your unit tests, prefer the RAMDirectory.
     pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
-        let tempdir = TempDir::new().map_err(OpenDirectoryError::IoError)?;
-        let tempdir_path = PathBuf::from(tempdir.path());
-        Ok(MmapDirectory::new(tempdir_path, Some(tempdir)))
+        let tempdir = TempDir::new().map_err(OpenDirectoryError::FailedToCreateTempDir)?;
+        let logger = Logger::root(StdLog.fuse(), o!());
+        Ok(MmapDirectory::new(tempdir.path().to_owned(), Some(tempdir), logger))
     }
     /// Opens a MmapDirectory in a directory.
     ///
     /// Returns an error if the `directory_path` does not
     /// exist or if it is not a directory.
-    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
+    pub fn open_with_logger<P: AsRef<Path>>(
+        directory_path: P,
+        logger: Logger,
+    ) -> Result<MmapDirectory, OpenDirectoryError> {
         let directory_path: &Path = directory_path.as_ref();
         if !directory_path.exists() {
             Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
@@ -292,10 +308,20 @@ impl MmapDirectory {
             directory_path,
         )))
         } else {
-            Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
+            Ok(MmapDirectory::new(
+                PathBuf::from(directory_path),
+                None,
+                logger,
+            ))
         }
     }
+    /// Creates an `MmapDirectory` at the given path.
+    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
+        let logger = Logger::root(StdLog.fuse(), o!());
+        Self::open_with_logger(directory_path, logger)
+    }
     /// Joins a relative_path to the directory `root_path`
     /// to create a proper complete `filepath`.
     fn resolve_path(&self, relative_path: &Path) -> PathBuf {
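`open()` now defaults to a slog logger backed by the standard `log` facade, while `open_with_logger` lets callers inject their own. A usage sketch of the API introduced in this diff (the directory path is a placeholder, and the function name is hypothetical):

    use slog::{o, Discard, Logger};
    use tantivy::directory::error::OpenDirectoryError;
    use tantivy::directory::MmapDirectory;

    // Open a directory with a logger that drops every record; handy in tests.
    fn open_quiet() -> Result<MmapDirectory, OpenDirectoryError> {
        let logger = Logger::root(Discard, o!());
        MmapDirectory::open_with_logger("/tmp/my-index", logger)
    }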
@@ -355,11 +381,12 @@ impl MmapDirectory {
 struct ReleaseLockFile {
     _file: File,
     path: PathBuf,
+    logger: Logger,
 }
 impl Drop for ReleaseLockFile {
     fn drop(&mut self) {
-        debug!("Releasing lock {:?}", self.path);
+        debug!(self.logger, "Releasing lock {:?}", self.path);
     }
 }
@@ -398,16 +425,18 @@ impl TerminatingWrite for SafeFileWriter {
 impl Directory for MmapDirectory {
     fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
-        debug!("Open Read {:?}", path);
         let full_path = self.resolve_path(path);
         let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
             let msg = format!(
                 "Failed to acquire write lock \
                  on mmap cache while reading {:?}",
                 path
             );
-            IOError::with_path(path.to_owned(), make_io_err(msg))
+            let io_error = io::Error::new(io::ErrorKind::Other, msg);
+            OpenReadError::IOError {
+                io_error,
+                filepath: path.to_owned(),
+            }
         })?;
         Ok(mmap_cache
             .get_mmap(&full_path)?
@@ -420,14 +449,18 @@ impl Directory for MmapDirectory {
     fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
         let full_path = self.resolve_path(path);
         match fs::remove_file(&full_path) {
-            Ok(_) => self
-                .sync_directory()
-                .map_err(|e| IOError::with_path(path.to_owned(), e).into()),
+            Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError {
+                io_error: e,
+                filepath: path.to_path_buf(),
+            }),
             Err(e) => {
                 if e.kind() == io::ErrorKind::NotFound {
                     Err(DeleteError::FileDoesNotExist(path.to_owned()))
                 } else {
-                    Err(IOError::with_path(path.to_owned(), e).into())
+                    Err(DeleteError::IOError {
+                        io_error: e,
+                        filepath: path.to_path_buf(),
+                    })
                 }
             }
         }
@@ -439,9 +472,7 @@ impl Directory for MmapDirectory {
     }
     fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
-        debug!("Open Write {:?}", path);
         let full_path = self.resolve_path(path);
         let open_res = OpenOptions::new()
             .write(true)
             .create_new(true)
@@ -451,18 +482,25 @@ impl Directory for MmapDirectory {
             if err.kind() == io::ErrorKind::AlreadyExists {
                 OpenWriteError::FileAlreadyExists(path.to_owned())
             } else {
-                IOError::with_path(path.to_owned(), err).into()
+                OpenWriteError::IOError {
+                    io_error: err,
+                    filepath: path.to_owned(),
+                }
             }
         })?;
         // making sure the file is created.
-        file.flush()
-            .map_err(|e| IOError::with_path(path.to_owned(), e))?;
+        file.flush().map_err(|io_error| OpenWriteError::IOError {
+            io_error,
+            filepath: path.to_owned(),
+        })?;
         // Apparently, on some filesystems, syncing the parent
         // directory is required.
-        self.sync_directory()
-            .map_err(|e| IOError::with_path(path.to_owned(), e))?;
+        self.sync_directory().map_err(|e| OpenWriteError::IOError {
+            io_error: e,
+            filepath: path.to_owned(),
+        })?;
         let writer = SafeFileWriter::new(file);
         Ok(BufWriter::new(Box::new(writer)))
@@ -474,24 +512,31 @@ impl Directory for MmapDirectory {
         match File::open(&full_path) {
             Ok(mut file) => {
                 file.read_to_end(&mut buffer)
-                    .map_err(|e| IOError::with_path(path.to_owned(), e))?;
+                    .map_err(|io_error| OpenReadError::IOError {
+                        io_error,
+                        filepath: path.to_owned(),
+                    })?;
                 Ok(buffer)
             }
-            Err(e) => {
-                if e.kind() == io::ErrorKind::NotFound {
+            Err(io_error) => {
+                if io_error.kind() == io::ErrorKind::NotFound {
                     Err(OpenReadError::FileDoesNotExist(path.to_owned()))
                 } else {
-                    Err(IOError::with_path(path.to_owned(), e).into())
+                    Err(OpenReadError::IOError {
+                        io_error,
+                        filepath: path.to_owned(),
+                    })
                 }
             }
         }
     }
-    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
-        debug!("Atomic Write {:?}", path);
+    fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
+        let mut tempfile = tempfile::Builder::new().tempfile_in(&self.inner.root_path)?;
+        tempfile.write_all(content)?;
+        tempfile.flush()?;
         let full_path = self.resolve_path(path);
-        let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
-        meta_file.write(|f| f.write_all(data))?;
+        tempfile.into_temp_path().persist(full_path)?;
         Ok(())
     }
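This is where the `atomicwrites` dependency is dropped: the new `atomic_write` creates a tempfile in the same directory as the target and persists (renames) it over the destination, which is atomic on POSIX filesystems as long as source and destination live on the same filesystem. A standalone sketch of the same pattern using the `tempfile` crate (the function and paths here are illustrative, not the library's API):

    use std::io::{self, Write};
    use std::path::Path;

    // Write `content` to `dest` atomically: readers see either the old
    // file or the new one, never a partial write.
    fn atomic_write(dir: &Path, dest: &Path, content: &[u8]) -> io::Result<()> {
        // The tempfile must live in the same directory (same filesystem)
        // so the final rename is atomic.
        let mut tmp = tempfile::Builder::new().tempfile_in(dir)?;
        tmp.write_all(content)?;
        tmp.flush()?;
        // `persist` renames the tempfile over the destination.
        tmp.into_temp_path().persist(dest)?;
        Ok(())
    }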
@@ -508,16 +553,22 @@ impl Directory for MmapDirectory {
         } else {
             file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
         }
+        let logger = self.inner.logger.clone();
         // dropping the file handle will release the lock.
         Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
             path: lock.filepath.clone(),
             _file: file,
+            logger,
         })))
     }
     fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
         self.inner.watch(watch_callback)
     }
+    fn logger(&self) -> &Logger {
+        &self.inner.logger
+    }
 }
 #[cfg(test)]
@@ -627,7 +678,8 @@ mod tests {
     let counter_clone = counter.clone();
     let tmp_dir = tempfile::TempDir::new().unwrap();
     let tmp_dirpath = tmp_dir.path().to_owned();
-    let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
+    let logger = Logger::root(slog::Discard, o!());
+    let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath, logger).unwrap();
     let tmp_file = tmp_dirpath.join(*META_FILEPATH);
     let _handle = watch_wrapper.watch(Box::new(move || {
         counter_clone.fetch_add(1, Ordering::SeqCst);
@@ -652,7 +704,7 @@ mod tests {
     {
         let index = Index::create(mmap_directory.clone(), schema).unwrap();
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         let mut log_merge_policy = LogMergePolicy::default();
         log_merge_policy.set_min_merge_size(3);
         index_writer.set_merge_policy(Box::new(log_merge_policy));

View File

@@ -23,7 +23,8 @@ pub use self::directory::{Directory, DirectoryClone};
 pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
 pub use self::ram_directory::RAMDirectory;
 pub use self::read_only_source::ReadOnlySource;
-pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
+pub(crate) use self::watch_event_router::WatchCallbackList;
+pub use self::watch_event_router::{WatchCallback, WatchHandle};
 use std::io::{self, BufWriter, Write};
 use std::path::PathBuf;
 /// Outcome of the Garbage collection

View File

@@ -5,6 +5,8 @@ use crate::directory::WatchCallbackList;
 use crate::directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
 use crate::directory::{TerminatingWrite, WritePtr};
 use fail::fail_point;
+use slog::{o, Drain, Logger};
+use slog_stdlog::StdLog;
 use std::collections::HashMap;
 use std::fmt;
 use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
@@ -66,7 +68,7 @@ impl Write for VecWriter {
     fn flush(&mut self) -> io::Result<()> {
         self.is_flushed = true;
-        let mut fs = self.shared_directory.fs.write().unwrap();
+        let mut fs = self.shared_directory.fs.inner_directory.write().unwrap();
         fs.write(self.path.clone(), self.data.get_ref());
         Ok(())
     }
@@ -78,13 +80,19 @@ impl TerminatingWrite for VecWriter {
     }
 }
-#[derive(Default)]
 struct InnerDirectory {
     fs: HashMap<PathBuf, ReadOnlySource>,
     watch_router: WatchCallbackList,
 }
 impl InnerDirectory {
+    fn with_logger(logger: Logger) -> Self {
+        InnerDirectory {
+            fs: Default::default(),
+            watch_router: WatchCallbackList::with_logger(logger.clone()),
+        }
+    }
     fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
         let data = ReadOnlySource::new(Vec::from(data));
         self.fs.insert(path, data).is_some()
@@ -117,20 +125,32 @@ impl InnerDirectory {
     }
 }
+impl Default for RAMDirectory {
+    fn default() -> RAMDirectory {
+        let logger = Logger::root(StdLog.fuse(), o!());
+        Self::with_logger(logger)
+    }
+}
 impl fmt::Debug for RAMDirectory {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "RAMDirectory")
     }
 }
+struct Inner {
+    inner_directory: RwLock<InnerDirectory>,
+    logger: Logger,
+}
 /// A Directory storing everything in anonymous memory.
 ///
 /// It is mainly meant for unit testing.
 /// Writes are only made visible upon flushing.
 ///
-#[derive(Clone, Default)]
+#[derive(Clone)]
 pub struct RAMDirectory {
-    fs: Arc<RwLock<InnerDirectory>>,
+    fs: Arc<Inner>,
 }
 impl RAMDirectory {
@@ -139,10 +159,21 @@ impl RAMDirectory {
         Self::default()
     }
+    /// Create a `RAMDirectory` with a custom logger.
+    pub fn with_logger(logger: Logger) -> RAMDirectory {
+        let inner_directory = InnerDirectory::with_logger(logger.clone()).into();
+        RAMDirectory {
+            fs: Arc::new(Inner {
+                inner_directory,
+                logger,
+            }),
+        }
+    }
     /// Returns the sum of the size of the different files
     /// in the RAMDirectory.
     pub fn total_mem_usage(&self) -> usize {
-        self.fs.read().unwrap().total_mem_usage()
+        self.fs.inner_directory.read().unwrap().total_mem_usage()
     }
     /// Write a copy of all of the files saved in the RAMDirectory in the target `Directory`.
@@ -152,7 +183,7 @@ impl RAMDirectory {
     ///
     /// If an error is encountered, files may be persisted partially.
     pub fn persist(&self, dest: &mut dyn Directory) -> crate::Result<()> {
-        let wlock = self.fs.write().unwrap();
+        let wlock = self.fs.inner_directory.write().unwrap();
         for (path, source) in wlock.fs.iter() {
             let mut dest_wrt = dest.open_write(path)?;
             dest_wrt.write_all(source.as_slice())?;
@@ -164,24 +195,25 @@ impl RAMDirectory {
 impl Directory for RAMDirectory {
     fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
-        self.fs.read().unwrap().open_read(path)
+        self.fs.inner_directory.read().unwrap().open_read(path)
     }
     fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
         fail_point!("RAMDirectory::delete", |_| {
-            use crate::directory::error::IOError;
-            let io_error = IOError::from(io::Error::from(io::ErrorKind::Other));
-            Err(DeleteError::from(io_error))
+            Err(DeleteError::IOError {
+                io_error: io::Error::from(io::ErrorKind::Other),
+                filepath: path.to_path_buf(),
+            })
         });
-        self.fs.write().unwrap().delete(path)
+        self.fs.inner_directory.write().unwrap().delete(path)
     }
     fn exists(&self, path: &Path) -> bool {
-        self.fs.read().unwrap().exists(path)
+        self.fs.inner_directory.read().unwrap().exists(path)
     }
     fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
-        let mut fs = self.fs.write().unwrap();
+        let mut fs = self.fs.inner_directory.write().unwrap();
         let path_buf = PathBuf::from(path);
         let vec_writer = VecWriter::new(path_buf.clone(), self.clone());
         let exists = fs.write(path_buf.clone(), &[]);
@@ -205,19 +237,38 @@ impl Directory for RAMDirectory {
         let path_buf = PathBuf::from(path);
         // Reserve the path to prevent calls to .write() to succeed.
-        self.fs.write().unwrap().write(path_buf.clone(), &[]);
+        self.fs
+            .inner_directory
+            .write()
+            .unwrap()
+            .write(path_buf.clone(), &[]);
         let mut vec_writer = VecWriter::new(path_buf, self.clone());
         vec_writer.write_all(data)?;
         vec_writer.flush()?;
         if path == Path::new(&*META_FILEPATH) {
-            let _ = self.fs.write().unwrap().watch_router.broadcast();
+            let _ = self
+                .fs
+                .inner_directory
+                .write()
+                .unwrap()
+                .watch_router
+                .broadcast();
         }
         Ok(())
     }
     fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
-        Ok(self.fs.write().unwrap().watch(watch_callback))
+        Ok(self
+            .fs
+            .inner_directory
+            .write()
+            .unwrap()
+            .watch(watch_callback))
+    }
+    fn logger(&self) -> &Logger {
+        &self.fs.logger
     }
 }
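`RAMDirectory` now wraps its state in an `Inner` struct so the logger can ride along with the file map behind the same `Arc`. A short usage sketch, assuming the `with_logger` constructor from this diff:

    use slog::{o, Discard, Logger};
    use tantivy::directory::RAMDirectory;

    fn ram_dir_for_tests() -> RAMDirectory {
        // Discard suppresses all log output; RAMDirectory::create()
        // would instead default to the `log`-facade-backed logger.
        RAMDirectory::with_logger(Logger::root(Discard, o!()))
    }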

View File

@@ -211,19 +211,19 @@ fn test_watch(directory: &mut dyn Directory) {
         .unwrap();
     for i in 0..10 {
-        assert_eq!(i, counter.load(SeqCst));
+        assert!(i <= counter.load(SeqCst));
         assert!(directory
             .atomic_write(Path::new("meta.json"), b"random_test_data_2")
             .is_ok());
         assert_eq!(receiver.recv_timeout(Duration::from_millis(500)), Ok(i));
-        assert_eq!(i + 1, counter.load(SeqCst));
+        assert!(i + 1 <= counter.load(SeqCst)); // notify can trigger more than once.
     }
     mem::drop(watch_handle);
     assert!(directory
         .atomic_write(Path::new("meta.json"), b"random_test_data")
         .is_ok());
     assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok());
-    assert_eq!(10, counter.load(SeqCst));
+    assert!(10 <= counter.load(SeqCst));
 }
 fn test_lock_non_blocking(directory: &mut dyn Directory) {

View File

@@ -1,5 +1,6 @@
 use futures::channel::oneshot;
 use futures::{Future, TryFutureExt};
+use slog::{error, Logger};
 use std::sync::Arc;
 use std::sync::RwLock;
 use std::sync::Weak;
@@ -11,9 +12,9 @@ pub type WatchCallback = Box<dyn Fn() + Sync + Send>;
 ///
 /// It registers callbacks (See `.subscribe(...)`) and
 /// calls them upon calls to `.broadcast(...)`.
-#[derive(Default)]
-pub struct WatchCallbackList {
+pub(crate) struct WatchCallbackList {
     router: RwLock<Vec<Weak<WatchCallback>>>,
+    logger: Logger,
 }
 /// Controls how long a directory should watch for a file change.
@@ -32,6 +33,13 @@ impl WatchHandle {
 }
 impl WatchCallbackList {
+    pub fn with_logger(logger: Logger) -> Self {
+        WatchCallbackList {
+            logger,
+            router: Default::default(),
+        }
+    }
     /// Subscribes a new callback and returns a handle that controls the lifetime of the callback.
     pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
         let watch_callback_arc = Arc::new(watch_callback);
@@ -74,8 +82,8 @@ impl WatchCallbackList {
         });
         if let Err(err) = spawn_res {
             error!(
-                "Failed to spawn thread to call watch callbacks. Cause: {:?}",
-                err
+                self.logger,
+                "Failed to spawn thread to call watch callbacks. Cause: {:?}", err
             );
         }
         result
@@ -86,13 +94,18 @@ impl WatchCallbackList {
 mod tests {
     use crate::directory::WatchCallbackList;
     use futures::executor::block_on;
+    use slog::{o, Discard, Logger};
     use std::mem;
     use std::sync::atomic::{AtomicUsize, Ordering};
     use std::sync::Arc;
+    fn default_watch_callback_list() -> WatchCallbackList {
+        WatchCallbackList::with_logger(Logger::root(Discard, o!()))
+    }
     #[test]
     fn test_watch_event_router_simple() {
-        let watch_event_router = WatchCallbackList::default();
+        let watch_event_router = default_watch_callback_list();
         let counter: Arc<AtomicUsize> = Default::default();
         let counter_clone = counter.clone();
         let inc_callback = Box::new(move || {
@@ -119,7 +132,7 @@ mod tests {
     #[test]
     fn test_watch_event_router_multiple_callback_same_key() {
-        let watch_event_router = WatchCallbackList::default();
+        let watch_event_router = default_watch_callback_list();
         let counter: Arc<AtomicUsize> = Default::default();
         let inc_callback = |inc: usize| {
             let counter_clone = counter.clone();
@@ -148,7 +161,7 @@ mod tests {
     #[test]
     fn test_watch_event_router_multiple_callback_different_key() {
-        let watch_event_router = WatchCallbackList::default();
+        let watch_event_router = default_watch_callback_list();
         let counter: Arc<AtomicUsize> = Default::default();
         let counter_clone = counter.clone();
         let inc_callback = Box::new(move || {
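Since `WatchCallbackList` is now crate-private, user code interacts with watching only through `Directory::watch`, where the returned `WatchHandle` controls the callback's lifetime. A hedged sketch against the `Directory` trait shown above (the helper name is hypothetical):

    use tantivy::directory::{Directory, WatchHandle};

    fn watch_meta(directory: &dyn Directory) -> tantivy::Result<WatchHandle> {
        // The callback stays alive only while the handle is kept around;
        // dropping the returned WatchHandle unsubscribes it.
        directory.watch(Box::new(|| println!("meta.json changed")))
    }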

View File

@@ -2,11 +2,13 @@
 use std::io;
-use crate::directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
 use crate::directory::error::{Incompatibility, LockError};
 use crate::fastfield::FastFieldNotAvailableError;
 use crate::query;
-use crate::schema;
+use crate::{
+    directory::error::{OpenDirectoryError, OpenReadError, OpenWriteError},
+    schema,
+};
 use std::fmt;
 use std::path::PathBuf;
 use std::sync::PoisonError;
@@ -43,44 +45,47 @@ impl fmt::Debug for DataCorruption {
     }
 }
-/// The library's failure-based error enum
-#[derive(Debug, Fail)]
+/// The library's error enum
+#[derive(Debug, Error)]
 pub enum TantivyError {
-    /// Path does not exist.
-    #[fail(display = "Path does not exist: '{:?}'", _0)]
-    PathDoesNotExist(PathBuf),
-    /// File already exists, this is a problem when we try to write into a new file.
-    #[fail(display = "File already exists: '{:?}'", _0)]
-    FileAlreadyExists(PathBuf),
+    /// Failed to open the directory.
+    #[error("Failed to open the directory: '{0:?}'")]
+    OpenDirectoryError(#[from] OpenDirectoryError),
+    /// Failed to open a file for read.
+    #[error("Failed to open file for read: '{0:?}'")]
+    OpenReadError(#[from] OpenReadError),
+    /// Failed to open a file for write.
+    #[error("Failed to open file for write: '{0:?}'")]
+    OpenWriteError(#[from] OpenWriteError),
     /// Index already exists in this directory
-    #[fail(display = "Index already exists")]
+    #[error("Index already exists")]
     IndexAlreadyExists,
     /// Failed to acquire file lock
-    #[fail(display = "Failed to acquire Lockfile: {:?}. {:?}", _0, _1)]
+    #[error("Failed to acquire Lockfile: {0:?}. {1:?}")]
     LockFailure(LockError, Option<String>),
     /// IO Error.
-    #[fail(display = "An IO error occurred: '{}'", _0)]
-    IOError(#[cause] IOError),
+    #[error("An IO error occurred: '{0}'")]
+    IOError(#[from] io::Error),
     /// Data corruption.
-    #[fail(display = "{:?}", _0)]
+    #[error("Data corrupted: '{0:?}'")]
     DataCorruption(DataCorruption),
     /// A thread holding the lock panicked and poisoned the lock.
-    #[fail(display = "A thread holding the lock panicked and poisoned the lock")]
+    #[error("A thread holding the lock panicked and poisoned the lock")]
     Poisoned,
     /// Invalid argument was passed by the user.
-    #[fail(display = "An invalid argument was passed: '{}'", _0)]
+    #[error("An invalid argument was passed: '{0}'")]
     InvalidArgument(String),
     /// An Error happened in one of the threads.
-    #[fail(display = "An error occurred in a thread: '{}'", _0)]
+    #[error("An error occurred in a thread: '{0}'")]
    ErrorInThread(String),
     /// An Error appeared related to the schema.
-    #[fail(display = "Schema error: '{}'", _0)]
+    #[error("Schema error: '{0}'")]
     SchemaError(String),
     /// System error. (e.g.: We failed spawning a new thread)
-    #[fail(display = "System error.'{}'", _0)]
+    #[error("System error.'{0}'")]
     SystemError(String),
     /// Index incompatible with current version of tantivy
-    #[fail(display = "{:?}", _0)]
+    #[error("{0:?}")]
     IncompatibleIndex(Incompatibility),
 }
@@ -89,31 +94,17 @@ impl From<DataCorruption> for TantivyError {
         TantivyError::DataCorruption(data_corruption)
     }
 }
 impl From<FastFieldNotAvailableError> for TantivyError {
     fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
         TantivyError::SchemaError(format!("{}", fastfield_error))
     }
 }
 impl From<LockError> for TantivyError {
     fn from(lock_error: LockError) -> TantivyError {
         TantivyError::LockFailure(lock_error, None)
     }
 }
-impl From<IOError> for TantivyError {
-    fn from(io_error: IOError) -> TantivyError {
-        TantivyError::IOError(io_error)
-    }
-}
-impl From<io::Error> for TantivyError {
-    fn from(io_error: io::Error) -> TantivyError {
-        TantivyError::IOError(io_error.into())
-    }
-}
 impl From<query::QueryParserError> for TantivyError {
     fn from(parsing_error: query::QueryParserError) -> TantivyError {
         TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
@@ -126,49 +117,12 @@ impl<Guard> From<PoisonError<Guard>> for TantivyError {
     }
 }
-impl From<OpenReadError> for TantivyError {
-    fn from(error: OpenReadError) -> TantivyError {
-        match error {
-            OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath),
-            OpenReadError::IOError(io_error) => TantivyError::IOError(io_error),
-            OpenReadError::IncompatibleIndex(incompatibility) => {
-                TantivyError::IncompatibleIndex(incompatibility)
-            }
-        }
-    }
-}
 impl From<schema::DocParsingError> for TantivyError {
     fn from(error: schema::DocParsingError) -> TantivyError {
         TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error))
     }
 }
-impl From<OpenWriteError> for TantivyError {
-    fn from(error: OpenWriteError) -> TantivyError {
-        match error {
-            OpenWriteError::FileAlreadyExists(filepath) => {
-                TantivyError::FileAlreadyExists(filepath)
-            }
-            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
-        }
-    }
-}
-impl From<OpenDirectoryError> for TantivyError {
-    fn from(error: OpenDirectoryError) -> TantivyError {
-        match error {
-            OpenDirectoryError::DoesNotExist(directory_path) => {
-                TantivyError::PathDoesNotExist(directory_path)
-            }
-            OpenDirectoryError::NotADirectory(directory_path) => {
-                TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
-            }
-            OpenDirectoryError::IoError(err) => TantivyError::IOError(IOError::from(err)),
-        }
-    }
-}
 impl From<serde_json::Error> for TantivyError {
     fn from(error: serde_json::Error) -> TantivyError {
         let io_err = io::Error::from(error);
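With `#[from]` on the variants, `thiserror` generates the `From` impls that were previously written by hand, so the `?` operator converts the directory errors and `io::Error` into `TantivyError` automatically. A minimal sketch (the `read_meta` helper is hypothetical):

    use std::fs::File;
    use std::io::Read;
    use std::path::Path;

    // The `?` after File::open uses the From<io::Error> for TantivyError
    // impl generated by `#[from]` on the IOError variant.
    fn read_meta(path: &Path) -> tantivy::Result<Vec<u8>> {
        let mut buf = Vec::new();
        File::open(path)?.read_to_end(&mut buf)?;
        Ok(buf)
    }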

View File

@@ -15,7 +15,7 @@ mod tests {
     let field = schema_builder.add_bytes_field("bytesfield");
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    let mut index_writer = index.writer_for_tests().unwrap();
     index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
     index_writer.add_document(doc!(field=>vec![]));
     index_writer.add_document(doc!(field=>vec![255u8]));

View File

@@ -4,8 +4,8 @@ use std::result;
 /// `FastFieldNotAvailableError` is returned when the
 /// user requested a fast field reader, and the field was not
 /// defined in the schema as a fast field.
-#[derive(Debug, Fail)]
-#[fail(display = "Fast field not available: '{:?}'", field_name)]
+#[derive(Debug, Error)]
+#[error("Fast field not available: '{field_name:?}'")]
 pub struct FastFieldNotAvailableError {
     field_name: String,
 }

View File

@@ -474,7 +474,7 @@ mod tests {
     let date_field = schema_builder.add_date_field("date", FAST);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    let mut index_writer = index.writer_for_tests().unwrap();
     index_writer.set_merge_policy(Box::new(NoMergePolicy));
     index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
     index_writer.commit().unwrap();
@@ -511,7 +511,7 @@ mod tests {
     );
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    let mut index_writer = index.writer_for_tests().unwrap();
     index_writer.set_merge_policy(Box::new(NoMergePolicy));
     index_writer.add_document(doc!(
         date_field => crate::DateTime::from_u64(1i64.to_u64()),

View File

@@ -25,7 +25,7 @@ mod tests {
     );
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    let mut index_writer = index.writer_for_tests().unwrap();
     index_writer.add_document(doc!(field=>1u64, field=>3u64));
     index_writer.add_document(doc!());
     index_writer.add_document(doc!(field=>4u64));
@@ -64,7 +64,7 @@ mod tests {
     schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    let mut index_writer = index.writer_for_tests().unwrap();
     let first_time_stamp = chrono::Utc::now();
     index_writer.add_document(
         doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
@@ -186,7 +186,7 @@ mod tests {
     );
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    let mut index_writer = index.writer_for_tests().unwrap();
     index_writer.add_document(doc!(field=> 1i64, field => 3i64));
     index_writer.add_document(doc!());
     index_writer.add_document(doc!(field=> -4i64));
@@ -221,7 +221,7 @@ mod tests {
     let field = schema_builder.add_facet_field("facetfield");
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
-    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    let mut index_writer = index.writer_for_tests().unwrap();
     for i in 0..100_000 {
         index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
     }

View File

@@ -74,7 +74,7 @@ mod tests {
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
     let mut index_writer = index
-        .writer_with_num_threads(1, 30_000_000)
+        .writer_for_tests()
         .expect("Failed to create index writer.");
     index_writer.add_document(doc!(
         facet_field => Facet::from("/category/cat2"),

View File

@@ -27,6 +27,7 @@ use crate::Opstamp;
 use crossbeam::channel;
 use futures::executor::block_on;
 use futures::future::Future;
+use slog::{error, info, Logger};
 use smallvec::smallvec;
 use smallvec::SmallVec;
 use std::mem;
@@ -195,20 +196,21 @@ fn index_documents(
     grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
     segment_updater: &mut SegmentUpdater,
     mut delete_cursor: DeleteCursor,
+    logger: &Logger,
 ) -> crate::Result<bool> {
     let schema = segment.schema();
+    info!(logger, "segment-index"; "stage"=>"start");
     let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
+    let mut buffer_limit_reached = false;
     for document_group in grouped_document_iterator {
         for doc in document_group {
             segment_writer.add_document(doc, &schema)?;
         }
         let mem_usage = segment_writer.mem_usage();
         if mem_usage >= memory_budget - MARGIN_IN_BYTES {
-            info!(
-                "Buffer limit reached, flushing segment with maxdoc={}.",
-                segment_writer.max_doc()
-            );
+            buffer_limit_reached = true;
             break;
         }
     }
@@ -228,6 +230,14 @@ fn index_documents(
     let segment_with_max_doc = segment.with_max_doc(max_doc);
     let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
+    info!(
+        logger,
+        "segment-index";
+        "stage" => "serialize",
+        "cause" => if buffer_limit_reached { "buffer-limit" } else { "commit" },
+        "maxdoc" => max_doc,
+        "last_docstamp" => last_docstamp
+    );
     let delete_bitset_opt = apply_deletes(
         &segment_with_max_doc,
@@ -241,7 +251,18 @@ fn index_documents(
         delete_cursor,
         delete_bitset_opt,
     );
+    info!(
+        logger,
+        "segment-index";
+        "stage" => "publish",
+    );
     block_on(segment_updater.schedule_add_segment(segment_entry))?;
+    info!(
+        logger,
+        "segment-index";
+        "stage" => "end",
+    );
     Ok(true)
 }
@@ -344,6 +365,10 @@ impl IndexWriter {
         Ok(index_writer)
     }
+    pub(crate) fn logger(&self) -> &Logger {
+        self.index.logger()
+    }
     fn drop_sender(&mut self) {
         let (sender, _receiver) = channel::bounded(1);
         self.operation_sender = sender;
@@ -352,6 +377,8 @@ impl IndexWriter {
     /// If there are some merging threads, blocks until they all finish their work and
     /// then drop the `IndexWriter`.
     pub fn wait_merging_threads(mut self) -> crate::Result<()> {
+        info!(self.logger(), "wait-merge-threads"; "stage"=>"start");
         // this will stop the indexing thread,
         // dropping the last reference to the segment_updater.
         self.drop_sender();
@@ -372,9 +399,9 @@ impl IndexWriter {
             .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
         if let Err(ref e) = result {
-            error!("Some merging thread failed {:?}", e);
+            error!(self.logger(), "some merge thread failed"; "cause"=>e.to_string());
         }
+        info!(self.logger(), "wait-merge-threads"; "stage"=>"stop");
         result
     }
@@ -434,12 +461,16 @@ impl IndexWriter {
             return Ok(());
         }
         let segment = index.new_segment();
+        let segment_id = segment.id();
         index_documents(
             mem_budget,
             segment,
             &mut document_iterator,
             &mut segment_updater,
             delete_cursor.clone(),
+            &index
+                .logger()
+                .new(slog::o!("segment"=>segment_id.to_string())),
        )?;
     }
 })?;
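Each indexing thread derives a per-segment child logger from the index's root logger, so every record it emits automatically carries the segment id. A hedged sketch of slog's child-logger pattern (the segment id string is a placeholder):

    use slog::{info, o, Discard, Logger};

    fn child_logger_demo() {
        let root = Logger::root(Discard, o!("component" => "index-writer"));
        // `new` creates a child logger whose key-value pairs are attached
        // to every record logged through it.
        let segment_logger = root.new(o!("segment" => "0f1e2d3c"));
        info!(segment_logger, "segment-index"; "stage" => "start");
    }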
@@ -553,7 +584,10 @@ impl IndexWriter {
/// ///
/// The opstamp at the last commit is returned. /// The opstamp at the last commit is returned.
pub fn rollback(&mut self) -> crate::Result<Opstamp> { pub fn rollback(&mut self) -> crate::Result<Opstamp> {
info!("Rolling back to opstamp {}", self.committed_opstamp); info!(
self.logger(),
"Rolling back to opstamp {}", self.committed_opstamp
);
// marks the segment updater as killed. From now on, all // marks the segment updater as killed. From now on, all
// segment updates will be ignored. // segment updates will be ignored.
self.segment_updater.kill(); self.segment_updater.kill();
@@ -610,6 +644,8 @@ impl IndexWriter {
/// using this API. /// using this API.
/// See [`PreparedCommit::set_payload()`](PreparedCommit.html) /// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> { pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
let logger = self.logger().clone();
// Here, because we join all of the worker threads, // Here, because we join all of the worker threads,
// all of the segment update for this commit have been // all of the segment update for this commit have been
// sent. // sent.
@@ -620,7 +656,10 @@ impl IndexWriter {
//
// This will move uncommitted segments to the state of
// committed segments.
- info!("Preparing commit");
+ let commit_opstamp = self.stamper.stamp();
+ info!(logger, "prepare-commit"; "opstamp" => commit_opstamp);
// this will drop the current document channel
// and recreate a new one.
@@ -636,9 +675,8 @@ impl IndexWriter {
self.add_indexing_worker()?;
}
- let commit_opstamp = self.stamper.stamp();
let prepared_commit = PreparedCommit::new(self, commit_opstamp);
- info!("Prepared commit {}", commit_opstamp);
+ info!(logger, "Prepared commit {}", commit_opstamp);
Ok(prepared_commit)
}
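The hunk above stamps the commit opstamp before the worker threads are recreated, so the `prepare-commit` event can be logged with its opstamp up front. For reference, a minimal sketch of how the two-phase commit API is driven from the caller's side; the schema, payload string, and memory budget are made up for illustration, and the `?` assumes a function returning `tantivy::Result<()>`:

    use tantivy::{doc, schema::{Schema, TEXT}, Index};

    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 50_000_000)?;
    writer.add_document(doc!(text => "hello"));
    // Phase 1: flush and stamp, but do not publish anything yet.
    let mut prepared = writer.prepare_commit()?;
    prepared.set_payload("checkpoint-42"); // stored alongside the commit in meta.json
    // Phase 2: make the stamped state durable and visible to readers.
    let _opstamp = prepared.commit()?;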
@@ -800,7 +838,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
- let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let index_writer = index.writer_for_tests().unwrap();
let operations = vec![
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
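`writer_for_tests()` (introduced alongside #880) replaces the `writer_with_num_threads(1, 3_000_000)` boilerplate throughout the test suite. Judging purely by the call sites it replaces, its body is presumably something like the following sketch; this is hypothetical, the diff does not show the helper itself:

    // Hypothetical shape of the test-only helper, inferred from the
    // call sites above: one indexing thread, a small test-sized budget.
    pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
        self.writer_with_num_threads(1, 3_000_000)
    }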
@@ -815,7 +853,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "hello1"));
index_writer.add_document(doc!(text_field => "hello2"));
assert!(index_writer.commit().is_ok());
@@ -864,7 +902,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
@@ -926,8 +964,8 @@ mod tests {
fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
- let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
- match index.writer_with_num_threads(1, 3_000_000) {
+ let _index_writer = index.writer_for_tests().unwrap();
+ match index.writer_for_tests() {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`"));
@@ -1261,7 +1299,7 @@ mod tests {
let idfield = schema_builder.add_text_field("id", STRING);
schema_builder.add_text_field("optfield", STRING);
let index = Index::create_in_ram(schema_builder.build());
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(idfield=>"myid"));
let commit = index_writer.commit();
assert!(commit.is_ok());
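Taken together, the index_writer changes above swap `log`'s free-form messages for `slog`'s structured records: every event gets a short name plus key-value pairs, and child loggers carry per-segment context. A minimal sketch of the consumer side, assuming the usual `slog-term` and `slog-async` drains (those crates are not part of this diff):

    use slog::{info, o, Drain};

    fn main() {
        // Build a root logger that renders key-value pairs to the terminal.
        let decorator = slog_term::TermDecorator::new().build();
        let drain = slog_term::FullFormat::new(decorator).build().fuse();
        let drain = slog_async::Async::new(drain).build().fuse();
        let root = slog::Logger::root(drain, o!("app" => "tantivy-example"));

        // Same shape as the calls added above: event name, then key-values.
        info!(root, "segment-index"; "stage" => "publish");

        // Child loggers inherit context, mirroring `index.logger().new(o!(...))`.
        let per_segment = root.new(o!("segment" => "000000"));
        info!(per_segment, "merge"; "stage" => "start");
    }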

View File

@@ -25,14 +25,14 @@ use std::cmp;
use std::collections::HashMap;
use std::sync::Arc;
- fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
+ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256];
for reader in readers {
if reader.has_deletes() {
// if there are deletes, then we use an approximation
// using the fieldnorm
- let fieldnorms_reader = reader.get_fieldnorms_reader(field);
+ let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1;
@@ -41,7 +41,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
- total_tokens
+ Ok(total_tokens
      + count
          .iter()
          .cloned()
@@ -49,7 +49,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
          .map(|(fieldnorm_ord, count)| {
              count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
          })
-         .sum::<u64>()
+         .sum::<u64>())
}
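The rewritten `compute_total_num_tokens` above only has exact counts for segments without deletes; with deletes it falls back to the fieldnorm histogram: bucket every live document by its 8-bit fieldnorm id, then decode each bucket back into a token count. A self-contained sketch of that approximation; the linear `id_to_fieldnorm` here is a stand-in, tantivy's real decoding table is non-linear:

    // Stand-in for FieldNormReader::id_to_fieldnorm: decodes a quantized
    // 8-bit fieldnorm id back to an (approximate) number of tokens.
    fn id_to_fieldnorm(id: u8) -> u64 {
        u64::from(id) // the real mapping is non-linear; identity keeps the sketch short
    }

    fn approximate_total_tokens(fieldnorm_ids_of_alive_docs: &[u8]) -> u64 {
        // Histogram of live documents per fieldnorm id, as in the diff above.
        let mut count = [0u64; 256];
        for &id in fieldnorm_ids_of_alive_docs {
            count[id as usize] += 1;
        }
        count
            .iter()
            .enumerate()
            .map(|(id, &n)| n * id_to_fieldnorm(id as u8))
            .sum()
    }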
pub struct IndexMerger {
@@ -175,7 +175,7 @@ impl IndexMerger {
for field in fields {
fieldnorms_data.clear();
for reader in &self.readers {
- let fieldnorms_reader = reader.get_fieldnorms_reader(field);
+ let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc_id in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id);
fieldnorms_data.push(fieldnorm_id);
@@ -541,7 +541,7 @@ impl IndexMerger {
// The total number of tokens will only be exact when there has been no deletes.
//
// Otherwise, we approximate by removing deleted documents proportionally.
- let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field);
+ let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
// Create the total list of doc ids
// by stacking the doc ids from the different segment.
@@ -751,7 +751,7 @@ mod tests {
};
{
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
{
// writing the segment
{
@@ -803,7 +803,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
}
@@ -904,7 +904,7 @@ mod tests {
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build());
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
let reader = index.reader().unwrap();
let search_term = |searcher: &Searcher, term: Term| {
let collector = FastFieldTestCollector::for_field(score_field);
@@ -1211,7 +1211,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
@@ -1276,7 +1276,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
@@ -1295,7 +1295,7 @@ mod tests {
// Deleting one term
{
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
@@ -1320,7 +1320,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build());
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
@@ -1349,7 +1349,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
let mut doc = Document::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone());
@@ -1388,7 +1388,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default();
for &val in int_vals {
@@ -1462,7 +1462,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -1516,7 +1516,7 @@ mod tests {
let index = Index::create_in_ram(builder.build());
- let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
+ let mut writer = index.writer_for_tests()?;
// Make sure we'll attempt to merge every created segment
let mut policy = crate::indexer::LogMergePolicy::default();
@@ -1548,7 +1548,7 @@ mod tests {
let mut builder = schema::SchemaBuilder::new();
let text = builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(builder.build());
- let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
+ let mut writer = index.writer_for_tests()?;
let happy_term = Term::from_field_text(text, "happy");
let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs);
for _ in 0..62 {

View File

@@ -29,8 +29,9 @@ pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;
+ #[cfg(feature = "mmap")]
#[cfg(test)]
- mod tests {
+ mod tests_mmap {
use crate::schema::{self, Schema};
use crate::{Index, Term};
@@ -39,7 +40,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b"));

View File

@@ -1,6 +1,7 @@
use super::IndexWriter;
use crate::Opstamp;
use futures::executor::block_on;
+ use slog::info;
/// A prepared commit
pub struct PreparedCommit<'a> {
@@ -31,7 +32,7 @@ impl<'a> PreparedCommit<'a> {
}
pub fn commit(self) -> crate::Result<Opstamp> {
- info!("committing {}", self.opstamp);
+ info!(self.index_writer.logger(), "committing {}", self.opstamp);
let _ = block_on(
self.index_writer
.segment_updater()

View File

@@ -1,3 +1,5 @@
+ use slog::{warn, Logger};
use super::segment_register::SegmentRegister;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
@@ -42,9 +44,9 @@ impl SegmentRegisters {
///
/// It guarantees the atomicity of the
/// changes (merges especially)
- #[derive(Default)]
pub struct SegmentManager {
registers: RwLock<SegmentRegisters>,
+ logger: Logger,
}
impl Debug for SegmentManager {
@@ -77,12 +79,14 @@ impl SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
delete_cursor: &DeleteCursor,
+ logger: Logger,
) -> SegmentManager {
SegmentManager {
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor),
}),
+ logger,
}
}
@@ -186,7 +190,7 @@ impl SegmentManager {
let segments_status = registers_lock
.segments_status(before_merge_segment_ids)
.ok_or_else(|| {
- warn!("couldn't find segment in SegmentManager");
+ warn!(self.logger, "couldn't find segment in SegmentManager");
crate::TantivyError::InvalidArgument(
"The segments that were merged could not be found in the SegmentManager. \
This is not necessarily a bug, and can happen after a rollback for instance."
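Note that adding the `logger` field costs `SegmentManager` its `#[derive(Default)]`; every constructor now has to be handed a `Logger`. When no real sink is wanted (tests, benchmarks), slog's `Discard` drain is the usual stand-in for the old `Default` behaviour (a sketch, not code from this diff):

    use slog::{o, Discard, Logger};

    // A no-op logger: accepts every record and drops it.
    let noop_logger = Logger::root(Discard, o!());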

View File

@@ -23,9 +23,9 @@ use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::Future;
use futures::future::TryFutureExt;
+ use slog::{debug, error, info, warn};
use std::borrow::BorrowMut;
use std::collections::HashSet;
- use std::io::Write;
use std::ops::Deref;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
@@ -65,12 +65,11 @@ pub fn save_new_metas(schema: Schema, directory: &mut dyn Directory) -> crate::R
///
/// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &mut dyn Directory) -> crate::Result<()> {
- info!("save metas");
- let mut buffer = serde_json::to_vec_pretty(metas)?;
+ let mut meta_json = serde_json::to_string_pretty(metas)?;
// Just adding a new line at the end of the buffer.
- writeln!(&mut buffer)?;
- directory.atomic_write(&META_FILEPATH, &buffer[..])?;
- debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
+ meta_json.push_str("\n");
+ debug!(directory.logger(), "save meta"; "content"=>&meta_json);
+ directory.atomic_write(&META_FILEPATH, meta_json.as_bytes())?;
Ok(())
}
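The `atomic_write` call above is what makes the newline-terminated `meta.json` safe to replace in place. The underlying pattern (related to #878, which moved the temp file into the directory itself) is write-to-temp-then-rename; a generic sketch using only std, not tantivy's actual implementation:

    use std::fs::{self, File};
    use std::io::Write;
    use std::path::Path;

    // Sketch of the write-then-rename pattern behind `atomic_write`:
    // the temp file lives next to the target so the final rename stays
    // on one filesystem and is atomic.
    fn atomic_write(path: &Path, payload: &[u8]) -> std::io::Result<()> {
        let tmp_path = path.with_extension("tmp");
        let mut tmp = File::create(&tmp_path)?;
        tmp.write_all(payload)?;
        tmp.sync_all()?; // flush to disk before the rename makes it visible
        fs::rename(&tmp_path, path)?;
        Ok(())
    }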
@@ -97,7 +96,6 @@ impl Deref for SegmentUpdater {
async fn garbage_collect_files(
segment_updater: SegmentUpdater,
) -> crate::Result<GarbageCollectionResult> {
- info!("Running garbage collection");
let mut index = segment_updater.index.clone();
index
.directory_mut()
@@ -107,14 +105,12 @@ async fn garbage_collect_files(
/// Merges a list of segments the list of segment givens in the `segment_entries`.
/// This function happens in the calling thread and is computationally expensive.
fn merge(
+ merged_segment: Segment,
index: &Index,
mut segment_entries: Vec<SegmentEntry>,
target_opstamp: Opstamp,
) -> crate::Result<SegmentEntry> {
- // first we need to apply deletes to our segment.
- let merged_segment = index.new_segment();
- // First we apply all of the delet to the merged segment, up to the target opstamp.
+ // First we apply all of the delete to the merged segment, up to the target opstamp.
for segment_entry in &mut segment_entries {
let segment = index.segment(segment_entry.meta().clone());
advance_deletes(segment, segment_entry, target_opstamp)?;
@@ -167,7 +163,8 @@ impl SegmentUpdater {
delete_cursor: &DeleteCursor,
) -> crate::Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
- let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
+ let segment_manager =
+     SegmentManager::from_segments(segments, delete_cursor, index.logger().clone());
let pool = ThreadPoolBuilder::new()
.name_prefix("segment_updater")
.pool_size(1)
@@ -387,7 +384,18 @@ impl SegmentUpdater {
.segment_manager
.start_merge(merge_operation.segment_ids())?;
- info!("Starting merge - {:?}", merge_operation.segment_ids());
+ let segment_ids_str: String = merge_operation
+     .segment_ids()
+     .iter()
+     .map(|segment_id| segment_id.to_string())
+     .collect::<Vec<String>>()
+     .join(",");
+ let merged_segment = self.index.new_segment();
+ let logger = self.index.logger().new(slog::o!("segments"=>segment_ids_str, "merged-segment"=>merged_segment.id().to_string()));
+ let num_merges: usize = self.merge_operations.list().len();
+ slog::info!(&logger, "merge"; "stage"=>"start", "num-merges" => num_merges);
let (merging_future_send, merging_future_recv) =
oneshot::channel::<crate::Result<SegmentMeta>>();
@@ -398,22 +406,20 @@ impl SegmentUpdater {
// as well as which segment is currently in merge and therefore should not be
// candidate for another merge.
match merge(
+ merged_segment,
&segment_updater.index,
segment_entries,
merge_operation.target_opstamp(),
) {
Ok(after_merge_segment_entry) => {
+ info!(&logger, "merge"; "stage" => "end");
let segment_meta = segment_updater
.end_merge(merge_operation, after_merge_segment_entry)
.await;
let _send_result = merging_future_send.send(segment_meta);
}
Err(e) => {
- warn!(
-     "Merge of {:?} was cancelled: {:?}",
-     merge_operation.segment_ids().to_vec(),
-     e
- );
+ error!(&logger, "merge"; "stage" => "fail", "cause"=>e.to_string());
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
@@ -454,11 +460,12 @@ impl SegmentUpdater {
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
+ let logger = self.index.logger();
for merge_operation in merge_candidates {
if let Err(err) = self.start_merge(merge_operation) {
warn!(
- "Starting the merge failed for the following reason. This is not fatal. {}",
- err
+ logger,
+ "merge-start-fail (not fatal, not necessarily a problem)"; "reason" => format!("{}", err),
);
}
}
@@ -471,8 +478,11 @@ impl SegmentUpdater {
) -> impl Future<Output = crate::Result<SegmentMeta>> {
let segment_updater = self.clone();
let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
+ let logger = self.index.logger().new(
+     slog::o!("segment"=>after_merge_segment_meta.id().to_string(),
+     "delete-opstamp"=>after_merge_segment_meta.delete_opstamp()),
+ );
let end_merge_future = self.schedule_future(async move {
- info!("End merge {:?}", after_merge_segment_entry.meta());
{
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
@@ -486,6 +496,7 @@ impl SegmentUpdater {
committed_opstamp,
) {
error!(
+ logger,
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
@@ -555,7 +566,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{
@@ -608,7 +619,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
{
for _ in 0..100 {
@@ -679,7 +690,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
{
for _ in 0..100 {

View File

@@ -1,5 +1,4 @@
use super::operation::AddOperation;
- use crate::core::Segment;
use crate::core::SerializableSegment;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
@@ -15,9 +14,9 @@ use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp; use crate::Opstamp;
use crate::{core::Segment, tokenizer::MAX_TOKEN_LEN};
use crate::{DocId, SegmentComponent}; use crate::{DocId, SegmentComponent};
use std::io; use std::io;
use std::str;
/// Computes the initial size of the hash table. /// Computes the initial size of the hash table.
/// ///
@@ -48,6 +47,7 @@ pub struct SegmentWriter {
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<TextAnalyzer>>,
+ term_buffer: Term,
}
impl SegmentWriter {
@@ -91,6 +91,7 @@ impl SegmentWriter {
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
tokenizers,
+ term_buffer: Term::new(),
})
}
@@ -128,24 +129,29 @@ impl SegmentWriter {
if !field_options.is_indexed() {
continue;
}
+ let (term_buffer, multifield_postings) =
+     (&mut self.term_buffer, &mut self.multifield_postings);
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
- let facets: Vec<&str> = field_values
-     .iter()
-     .flat_map(|field_value| match *field_value.value() {
-         Value::Facet(ref facet) => Some(facet.encoded_str()),
-         _ => {
-             panic!("Expected hierarchical facet");
-         }
-     })
-     .collect();
- let mut term = Term::for_field(field); // we set the Term
+ term_buffer.set_field(field);
+ let facets = field_values
+     .iter()
+     .flat_map(|field_value| match *field_value.value() {
+         Value::Facet(ref facet) => Some(facet.encoded_str()),
+         _ => {
+             panic!("Expected hierarchical facet");
+         }
+     });
for fake_str in facets {
let mut unordered_term_id_opt = None;
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
- term.set_text(&token.text);
+ if token.text.len() > MAX_TOKEN_LEN {
+     return;
+ }
+ term_buffer.set_text(&token.text);
let unordered_term_id =
- self.multifield_postings.subscribe(doc_id, &term);
+ multifield_postings.subscribe(doc_id, &term_buffer);
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
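The `MAX_TOKEN_LEN` check above is a hard backstop inside the writer: oversized facet tokens are now skipped instead of being indexed. At the analyzer level the same concern is usually handled with tantivy's `RemoveLongFilter`; a sketch with an arbitrary 40-byte limit:

    use tantivy::tokenizer::{RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

    // Drop tokens longer than 40 bytes before they ever reach the segment writer.
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40));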
@@ -168,7 +174,6 @@ impl SegmentWriter {
if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to;
}
-
token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into());
}
@@ -178,7 +183,6 @@ impl SegmentWriter {
{
offsets.push(total_offset);
total_offset += text.len();
-
token_streams.push(tokenizer.token_stream(text));
}
}
@@ -190,8 +194,12 @@ impl SegmentWriter {
0
} else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
- self.multifield_postings
-     .index_text(doc_id, field, &mut token_stream)
+ multifield_postings.index_text(
+     doc_id,
+     field,
+     &mut token_stream,
+     term_buffer,
+ )
};
self.fieldnorms_writer.record(doc_id, field, num_tokens);
@@ -199,44 +207,36 @@ impl SegmentWriter {
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
- let term = Term::from_field_u64(
-     field_value.field(),
-     field_value.value().u64_value(),
- );
- self.multifield_postings.subscribe(doc_id, &term);
+ term_buffer.set_field(field_value.field());
+ term_buffer.set_u64(field_value.value().u64_value());
+ multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
- let term = Term::from_field_i64(
-     field_value.field(),
-     field_value.value().date_value().timestamp(),
- );
- self.multifield_postings.subscribe(doc_id, &term);
+ term_buffer.set_field(field_value.field());
+ term_buffer.set_i64(field_value.value().date_value().timestamp());
+ multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
- let term = Term::from_field_i64(
-     field_value.field(),
-     field_value.value().i64_value(),
- );
- self.multifield_postings.subscribe(doc_id, &term);
+ term_buffer.set_field(field_value.field());
+ term_buffer.set_i64(field_value.value().i64_value());
+ multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::F64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
- let term = Term::from_field_f64(
-     field_value.field(),
-     field_value.value().f64_value(),
- );
- self.multifield_postings.subscribe(doc_id, &term);
+ term_buffer.set_field(field_value.field());
+ term_buffer.set_f64(field_value.value().f64_value());
+ multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
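All four numeric branches now share one pattern: instead of allocating a fresh `Term` per value (#881), the writer rewrites a single `term_buffer` in place. Reduced to its essentials, the pattern looks like this sketch, where `field`, `doc_id`, and `multifield_postings` are stand-ins for the surrounding context:

    // One reusable buffer per SegmentWriter...
    let mut term_buffer = Term::new();
    // ...rewritten in place for every value of every document.
    term_buffer.set_field(field);
    for value in [1u64, 2, 3].iter().copied() {
        term_buffer.set_u64(value);
        multifield_postings.subscribe(doc_id, &term_buffer);
    }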

View File

@@ -102,10 +102,7 @@
extern crate serde_json;
#[macro_use]
- extern crate log;
- #[macro_use]
- extern crate failure;
+ extern crate thiserror;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
@@ -148,6 +145,7 @@ pub mod schema;
pub mod space_usage;
pub mod store;
pub mod termdict;
+ pub use slog;
mod reader;
@@ -245,18 +243,10 @@ pub type DocId = u32;
/// with opstamp `n+1`.
pub type Opstamp = u64;
- /// A Score that represents the relevance of the document to the query
- ///
- /// This is modelled internally as a `f64`, because tantivy was compiled with the `scoref64`
- /// feature. The larger the number, the more relevant the document is to the search query.
- #[cfg(feature = "scoref64")]
- pub type Score = f64;
/// A Score that represents the relevance of the document to the query
///
/// This is modelled internally as a `f32`. The larger the number, the more relevant
/// the document to the search query.
- #[cfg(not(feature = "scoref64"))]
pub type Score = f32;
/// A `SegmentLocalId` identifies a segment.
@@ -296,7 +286,6 @@ mod tests {
use crate::schema::*;
use crate::DocAddress;
use crate::Index;
- use crate::IndexWriter;
use crate::Postings;
use crate::ReloadPolicy;
use rand::distributions::Bernoulli;
@@ -361,14 +350,14 @@ mod tests {
#[test]
#[cfg(feature = "mmap")]
- fn test_indexing() {
+ fn test_indexing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
{
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
@@ -383,29 +372,30 @@ mod tests {
}
assert!(index_writer.commit().is_ok());
}
+ Ok(())
}
#[test]
- fn test_docfreq1() {
+ fn test_docfreq1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
{
index_writer.add_document(doc!(text_field=>"a b c"));
- index_writer.commit().unwrap();
+ index_writer.commit()?;
}
{
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"));
- index_writer.commit().unwrap();
+ index_writer.commit()?;
}
{
index_writer.add_document(doc!(text_field=>"c"));
- index_writer.commit().unwrap();
+ index_writer.commit()?;
}
{
- let reader = index.reader().unwrap();
+ let reader = index.reader()?;
let searcher = reader.searcher();
let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3);
@@ -416,67 +406,50 @@ mod tests {
let term_d = Term::from_field_text(text_field, "d");
assert_eq!(searcher.doc_freq(&term_d), 0);
}
+ Ok(())
}
#[test]
- fn test_fieldnorm_no_docs_with_field() {
+ fn test_fieldnorm_no_docs_with_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
+ let mut index_writer = index.writer_for_tests()?;
+ index_writer.add_document(doc!(text_field=>"a b c"));
+ index_writer.commit()?;
+ let index_reader = index.reader()?;
+ let searcher = index_reader.searcher();
+ let reader = searcher.segment_reader(0);
{
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
- {
-     let doc = doc!(text_field=>"a b c");
-     index_writer.add_document(doc);
- }
- index_writer.commit().unwrap();
+ let fieldnorm_reader = reader.get_fieldnorms_reader(text_field)?;
+ assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
}
{
- let index_reader = index.reader().unwrap();
- let searcher = index_reader.searcher();
- let reader = searcher.segment_reader(0);
- {
-     let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
-     assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
- }
- {
-     let fieldnorm_reader = reader.get_fieldnorms_reader(title_field);
-     assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
- }
+ let fieldnorm_reader = reader.get_fieldnorms_reader(title_field)?;
+ assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
}
+ Ok(())
}
#[test]
- fn test_fieldnorm() {
+ fn test_fieldnorm() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
- {
-     let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-     {
-         let doc = doc!(text_field=>"a b c");
-         index_writer.add_document(doc);
-     }
-     {
-         let doc = doc!();
-         index_writer.add_document(doc);
-     }
-     {
-         let doc = doc!(text_field=>"a b");
-         index_writer.add_document(doc);
-     }
-     index_writer.commit().unwrap();
- }
- {
-     let reader = index.reader().unwrap();
-     let searcher = reader.searcher();
-     let segment_reader: &SegmentReader = searcher.segment_reader(0);
-     let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
-     assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
-     assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
-     assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
- }
+ let mut index_writer = index.writer_for_tests()?;
+ index_writer.add_document(doc!(text_field=>"a b c"));
+ index_writer.add_document(doc!());
+ index_writer.add_document(doc!(text_field=>"a b"));
+ index_writer.commit()?;
+ let reader = index.reader()?;
+ let searcher = reader.searcher();
+ let segment_reader: &SegmentReader = searcher.segment_reader(0);
+ let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?;
+ assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
+ assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
+ assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
+ Ok(())
}
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
@@ -491,7 +464,7 @@ mod tests {
}
#[test]
- fn test_delete_postings1() {
+ fn test_delete_postings1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let term_abcd = Term::from_field_text(text_field, "abcd");
@@ -507,7 +480,7 @@ mod tests {
.unwrap();
{
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
@@ -523,10 +496,10 @@ mod tests {
index_writer.add_document(doc!(text_field=>" b c"));
// 5
index_writer.add_document(doc!(text_field=>" a"));
- index_writer.commit().unwrap();
+ index_writer.commit()?;
}
{
- reader.reload().unwrap();
+ reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(text_field);
@@ -554,15 +527,15 @@ mod tests {
}
{
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
index_writer.delete_term(Term::from_field_text(text_field, "c"));
- index_writer.rollback().unwrap();
+ index_writer.rollback()?;
}
{
- reader.reload().unwrap();
+ reader.reload()?;
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inverted_index = seg_reader.inverted_index(term_abcd.field());
@@ -591,15 +564,15 @@ mod tests {
}
{
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
- index_writer.rollback().unwrap();
+ index_writer.rollback()?;
index_writer.delete_term(Term::from_field_text(text_field, "a"));
- index_writer.commit().unwrap();
+ index_writer.commit()?;
}
{
- reader.reload().unwrap();
+ reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(term_abcd.field());
@@ -631,19 +604,20 @@ mod tests {
assert!(!advance_undeleted(&mut postings, segment_reader));
}
}
+ Ok(())
}
#[test]
- fn test_indexed_u64() {
+ fn test_indexed_u64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64));
- index_writer.commit().unwrap();
- let reader = index.reader().unwrap();
+ index_writer.commit()?;
+ let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
@@ -653,20 +627,21 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
+ Ok(())
}
#[test]
- fn test_indexed_i64() {
+ fn test_indexed_i64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_i64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val));
- index_writer.commit().unwrap();
- let reader = index.reader().unwrap();
+ index_writer.commit()?;
+ let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
@@ -676,20 +651,21 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
+ Ok(())
}
#[test]
- fn test_indexed_f64() {
+ fn test_indexed_f64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_f64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val));
- index_writer.commit().unwrap();
- let reader = index.reader().unwrap();
+ index_writer.commit()?;
+ let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_f64(value_field, val);
let mut postings = searcher
@@ -699,26 +675,29 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
+ Ok(())
}
#[test]
- fn test_indexedfield_not_in_documents() {
+ fn test_indexedfield_not_in_documents() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
- let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok());
- let reader = index.reader().unwrap();
+ let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
- segment_reader.inverted_index(absent_field); //< should not panic
+ let inverted_index = segment_reader.inverted_index(absent_field); //< should not panic
+ assert_eq!(inverted_index.terms().num_terms(), 0);
+ Ok(())
}
#[test]
- fn test_delete_postings2() {
+ fn test_delete_postings2() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -726,53 +705,40 @@ mod tests {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
- .try_into()
- .unwrap();
+ .try_into()?;
// writing the segment
- let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
- let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
-     let doc = doc!(text_field=>val);
-     index_writer.add_document(doc);
- };
- let remove_document = |index_writer: &mut IndexWriter, val: &'static str| {
-     let delterm = Term::from_field_text(text_field, val);
-     index_writer.delete_term(delterm);
- };
- add_document(&mut index_writer, "63");
- add_document(&mut index_writer, "70");
- add_document(&mut index_writer, "34");
- add_document(&mut index_writer, "1");
- add_document(&mut index_writer, "38");
- add_document(&mut index_writer, "33");
- add_document(&mut index_writer, "40");
- add_document(&mut index_writer, "17");
- remove_document(&mut index_writer, "38");
- remove_document(&mut index_writer, "34");
- index_writer.commit().unwrap();
- reader.reload().unwrap();
- let searcher = reader.searcher();
- assert_eq!(searcher.num_docs(), 6);
+ let mut index_writer = index.writer_for_tests()?;
+ index_writer.add_document(doc!(text_field=>"63"));
+ index_writer.add_document(doc!(text_field=>"70"));
+ index_writer.add_document(doc!(text_field=>"34"));
+ index_writer.add_document(doc!(text_field=>"1"));
+ index_writer.add_document(doc!(text_field=>"38"));
+ index_writer.add_document(doc!(text_field=>"33"));
+ index_writer.add_document(doc!(text_field=>"40"));
+ index_writer.add_document(doc!(text_field=>"17"));
+ index_writer.delete_term(Term::from_field_text(text_field, "38"));
+ index_writer.delete_term(Term::from_field_text(text_field, "34"));
+ index_writer.commit()?;
+ reader.reload()?;
+ assert_eq!(reader.searcher().num_docs(), 6);
+ Ok(())
}
#[test]
- fn test_termfreq() {
+ fn test_termfreq() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
- let doc = doc!(text_field=>"af af af bc bc");
- index_writer.add_document(doc);
- index_writer.commit().unwrap();
+ let mut index_writer = index.writer_for_tests()?;
+ index_writer.add_document(doc!(text_field=>"af af af bc bc"));
+ index_writer.commit()?;
}
{
- let index_reader = index.reader().unwrap();
+ let index_reader = index.reader()?;
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
@@ -788,63 +754,63 @@ mod tests {
assert_eq!(postings.term_freq(), 3);
assert_eq!(postings.advance(), TERMINATED);
}
+ Ok(())
}
#[test]
- fn test_searcher_1() {
+ fn test_searcher_1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
- let reader = index.reader().unwrap();
+ let reader = index.reader()?;
- {
// writing the segment
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
- index_writer.commit().unwrap();
- }
- {
- reader.reload().unwrap();
+ index_writer.commit()?;
+ reader.reload()?;
let searcher = reader.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
- let topdocs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap();
- topdocs.docs().to_vec()
+ searcher
+     .search(&query, &TEST_COLLECTOR_WITH_SCORE)
+     .map(|topdocs| topdocs.docs().to_vec())
};
assert_eq!(
- get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
+ get_doc_ids(vec![Term::from_field_text(text_field, "a")])?,
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
- get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
+ get_doc_ids(vec![Term::from_field_text(text_field, "af")])?,
vec![DocAddress(0, 0)]
);
assert_eq!(
- get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
+ get_doc_ids(vec![Term::from_field_text(text_field, "b")])?,
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
- get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
+ get_doc_ids(vec![Term::from_field_text(text_field, "c")])?,
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
- get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
+ get_doc_ids(vec![Term::from_field_text(text_field, "d")])?,
vec![DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
- ]),
+ ])?,
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
+ Ok(())
- }
}
#[test] #[test]
fn test_searcher_2() { fn test_searcher_2() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -852,19 +818,17 @@ mod tests {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into() .try_into()?;
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0u64); assert_eq!(reader.searcher().num_docs(), 0u64);
{ // writing the segment
// writing the segment let mut index_writer = index.writer_for_tests()?;
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"af b")); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.add_document(doc!(text_field=>"a b c d")); index_writer.commit()?;
index_writer.commit().unwrap(); reader.reload()?;
}
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 3u64); assert_eq!(reader.searcher().num_docs(), 3u64);
Ok(())
} }
#[test] #[test]
@@ -886,7 +850,7 @@ mod tests {
} }
#[test] #[test]
fn test_wrong_fast_field_type() { fn test_wrong_fast_field_type() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST); let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST); let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
@@ -896,14 +860,14 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
{ {
let document = let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64); doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document); index_writer.add_document(document);
index_writer.commit().unwrap(); index_writer.commit()?;
} }
let reader = index.reader().unwrap(); let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0); let segment_reader: &SegmentReader = searcher.segment_reader(0);
{ {
@@ -942,11 +906,12 @@ mod tests {
let fast_field_reader = fast_field_reader_opt.unwrap(); let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4f64) assert_eq!(fast_field_reader.get(0), 4f64)
} }
Ok(())
} }
// motivated by #729 // motivated by #729
#[test] #[test]
fn test_update_via_delete_insert() { fn test_update_via_delete_insert() -> crate::Result<()> {
use crate::collector::Count; use crate::collector::Count;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::AllQuery; use crate::query::AllQuery;
@@ -960,17 +925,17 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let index_reader = index.reader().unwrap(); let index_reader = index.reader()?;
let mut index_writer = index.writer(3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
index_writer.add_document(doc!(id => doc_id)); index_writer.add_document(doc!(id => doc_id));
} }
index_writer.commit().unwrap(); index_writer.commit()?;
index_reader.reload().unwrap(); index_reader.reload()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
assert_eq!( assert_eq!(
@@ -981,12 +946,11 @@ mod tests {
// update the 10 elements by deleting and re-adding // update the 10 elements by deleting and re-adding
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
index_writer.delete_term(Term::from_field_u64(id, doc_id)); index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit().unwrap(); index_writer.commit()?;
index_reader.reload().unwrap(); index_reader.reload()?;
let doc = doc!(id => doc_id); index_writer.add_document(doc!(id => doc_id));
index_writer.add_document(doc); index_writer.commit()?;
index_writer.commit().unwrap(); index_reader.reload()?;
index_reader.reload().unwrap();
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
// The number of documents should be stable. // The number of documents should be stable.
assert_eq!( assert_eq!(
@@ -995,7 +959,7 @@ mod tests {
); );
} }
index_reader.reload().unwrap(); index_reader.reload()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
let segment_ids: Vec<SegmentId> = searcher let segment_ids: Vec<SegmentId> = searcher
.segment_readers() .segment_readers()
@@ -1004,12 +968,18 @@ mod tests {
.collect(); .collect();
block_on(index_writer.merge(&segment_ids)).unwrap(); block_on(index_writer.merge(&segment_ids)).unwrap();
index_reader.reload().unwrap(); index_reader.reload()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
assert_eq!(searcher.search(&AllQuery, &Count)?, DOC_COUNT as usize);
Ok(())
}
assert_eq!( #[test]
searcher.search(&AllQuery, &Count).unwrap(), fn test_validate_checksum() -> crate::Result<()> {
DOC_COUNT as usize let index_path = tempfile::tempdir().expect("dir");
); let schema = Schema::builder().build();
let index = Index::create_in_dir(&index_path, schema)?;
assert!(index.validate_checksum()?.is_empty());
Ok(())
} }
} }
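The recurring pattern in the test diffs above: tests now return `crate::Result<()>`, so `?` replaces `.unwrap()` chains, and `writer_for_tests()` replaces hand-tuned `writer_with_num_threads(1, 3_000_000)` calls. A minimal sketch of the same style from outside the crate, where `index.writer(..)` is the usual entry point (the memory budget here is arbitrary):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

#[test]
fn test_result_style() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // Inside tantivy's own tests this would be `index.writer_for_tests()?`.
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(text => "hello world"));
    writer.commit()?;
    assert_eq!(index.reader()?.searcher().num_docs(), 1u64);
    Ok(())
}
```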

View File

@@ -455,7 +455,7 @@ mod tests {
let int_field = schema_builder.add_u64_field("id", INDEXED); let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let mut last_doc = 0u32; let mut last_doc = 0u32;
for &doc in docs { for &doc in docs {
for _ in last_doc..doc { for _ in last_doc..doc {
@@ -496,7 +496,7 @@ mod tests {
let int_field = schema_builder.add_u64_field("id", INDEXED); let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// create two postings lists, one containing even numbers, // create two postings lists, one containing even numbers,
// the other containing odd numbers. // the other containing odd numbers.
for i in 0..6 { for i in 0..6 {

View File

@@ -310,6 +310,7 @@ pub mod tests {
mod bench { mod bench {
use super::*; use super::*;
use crate::TERMINATED;
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::Rng; use rand::Rng;
use rand::SeedableRng; use rand::SeedableRng;
@@ -340,7 +341,7 @@ mod bench {
let mut encoder = BlockEncoder::new(); let mut encoder = BlockEncoder::new();
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1); let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32); let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new(); let mut decoder = BlockDecoder::default();
b.iter(|| { b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32, num_bits); decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
}); });
@@ -375,9 +376,9 @@ mod bench {
let mut encoder = BlockEncoder::new(); let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001); let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32); let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new(); let mut decoder = BlockDecoder::default();
b.iter(|| { b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT, TERMINATED);
}); });
} }
} }
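The bench changes above swap `BlockDecoder::new()` for `BlockDecoder::default()` and pass the `TERMINATED` sentinel to `uncompress_vint_sorted`. A hypothetical sketch of the `new()`-to-`Default` move on a stand-in type (not tantivy's real `BlockDecoder`):

```rust
// `Decoder` is a stand-in: when a constructor takes no arguments and
// zero-initializes every field, deriving `Default` replaces a
// hand-written `new()`.
#[derive(Default)]
struct Decoder {
    buffer: Vec<u32>,
    len: usize,
}

fn main() {
    let decoder = Decoder::default(); // was: Decoder::new()
    assert_eq!(decoder.len, 0);
    assert!(decoder.buffer.is_empty());
}
```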

View File

@@ -91,7 +91,7 @@ pub mod tests {
let title = schema_builder.add_text_field("title", TEXT); let title = schema_builder.add_text_field("title", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(title => r#"abc abc abc"#)); index_writer.add_document(doc!(title => r#"abc abc abc"#));
index_writer.add_document(doc!(title => r#"abc be be be be abc"#)); index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
for _ in 0..1_000 { for _ in 0..1_000 {
@@ -176,7 +176,7 @@ pub mod tests {
.tokenizers() .tokenizers()
.register("simple_no_truncation", SimpleTokenizer); .register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
{ {
index_writer.add_document(doc!(text_field=>exceeding_token_text)); index_writer.add_document(doc!(text_field=>exceeding_token_text));
@@ -205,7 +205,7 @@ pub mod tests {
} }
#[test] #[test]
pub fn test_position_and_fieldnorm1() { pub fn test_position_and_fieldnorm1() -> crate::Result<()> {
let mut positions = Vec::new(); let mut positions = Vec::new();
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
@@ -217,42 +217,38 @@ pub mod tests {
let mut segment_writer = let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap(); SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
{ {
let mut doc = Document::default();
// checking that position works if the field has two values // checking that position works if the field has two values
doc.add_text(text_field, "a b a c a d a a.");
doc.add_text(text_field, "d d d d a");
let op = AddOperation { let op = AddOperation {
opstamp: 0u64, opstamp: 0u64,
document: doc, document: doc!(
text_field => "a b a c a d a a.",
text_field => "d d d d a"
),
}; };
segment_writer.add_document(op, &schema).unwrap(); segment_writer.add_document(op, &schema)?;
} }
{ {
let mut doc = Document::default();
doc.add_text(text_field, "b a");
let op = AddOperation { let op = AddOperation {
opstamp: 1u64, opstamp: 1u64,
document: doc, document: doc!(text_field => "b a"),
}; };
segment_writer.add_document(op, &schema).unwrap(); segment_writer.add_document(op, &schema).unwrap();
} }
for i in 2..1000 { for i in 2..1000 {
let mut doc = Document::default(); let mut text: String = iter::repeat("e ").take(i).collect();
let mut text = iter::repeat("e ").take(i).collect::<String>();
text.push_str(" a"); text.push_str(" a");
doc.add_text(text_field, &text);
let op = AddOperation { let op = AddOperation {
opstamp: 2u64, opstamp: 2u64,
document: doc, document: doc!(text_field => text),
}; };
segment_writer.add_document(op, &schema).unwrap(); segment_writer.add_document(op, &schema).unwrap();
} }
segment_writer.finalize().unwrap(); segment_writer.finalize()?;
} }
{ {
let segment_reader = SegmentReader::open(&segment).unwrap(); let segment_reader = SegmentReader::open(&segment)?;
{ {
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field); let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5); assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
assert_eq!(fieldnorm_reader.fieldnorm(1), 2); assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
for i in 2..1000 { for i in 2..1000 {
@@ -312,6 +308,7 @@ pub mod tests {
assert_eq!(postings_e.doc(), TERMINATED); assert_eq!(postings_e.doc(), TERMINATED);
} }
} }
Ok(())
} }
#[test] #[test]
@@ -322,7 +319,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "g b b d c g c")); index_writer.add_document(doc!(text_field => "g b b d c g c"));
index_writer.add_document(doc!(text_field => "g a b b a d c g c")); index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
@@ -354,7 +351,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for i in 0u64..num_docs as u64 { for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64); let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -425,7 +422,7 @@ pub mod tests {
// delete some of the documents // delete some of the documents
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.delete_term(term_0); index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -479,7 +476,7 @@ pub mod tests {
// delete everything else // delete everything else
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.delete_term(term_1); index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -522,7 +519,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000; let posting_list_size = 1_000_000;
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size { for _ in 0..posting_list_size {
let mut doc = Document::default(); let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) { if rng.gen_bool(1f64 / 15f64) {
@@ -730,7 +727,7 @@ mod bench {
let mut s = 0u32; let mut s = 0u32;
while segment_postings.doc() != TERMINATED { while segment_postings.doc() != TERMINATED {
s += (segment_postings.doc() & n) % 1024; s += (segment_postings.doc() & n) % 1024;
segment_postings.advance() segment_postings.advance();
} }
s s
}); });

View File

@@ -105,6 +105,7 @@ impl MultiFieldPostingsWriter {
doc: DocId, doc: DocId,
field: Field, field: Field,
token_stream: &mut dyn TokenStream, token_stream: &mut dyn TokenStream,
term_buffer: &mut Term,
) -> u32 { ) -> u32 {
let postings_writer = let postings_writer =
self.per_field_postings_writers[field.field_id() as usize].deref_mut(); self.per_field_postings_writers[field.field_id() as usize].deref_mut();
@@ -114,6 +115,7 @@ impl MultiFieldPostingsWriter {
field, field,
token_stream, token_stream,
&mut self.heap, &mut self.heap,
term_buffer,
) )
} }
@@ -220,21 +222,22 @@ pub trait PostingsWriter {
field: Field, field: Field,
token_stream: &mut dyn TokenStream, token_stream: &mut dyn TokenStream,
heap: &mut MemoryArena, heap: &mut MemoryArena,
term_buffer: &mut Term,
) -> u32 { ) -> u32 {
let mut term = Term::for_field(field); term_buffer.set_field(field);
let mut sink = |token: &Token| { let mut sink = |token: &Token| {
// We skip all tokens with a length greater than MAX_TOKEN_LEN. // We skip all tokens with a length greater than MAX_TOKEN_LEN.
if token.text.len() <= MAX_TOKEN_LEN { if token.text.len() > MAX_TOKEN_LEN {
term.set_text(token.text.as_str()); return;
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
} else {
info!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
MAX_TOKEN_LEN in the documentation for more information.",
token.text.len(),
MAX_TOKEN_LEN
);
} }
term_buffer.set_text(token.text.as_str());
self.subscribe(
term_index,
doc_id,
token.position as u32,
&term_buffer,
heap,
);
}; };
token_stream.process(&mut sink) token_stream.process(&mut sink)
} }
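The rewrite above threads a caller-owned `Term` buffer through the indexing path so a single buffer is reused for every token, and inverts the `MAX_TOKEN_LEN` check into an early return. A sketch of the same buffer-reuse shape on plain strings; the names and the length constant are illustrative, not tantivy's:

```rust
const MAX_TOKEN_LEN: usize = u16::MAX as usize; // illustrative value

fn index_tokens(tokens: &[&str], term_buffer: &mut String) -> u32 {
    let mut indexed = 0u32;
    for token in tokens {
        // Early bail-out keeps the happy path unindented, mirroring the
        // `if token.text.len() > MAX_TOKEN_LEN { return; }` rewrite.
        if token.len() > MAX_TOKEN_LEN {
            continue;
        }
        // Overwrite the shared buffer instead of allocating a new term.
        term_buffer.clear();
        term_buffer.push_str(token);
        indexed += 1;
    }
    indexed
}

fn main() {
    let mut buffer = String::new();
    assert_eq!(index_tokens(&["abc", "de"], &mut buffer), 2);
}
```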

View File

@@ -114,7 +114,7 @@ impl SegmentPostings {
.iter() .iter()
.map(|&fieldnorm| fieldnorm as u64) .map(|&fieldnorm| fieldnorm as u64)
.sum::<u64>(); .sum::<u64>();
total_num_tokens as Score / fieldnorms.len() as f32 total_num_tokens as Score / fieldnorms.len() as Score
}) })
.unwrap_or(0.0); .unwrap_or(0.0);
let mut postings_serializer = PostingsSerializer::new( let mut postings_serializer = PostingsSerializer::new(

View File

@@ -83,7 +83,7 @@ mod tests {
let field = schema_builder.add_text_field("text", TEXT); let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>"aaa")); index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb")); index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit().unwrap(); index_writer.commit().unwrap();

View File

@@ -5,7 +5,6 @@ use crate::query::{BitSetDocSet, Explanation};
use crate::query::{Scorer, Weight}; use crate::query::{Scorer, Weight};
use crate::schema::{Field, IndexRecordOption}; use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer}; use crate::termdict::{TermDictionary, TermStreamer};
use crate::Result;
use crate::TantivyError; use crate::TantivyError;
use crate::{DocId, Score}; use crate::{DocId, Score};
use std::sync::Arc; use std::sync::Arc;
@@ -40,7 +39,7 @@ impl<A> Weight for AutomatonWeight<A>
where where
A: Automaton + Send + Sync + 'static, A: Automaton + Send + Sync + 'static,
{ {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc(); let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc); let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field); let inverted_index = reader.inverted_index(self.field);
@@ -66,7 +65,7 @@ where
Ok(Box::new(const_scorer)) Ok(Box::new(const_scorer))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?; let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) == doc { if scorer.seek(doc) == doc {
Ok(Explanation::new("AutomatonScorer", 1.0)) Ok(Explanation::new("AutomatonScorer", 1.0))
@@ -91,7 +90,7 @@ mod tests {
let mut schema = Schema::builder(); let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING); let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build()); let index = Index::create_in_ram(schema.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title=>"abc")); index_writer.add_document(doc!(title=>"abc"));
index_writer.add_document(doc!(title=>"bcd")); index_writer.add_document(doc!(title=>"bcd"));
index_writer.add_document(doc!(title=>"abcd")); index_writer.add_document(doc!(title=>"abcd"));

View File

@@ -4,19 +4,6 @@ use crate::{DocId, DocSet, Score, TERMINATED};
use std::ops::Deref; use std::ops::Deref;
use std::ops::DerefMut; use std::ops::DerefMut;
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
/// Takes term_scorers sorted by their current doc() and a threshold, and /// Takes term_scorers sorted by their current doc() and a threshold, and
/// returns (before_pivot_len, pivot_len, pivot_doc) defined as follows: /// returns (before_pivot_len, pivot_len, pivot_doc) defined as follows:
/// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score. /// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score.
@@ -55,37 +42,12 @@ fn find_pivot_doc(
Some((before_pivot_len, pivot_len, pivot_doc)) Some((before_pivot_len, pivot_len, pivot_doc))
} }
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
// Before and after calling this method, scorers need to be sorted by their `.doc()`. // Before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer( fn block_max_was_too_low_advance_one_scorer(
scorers: &mut Vec<TermScorerWithMaxScore>, scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_len: usize, pivot_len: usize,
) { ) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
let mut scorer_to_seek = pivot_len - 1; let mut scorer_to_seek = pivot_len - 1;
let mut doc_to_seek_after = scorers[scorer_to_seek].doc(); let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
for scorer_ord in (0..pivot_len - 1).rev() { for scorer_ord in (0..pivot_len - 1).rev() {
@@ -102,6 +64,7 @@ fn block_max_was_too_low_advance_one_scorer(
} }
scorers[scorer_to_seek].seek(doc_to_seek_after + 1); scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
restore_ordering(scorers, scorer_to_seek); restore_ordering(scorers, scorer_to_seek);
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
} }
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted // Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
@@ -177,64 +140,99 @@ pub fn block_wand(
.map(TermScorerWithMaxScore::from) .map(TermScorerWithMaxScore::from)
.collect(); .collect();
scorers.sort_by_key(|scorer| scorer.doc()); scorers.sort_by_key(|scorer| scorer.doc());
loop { // At this point we need to ensure that the scorers are sorted!
// At this point we need to ensure that the scorers are sorted! debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
find_pivot_doc(&scorers[..], threshold)
{
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
if let Some((before_pivot_len, pivot_len, pivot_doc)) = debug_assert_ne!(pivot_doc, TERMINATED);
find_pivot_doc(&scorers[..], threshold) debug_assert!(before_pivot_len < pivot_len);
{
debug_assert_ne!(pivot_doc, TERMINATED);
debug_assert!(before_pivot_len < pivot_len);
let block_max_score_upperbound: Score = scorers[..pivot_len] let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut() .iter_mut()
.map(|scorer| { .map(|scorer| {
scorer.shallow_seek(pivot_doc); scorer.shallow_seek(pivot_doc);
scorer.block_max_score() scorer.block_max_score()
}) })
.sum(); .sum();
// Beware: after a shallow advance, the skip readers can be ahead of // Beware: after a shallow advance, the skip readers can be ahead of
// the segment posting lists. // the segment posting lists.
// //
// `block_segment_postings.load_block()` need to be called separately. // `block_segment_postings.load_block()` need to be called separately.
if block_max_score_upperbound <= threshold { if block_max_score_upperbound <= threshold {
// Block max condition was not reached // Block max condition was not reached
// We could get away with simply advancing the scorers to DocId + 1, but it would // We could get away with simply advancing the scorers to DocId + 1, but it would
// be inefficient. The optimization requires proper explanation and was // be inefficient. The optimization requires proper explanation and was
// isolated in a different function. // isolated in a different function.
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len); block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
continue; continue;
}
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
} else {
return;
} }
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
} }
} }
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
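The restructured loop above is Block-Max WAND: pick a pivot from scorers sorted by `doc()`, test the block-level score upper bound against the threshold, and only fully score when the bound survives. A toy illustration of just the threshold/callback contract (none of the real scorer machinery): the callback returns the new threshold, which is how a top-k consumer keeps raising the bar.

```rust
fn toy_wand(
    candidates: &[(u32, f32)],
    mut threshold: f32,
    callback: &mut dyn FnMut(u32, f32) -> f32,
) {
    for &(doc, score) in candidates {
        if score > threshold {
            // The callback may raise the threshold, pruning later docs.
            threshold = callback(doc, score);
        }
    }
}

fn main() {
    let k = 2;
    let mut best: Vec<(u32, f32)> = Vec::new();
    toy_wand(&[(1, 0.5), (2, 1.5), (3, 0.7), (4, 2.0)], 0.0, &mut |doc, score| {
        best.push((doc, score));
        best.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
        best.truncate(k);
        // New threshold: the k-th best score so far, or 0.0 until full.
        if best.len() == k { best[k - 1].1 } else { 0.0 }
    });
    assert_eq!(best.iter().map(|&(d, _)| d).collect::<Vec<_>>(), vec![4, 2]);
}
```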
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::query::score_combiner::SumCombiner; use crate::query::score_combiner::SumCombiner;
@@ -248,17 +246,21 @@ mod tests {
use std::iter; use std::iter;
struct Float(Score); struct Float(Score);
impl Eq for Float {} impl Eq for Float {}
impl PartialEq for Float { impl PartialEq for Float {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.cmp(&other) == Ordering::Equal self.cmp(&other) == Ordering::Equal
} }
} }
impl PartialOrd for Float { impl PartialOrd for Float {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl Ord for Float { impl Ord for Float {
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal) other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)

View File

@@ -32,7 +32,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
{ {
index_writer.add_document(doc!(text_field => "a b c")); index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c")); index_writer.add_document(doc!(text_field => "a c"));
@@ -224,7 +224,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c")); index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c")); index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c")); index_writer.add_document(doc!(text_field => "b c"));

View File

@@ -144,7 +144,7 @@ mod tests {
fn test_boost_query_explain() { fn test_boost_query_explain() {
let schema = Schema::builder().build(); let schema = Schema::builder().build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::new()); index_writer.add_document(Document::new());
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();

View File

@@ -177,7 +177,7 @@ mod test {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(
country_field => "japan", country_field => "japan",
)); ));

View File

@@ -24,7 +24,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for &text in texts { for &text in texts {
let doc = doc!(text_field=>text); let doc = doc!(text_field=>text);
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -135,7 +135,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"a b c"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -186,7 +186,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"b")); index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"b a")); index_writer.add_document(doc!(text_field=>"b a"));
@@ -217,7 +217,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b c d e f g h")); index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }

View File

@@ -9,8 +9,8 @@ use crate::query::Weight;
use crate::query::{EmptyScorer, Explanation}; use crate::query::{EmptyScorer, Explanation};
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::schema::Term; use crate::schema::Term;
use crate::Score;
use crate::{DocId, DocSet}; use crate::{DocId, DocSet};
use crate::{Result, Score};
pub struct PhraseWeight { pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>, phrase_terms: Vec<(usize, Term)>,
@@ -32,7 +32,7 @@ impl PhraseWeight {
} }
} }
fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader { fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
let field = self.phrase_terms[0].1.field(); let field = self.phrase_terms[0].1.field();
reader.get_fieldnorms_reader(field) reader.get_fieldnorms_reader(field)
} }
@@ -41,9 +41,9 @@ impl PhraseWeight {
&self, &self,
reader: &SegmentReader, reader: &SegmentReader,
boost: Score, boost: Score,
) -> Result<Option<PhraseScorer<SegmentPostings>>> { ) -> crate::Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight = self.similarity_weight.boost_by(boost); let similarity_weight = self.similarity_weight.boost_by(boost);
let fieldnorm_reader = self.fieldnorm_reader(reader); let fieldnorm_reader = self.fieldnorm_reader(reader)?;
if reader.has_deletes() { if reader.has_deletes() {
let mut term_postings_list = Vec::new(); let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms { for &(offset, ref term) in &self.phrase_terms {
@@ -85,7 +85,7 @@ impl PhraseWeight {
} }
impl Weight for PhraseWeight { impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? { if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer)) Ok(Box::new(scorer))
} else { } else {
@@ -93,7 +93,7 @@ impl Weight for PhraseWeight {
} }
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?; let scorer_opt = self.phrase_scorer(reader, 1.0)?;
if scorer_opt.is_none() { if scorer_opt.is_none() {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
@@ -102,7 +102,7 @@ impl Weight for PhraseWeight {
if scorer.seek(doc) != doc { if scorer.seek(doc) != doc {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
} }
let fieldnorm_reader = self.fieldnorm_reader(reader); let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count(); let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Scorer", scorer.score()); let mut explanation = Explanation::new("Phrase Scorer", scorer.score());

View File

@@ -40,7 +40,7 @@ use std::fmt;
/// ///
/// When implementing a new type of `Query`, it is normal to implement a /// When implementing a new type of `Query`, it is normal to implement a
/// dedicated `Query`, `Weight` and `Scorer`. /// dedicated `Query`, `Weight` and `Scorer`.
pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug { pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
/// Create the weight associated to a query. /// Create the weight associated to a query.
/// ///
/// If scoring is not required, setting `scoring_enabled` to `false` /// If scoring is not required, setting `scoring_enabled` to `false`
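Adding `Send + Sync` as supertraits of `Query` is what lets a boxed or `Arc`'d query cross thread boundaries. A self-contained sketch of why the bound matters; `MyQuery` and this trimmed-down trait are stand-ins, not tantivy types:

```rust
use std::fmt;
use std::sync::Arc;
use std::thread;

trait Query: Send + Sync + fmt::Debug {}

#[derive(Debug)]
struct MyQuery;
impl Query for MyQuery {}

fn main() {
    let query: Arc<dyn Query> = Arc::new(MyQuery);
    let handles: Vec<_> = (0..2)
        .map(|_| {
            let q = Arc::clone(&query);
            // Without `Send + Sync` on the trait, this spawn would not compile.
            thread::spawn(move || println!("{:?}", q))
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }
}
```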

View File

@@ -21,51 +21,48 @@ use std::str::FromStr;
use tantivy_query_grammar::{UserInputAST, UserInputBound, UserInputLeaf}; use tantivy_query_grammar::{UserInputAST, UserInputBound, UserInputLeaf};
/// Possible error that may happen when parsing a query. /// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq, Fail)] #[derive(Debug, PartialEq, Eq, Error)]
pub enum QueryParserError { pub enum QueryParserError {
/// Error in the query syntax /// Error in the query syntax
#[fail(display = "Syntax Error")] #[error("Syntax Error")]
SyntaxError, SyntaxError,
/// `FieldDoesNotExist(field_name: String)` /// `FieldDoesNotExist(field_name: String)`
/// The query references a field that is not in the schema /// The query references a field that is not in the schema
#[fail(display = "File does not exists: '{:?}'", _0)] #[error("File does not exists: '{0:?}'")]
FieldDoesNotExist(String), FieldDoesNotExist(String),
/// The query contains a term for a `u64` or `i64`-field, but the value /// The query contains a term for a `u64` or `i64`-field, but the value
/// is neither. /// is neither.
#[fail(display = "Expected a valid integer: '{:?}'", _0)] #[error("Expected a valid integer: '{0:?}'")]
ExpectedInt(ParseIntError), ExpectedInt(ParseIntError),
/// The query contains a term for a `f64`-field, but the value /// The query contains a term for a `f64`-field, but the value
/// is not a f64. /// is not a f64.
#[fail(display = "Invalid query: Only excluding terms given")] #[error("Invalid query: Only excluding terms given")]
ExpectedFloat(ParseFloatError), ExpectedFloat(ParseFloatError),
/// Queries that are only "excluding" (e.g. -title:pop) are forbidden. /// Queries that are only "excluding" (e.g. -title:pop) are forbidden.
#[fail(display = "Invalid query: Only excluding terms given")] #[error("Invalid query: Only excluding terms given")]
AllButQueryForbidden, AllButQueryForbidden,
/// If no default field is declared, running a query without any /// If no default field is declared, running a query without any
/// field specified is forbidden. /// field specified is forbidden.
#[fail(display = "No default field declared and no field specified in query")] #[error("No default field declared and no field specified in query")]
NoDefaultFieldDeclared, NoDefaultFieldDeclared,
/// The field searched for is not declared /// The field searched for is not declared
/// as indexed in the schema. /// as indexed in the schema.
#[fail(display = "The field '{:?}' is not declared as indexed", _0)] #[error("The field '{0:?}' is not declared as indexed")]
FieldNotIndexed(String), FieldNotIndexed(String),
/// A phrase query was requested for a field that does not /// A phrase query was requested for a field that does not
/// have any positions indexed. /// have any positions indexed.
#[fail(display = "The field '{:?}' does not have positions indexed", _0)] #[error("The field '{0:?}' does not have positions indexed")]
FieldDoesNotHavePositionsIndexed(String), FieldDoesNotHavePositionsIndexed(String),
/// The tokenizer for the given field is unknown /// The tokenizer for the given field is unknown
/// The two argument strings are the name of the field, the name of the tokenizer /// The two argument strings are the name of the field, the name of the tokenizer
#[fail( #[error("The tokenizer '{0:?}' for the field '{1:?}' is unknown")]
display = "The tokenizer '{:?}' for the field '{:?}' is unknown",
_0, _1
)]
UnknownTokenizer(String, String), UnknownTokenizer(String, String),
/// The query contains a range query with a phrase as one of the bounds. /// The query contains a range query with a phrase as one of the bounds.
/// Only terms can be used as bounds. /// Only terms can be used as bounds.
#[fail(display = "A range query cannot have a phrase as one of the bounds")] #[error("A range query cannot have a phrase as one of the bounds")]
RangeMustNotHavePhrase, RangeMustNotHavePhrase,
/// The format for the date field is not RFC 3339 compliant. /// The format for the date field is not RFC 3339 compliant.
#[fail(display = "The date field has an invalid format")] #[error("The date field has an invalid format")]
DateFormatError(chrono::ParseError), DateFormatError(chrono::ParseError),
} }
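The mechanical shape of the `failure`-to-`thiserror` migration above, reduced to a made-up error type (assumes `thiserror` in `Cargo.toml`): positional `_0` arguments become `{0}` placeholders and `std::error::Error` is derived directly.

```rust
use thiserror::Error;

#[derive(Debug, Error)]
enum ParseError {
    #[error("Syntax Error")]
    Syntax,
    // `{0}` refers to the first tuple field, replacing failure's `_0`.
    #[error("Field does not exist: '{0}'")]
    UnknownField(String),
}

fn main() {
    let err = ParseError::UnknownField("title".to_string());
    assert_eq!(err.to_string(), "Field does not exist: 'title'");
}
```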

View File

@@ -9,7 +9,6 @@ use crate::query::{Query, Scorer, Weight};
use crate::schema::Type; use crate::schema::Type;
use crate::schema::{Field, IndexRecordOption, Term}; use crate::schema::{Field, IndexRecordOption, Term};
use crate::termdict::{TermDictionary, TermStreamer}; use crate::termdict::{TermDictionary, TermStreamer};
use crate::Result;
use crate::{DocId, Score}; use crate::{DocId, Score};
use std::collections::Bound; use std::collections::Bound;
use std::ops::Range; use std::ops::Range;
@@ -48,7 +47,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// ///
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// for year in 1950u64..2017u64 { /// for year in 1950u64..2017u64 {
/// let num_docs_within_year = 10 + (year - 1950) * (year - 1950); /// let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
/// for _ in 0..num_docs_within_year { /// for _ in 0..num_docs_within_year {
@@ -246,7 +245,11 @@ impl RangeQuery {
} }
impl Query for RangeQuery { impl Query for RangeQuery {
fn weight(&self, searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<dyn Weight>> { fn weight(
&self,
searcher: &Searcher,
_scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> {
let schema = searcher.schema(); let schema = searcher.schema();
let value_type = schema.get_field_entry(self.field).field_type().value_type(); let value_type = schema.get_field_entry(self.field).field_type().value_type();
if value_type != self.value_type { if value_type != self.value_type {
@@ -289,7 +292,7 @@ impl RangeWeight {
} }
impl Weight for RangeWeight { impl Weight for RangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc(); let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc); let mut doc_bitset = BitSet::with_max_value(max_doc);
@@ -315,7 +318,7 @@ impl Weight for RangeWeight {
Ok(Box::new(ConstScorer::new(doc_bitset, boost))) Ok(Box::new(ConstScorer::new(doc_bitset, boost)))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?; let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc { if scorer.seek(doc) != doc {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
@@ -342,7 +345,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for year in 1950u64..2017u64 { for year in 1950u64..2017u64 {
let num_docs_within_year = 10 + (year - 1950) * (year - 1950); let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
for _ in 0..num_docs_within_year { for _ in 0..num_docs_within_year {
@@ -485,7 +488,7 @@ mod tests {
schema_builder.add_i64_field("year", INDEXED); schema_builder.add_i64_field("year", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; let mut index_writer = index.writer_for_tests()?;
let title = schema.get_field("title").unwrap(); let title = schema.get_field("title").unwrap();
let year = schema.get_field("year").unwrap(); let year = schema.get_field("year").unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(

View File

@@ -103,7 +103,7 @@ mod test {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(
country_field => "japan", country_field => "japan",
)); ));

View File

@@ -25,7 +25,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let doc = doc!(text_field => "a"); let doc = doc!(text_field => "a");
index_writer.add_document(doc); index_writer.add_document(doc);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
@@ -50,7 +50,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; let mut index_writer = index.writer_for_tests()?;
for _ in 0..COMPRESSION_BLOCK_SIZE { for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a"); let doc = doc!(text_field => "a");
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -86,7 +86,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(
left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde", left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde",
right_field => "right1 right2", right_field => "right1 right2",
@@ -136,7 +136,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 5_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"a c")); index_writer.add_document(doc!(text_field=>"a c"));
index_writer.delete_term(Term::from_field_text(text_field, "b")); index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -153,7 +153,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
index_writer.commit()?; index_writer.commit()?;

View File

@@ -4,11 +4,10 @@ use crate::docset::DocSet;
use crate::postings::SegmentPostings; use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight; use crate::query::bm25::BM25Weight;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer}; use crate::query::weight::for_each_scorer;
use crate::query::Weight; use crate::query::Weight;
use crate::query::{Explanation, Scorer}; use crate::query::{Explanation, Scorer};
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::Result;
use crate::Term; use crate::Term;
use crate::{DocId, Score}; use crate::{DocId, Score};
@@ -19,12 +18,12 @@ pub struct TermWeight {
} }
impl Weight for TermWeight { impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let term_scorer = self.specialized_scorer(reader, boost)?; let term_scorer = self.specialized_scorer(reader, boost)?;
Ok(Box::new(term_scorer)) Ok(Box::new(term_scorer))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.specialized_scorer(reader, 1.0)?; let mut scorer = self.specialized_scorer(reader, 1.0)?;
if scorer.seek(doc) != doc { if scorer.seek(doc) != doc {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
@@ -32,7 +31,7 @@ impl Weight for TermWeight {
Ok(scorer.explain()) Ok(scorer.explain())
} }
fn count(&self, reader: &SegmentReader) -> Result<u32> { fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
if let Some(delete_bitset) = reader.delete_bitset() { if let Some(delete_bitset) = reader.delete_bitset() {
Ok(self.scorer(reader, 1.0)?.count(delete_bitset)) Ok(self.scorer(reader, 1.0)?.count(delete_bitset))
} else { } else {
@@ -73,8 +72,8 @@ impl Weight for TermWeight {
reader: &SegmentReader, reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score, callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> { ) -> crate::Result<()> {
let mut scorer = self.scorer(reader, 1.0)?; let scorer = self.specialized_scorer(reader, 1.0)?;
for_each_pruning_scorer(&mut scorer, threshold, callback); crate::query::boolean_query::block_wand(vec![scorer], threshold, callback);
Ok(()) Ok(())
} }
} }
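The `for_each_pruning` change routes single-term queries through the same `block_wand` routine as boolean queries, with a one-element scorer vector. The user-visible path is an ordinary top-k search; a sketch using only public APIs (the writer budget is arbitrary):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::{IndexRecordOption, Schema, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(text => "rust search engine"));
    writer.add_document(doc!(text => "rust rust rust"));
    writer.commit()?;
    let searcher = index.reader()?.searcher();
    let query = TermQuery::new(
        Term::from_field_text(text, "rust"),
        IndexRecordOption::WithFreqs,
    );
    // TopDocs drives `for_each_pruning`, which now prunes via block_wand.
    let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
    assert_eq!(top_docs.len(), 1);
    Ok(())
}
```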
@@ -96,10 +95,10 @@ impl TermWeight {
&self, &self,
reader: &SegmentReader, reader: &SegmentReader,
boost: Score, boost: Score,
) -> Result<TermScorer> { ) -> crate::Result<TermScorer> {
let field = self.term.field(); let field = self.term.field();
let inverted_index = reader.inverted_index(field); let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field); let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
let similarity_weight = self.similarity_weight.boost_by(boost); let similarity_weight = self.similarity_weight.boost_by(boost);
let postings_opt: Option<SegmentPostings> = let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option); inverted_index.read_postings(&self.term, self.index_record_option);

View File

@@ -398,9 +398,9 @@ mod bench {
use crate::query::score_combiner::DoNothingCombiner; use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ConstScorer, Union, VecDocSet}; use crate::query::{ConstScorer, Union, VecDocSet};
use crate::tests;
use crate::DocId; use crate::DocId;
use crate::DocSet; use crate::DocSet;
use crate::{tests, TERMINATED};
use test::Bencher; use test::Bencher;
#[bench] #[bench]
@@ -414,10 +414,12 @@ mod bench {
union_docset union_docset
.iter() .iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone())) .map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new) .map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
); );
while v.advance() {} while v.doc() != TERMINATED {
v.advance();
}
}); });
} }
#[bench] #[bench]
@@ -432,10 +434,12 @@ mod bench {
union_docset union_docset
.iter() .iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone())) .map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new) .map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
); );
while v.advance() {} while v.doc() != TERMINATED {
v.advance();
}
}); });
} }
} }
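The bench fix above follows the newer `DocSet` contract: `advance()` returns the next `DocId` rather than a `bool`, and exhaustion is signaled by the `TERMINATED` sentinel. A stand-in sketch of that contract (not tantivy's `VecDocSet`):

```rust
const TERMINATED: u32 = u32::MAX; // sentinel, mirroring tantivy's

struct ToyDocSet {
    docs: Vec<u32>,
    cursor: usize,
}

impl ToyDocSet {
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn advance(&mut self) -> u32 {
        self.cursor += 1;
        self.doc()
    }
}

fn main() {
    let mut docset = ToyDocSet { docs: vec![1, 4, 7], cursor: 0 };
    // was: `while docset.advance() {}` under the old bool contract
    while docset.doc() != TERMINATED {
        println!("doc {}", docset.doc());
        docset.advance();
    }
}
```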

View File

@@ -1,5 +1,7 @@
mod pool; mod pool;
use slog::error;
pub use self::pool::LeasedItem; pub use self::pool::LeasedItem;
use self::pool::Pool; use self::pool::Pool;
use crate::core::Segment; use crate::core::Segment;
@@ -62,6 +64,7 @@ impl IndexReaderBuilder {
/// to open different segment readers. It may take hundreds of milliseconds /// to open different segment readers. It may take hundreds of milliseconds
/// and may return an error. /// and may return an error.
pub fn try_into(self) -> crate::Result<IndexReader> { pub fn try_into(self) -> crate::Result<IndexReader> {
let logger = self.index.logger().clone();
let inner_reader = InnerIndexReader { let inner_reader = InnerIndexReader {
index: self.index, index: self.index,
num_searchers: self.num_searchers, num_searchers: self.num_searchers,
@@ -80,8 +83,8 @@ impl IndexReaderBuilder {
let callback = move || { let callback = move || {
if let Err(err) = inner_reader_arc_clone.reload() { if let Err(err) = inner_reader_arc_clone.reload() {
error!( error!(
"Error while loading searcher after commit was detected. {:?}", logger,
err "Error while loading searcher after commit was detected. {:?}", err
); );
} }
}; };
@@ -138,9 +141,11 @@ impl InnerIndexReader {
.collect::<crate::Result<_>>()? .collect::<crate::Result<_>>()?
}; };
let schema = self.index.schema(); let schema = self.index.schema();
let searchers = (0..self.num_searchers) let searchers = std::iter::repeat_with(|| {
.map(|_| Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())) Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
.collect(); })
.take(self.num_searchers)
.collect();
self.searcher_pool.publish_new_generation(searchers); self.searcher_pool.publish_new_generation(searchers);
Ok(()) Ok(())
} }
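Two independent changes land in this file: `error!` gains the `slog` logger as its first argument, and searcher construction moves from `(0..n).map(|_| ..)` to `std::iter::repeat_with(..).take(n)`. The iterator swap in isolation:

```rust
fn main() {
    let num_searchers = 3;
    // `repeat_with` states the intent (n independently built values)
    // without the dummy index variable of `(0..n).map(|_| ..)`.
    let searchers: Vec<String> = std::iter::repeat_with(|| String::from("searcher"))
        .take(num_searchers)
        .collect();
    assert_eq!(searchers.len(), 3);
}
```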

View File

@@ -74,9 +74,8 @@ impl Document {
     }

     /// Add a text field.
-    pub fn add_text(&mut self, field: Field, text: &str) {
-        let value = Value::Str(String::from(text));
-        self.add(FieldValue::new(field, value));
+    pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
+        self.add(FieldValue::new(field, Value::Str(text.to_string())));
     }

     /// Add a pre-tokenized text field.
@@ -110,8 +109,8 @@ impl Document {
     }

     /// Add a bytes field
-    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
-        self.add(FieldValue::new(field, Value::Bytes(value)))
+    pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
+        self.add(FieldValue::new(field, Value::Bytes(value.into())))
     }

     /// Add a field value
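With the relaxed bounds, callers are no longer forced to hand over exactly a `&str` or a `Vec<u8>`. A hedged usage sketch, assuming the generic signatures land as shown above (field names are illustrative):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::Document;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let payload = schema_builder.add_bytes_field("payload");
    let _schema = schema_builder.build();

    let mut doc = Document::default();
    doc.add_text(title, "a borrowed str");                 // &str, as before
    doc.add_text(title, format!("an owned {}", "String")); // any ToString now works
    doc.add_bytes(payload, vec![1u8, 2, 3]);               // Vec<u8>, as before
    doc.add_bytes(payload, &b"raw"[..]);                   // &[u8] via Into<Vec<u8>>
}
```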

View File

@@ -1,5 +1,5 @@
-use crate::schema::IntOptions;
 use crate::schema::TextOptions;
+use crate::schema::{is_valid_field_name, IntOptions};
 use crate::schema::FieldType;
 use serde::de::{self, MapAccess, Visitor};
@@ -24,6 +24,7 @@ impl FieldEntry {
     /// Creates a new u64 field entry in the schema, given
     /// a name, and some options.
     pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
+        assert!(is_valid_field_name(&field_name));
         FieldEntry {
             name: field_name,
             field_type: FieldType::Str(text_options),
@@ -33,6 +34,7 @@ impl FieldEntry {
     /// Creates a new u64 field entry in the schema, given
     /// a name, and some options.
     pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
+        assert!(is_valid_field_name(&field_name));
         FieldEntry {
             name: field_name,
             field_type: FieldType::U64(field_type),
@@ -42,6 +44,7 @@ impl FieldEntry {
     /// Creates a new i64 field entry in the schema, given
     /// a name, and some options.
     pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
+        assert!(is_valid_field_name(&field_name));
         FieldEntry {
             name: field_name,
             field_type: FieldType::I64(field_type),
@@ -51,6 +54,7 @@ impl FieldEntry {
     /// Creates a new f64 field entry in the schema, given
     /// a name, and some options.
     pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
+        assert!(is_valid_field_name(&field_name));
         FieldEntry {
             name: field_name,
             field_type: FieldType::F64(field_type),
@@ -60,6 +64,7 @@ impl FieldEntry {
     /// Creates a new date field entry in the schema, given
     /// a name, and some options.
     pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
+        assert!(is_valid_field_name(&field_name));
        FieldEntry {
             name: field_name,
             field_type: FieldType::Date(field_type),
@@ -68,6 +73,7 @@ impl FieldEntry {
     /// Creates a field entry for a facet.
     pub fn new_facet(field_name: String) -> FieldEntry {
+        assert!(is_valid_field_name(&field_name));
         FieldEntry {
             name: field_name,
             field_type: FieldType::HierarchicalFacet,
@@ -76,6 +82,7 @@ impl FieldEntry {
     /// Creates a field entry for a bytes field
     pub fn new_bytes(field_name: String) -> FieldEntry {
+        assert!(is_valid_field_name(&field_name));
         FieldEntry {
             name: field_name,
             field_type: FieldType::Bytes,
@@ -268,6 +275,12 @@ mod tests {
     use crate::schema::TEXT;
     use serde_json;

+    #[test]
+    #[should_panic]
+    fn test_invalid_field_name_should_panic() {
+        FieldEntry::new_text("-hello".to_string(), TEXT);
+    }
+
     #[test]
     fn test_json_serialization() {
         let field_value = FieldEntry::new_text(String::from("title"), TEXT);
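The new asserts move field-name validation to schema-construction time: an invalid name now fails fast with a panic instead of producing a silently unusable field. Hypothetical usage mirroring the new test:

```rust
use tantivy::schema::{FieldEntry, TEXT};

fn main() {
    let _entry = FieldEntry::new_text("title".to_string(), TEXT); // valid name: fine
    // FieldEntry::new_text("-hello".to_string(), TEXT);          // would panic now
}
```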

View File

@@ -149,14 +149,16 @@ pub use self::int_options::IntOptions;
 use once_cell::sync::Lazy;
 use regex::Regex;

+/// Regular expression representing the restriction on a valid field names.
+pub const FIELD_NAME_PATTERN: &str = r#"^[_a-zA-Z][_\-a-zA-Z0-9]*$"#;
+
 /// Validator for a potential `field_name`.
 /// Returns true iff the name can be use for a field name.
 ///
 /// A field name must start by a letter `[a-zA-Z]`.
 /// The other characters can be any alphanumic character `[a-ZA-Z0-9]` or `_`.
 pub fn is_valid_field_name(field_name: &str) -> bool {
-    static FIELD_NAME_PTN: Lazy<Regex> =
-        Lazy::new(|| Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap());
+    static FIELD_NAME_PTN: Lazy<Regex> = Lazy::new(|| Regex::new(FIELD_NAME_PATTERN).unwrap());
     FIELD_NAME_PTN.is_match(field_name)
 }
@@ -170,6 +172,11 @@ mod tests {
     assert!(is_valid_field_name("text"));
     assert!(is_valid_field_name("text0"));
     assert!(!is_valid_field_name("0text"));
+    assert!(is_valid_field_name("field-name"));
+    assert!(is_valid_field_name("field_name"));
+    assert!(!is_valid_field_name("field!name"));
+    assert!(!is_valid_field_name("-fieldname"));
+    assert!(is_valid_field_name("_fieldname"));
     assert!(!is_valid_field_name(""));
     assert!(!is_valid_field_name("シャボン玉"));
     assert!(is_valid_field_name("my_text_field"));

View File

@@ -381,19 +381,16 @@ impl<'de> Deserialize<'de> for Schema {
 /// Error that may happen when deserializing
 /// a document from JSON.
-#[derive(Debug, Fail, PartialEq)]
+#[derive(Debug, Error, PartialEq)]
 pub enum DocParsingError {
     /// The payload given is not valid JSON.
-    #[fail(display = "The provided string is not valid JSON")]
+    #[error("The provided string is not valid JSON")]
     NotJSON(String),
     /// One of the value node could not be parsed.
-    #[fail(display = "The field '{:?}' could not be parsed: {:?}", _0, _1)]
+    #[error("The field '{0:?}' could not be parsed: {1:?}")]
     ValueError(String, ValueParsingError),
     /// The json-document contains a field that is not declared in the schema.
-    #[fail(
-        display = "The document contains a field that is not declared in the schema: {:?}",
-        _0
-    )]
+    #[error("The document contains a field that is not declared in the schema: {0:?}")]
     NoSuchFieldInSchema(String),
 }
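This hunk is representative of the whole `failure` → `thiserror` migration: the derive becomes `Error`, and `#[fail(display = "...", _0)]` attributes become `#[error("...")]` with positional `{0}` / `{1}` interpolation. A minimal sketch of the pattern on a made-up error type:

```rust
use thiserror::Error;

#[derive(Debug, Error, PartialEq)]
pub enum DemoError {
    /// Positional fields interpolate as {0}, {1}, ...
    #[error("the field '{0:?}' could not be parsed: {1:?}")]
    ValueError(String, String),
}

fn main() {
    let err = DemoError::ValueError("title".to_string(), "bad int".to_string());
    // thiserror generates the Display impl from the #[error(...)] attribute.
    assert_eq!(
        err.to_string(),
        "the field '\"title\"' could not be parsed: \"bad int\""
    );
}
```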

View File

@@ -4,7 +4,6 @@ use super::Field;
 use crate::common;
 use crate::schema::Facet;
 use crate::DateTime;
-use byteorder::{BigEndian, ByteOrder};
 use std::str;

 /// Size (in bytes) of the buffer of a int field.
@@ -19,6 +18,10 @@
     B: AsRef<[u8]>;

 impl Term {
+    pub(crate) fn new() -> Term {
+        Term(Vec::with_capacity(100))
+    }
+
     /// Builds a term given a field, and a i64-value
     ///
     /// Assuming the term has a field id of 1, and a i64 value of 3234,
@@ -93,6 +96,12 @@ impl Term {
         term
     }

+    pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
+        let mut term = Term::for_field(field);
+        term.set_bytes(bytes);
+        term
+    }
+
     /// Creates a new Term for a given field.
     pub(crate) fn for_field(field: Field) -> Term {
         let mut term = Term(Vec::with_capacity(100));
@@ -100,12 +109,10 @@
         term
     }

-    /// Returns the field.
-    pub fn set_field(&mut self, field: Field) {
-        if self.0.len() < 4 {
-            self.0.resize(4, 0u8);
-        }
-        BigEndian::write_u32(&mut self.0[0..4], field.field_id());
+    pub(crate) fn set_field(&mut self, field: Field) {
+        self.0.clear();
+        self.0
+            .extend_from_slice(&field.field_id().to_be_bytes()[..]);
     }

     /// Sets a u64 value in the term.
@@ -116,7 +123,7 @@
     /// the natural order of the values.
     pub fn set_u64(&mut self, val: u64) {
         self.0.resize(INT_TERM_LEN, 0u8);
-        BigEndian::write_u64(&mut self.0[4..], val);
+        self.0[4..12].copy_from_slice(val.to_be_bytes().as_ref());
     }

     /// Sets a `i64` value in the term.
@@ -134,12 +141,6 @@
         self.0.extend(bytes);
     }

-    pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
-        let mut term = Term::for_field(field);
-        term.set_bytes(bytes);
-        term
-    }
-
     /// Set the texts only, keeping the field untouched.
     pub fn set_text(&mut self, text: &str) {
         self.set_bytes(text.as_bytes());
@@ -157,7 +158,9 @@
     /// Returns the field.
     pub fn field(&self) -> Field {
-        Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4]))
+        let mut field_id_bytes = [0u8; 4];
+        field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
+        Field::from_field_id(u32::from_be_bytes(field_id_bytes))
     }

     /// Returns the `u64` value stored in a term.
@@ -166,7 +169,9 @@
     /// ... or returns an invalid value
     /// if the term is not a `u64` field.
     pub fn get_u64(&self) -> u64 {
-        BigEndian::read_u64(&self.0.as_ref()[4..])
+        let mut field_id_bytes = [0u8; 8];
+        field_id_bytes.copy_from_slice(self.value_bytes());
+        u64::from_be_bytes(field_id_bytes)
     }

     /// Returns the `i64` value stored in a term.
@@ -175,7 +180,7 @@
     /// ... or returns an invalid value
     /// if the term is not a `i64` field.
     pub fn get_i64(&self) -> i64 {
-        common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
+        common::u64_to_i64(self.get_u64())
     }

     /// Returns the `f64` value stored in a term.
@@ -184,7 +189,7 @@
     /// ... or returns an invalid value
     /// if the term is not a `f64` field.
     pub fn get_f64(&self) -> f64 {
-        common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..]))
+        common::u64_to_f64(self.get_u64())
     }

     /// Returns the text associated with the term.
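This file drops the `byteorder` dependency entirely: the `to_be_bytes` / `from_be_bytes` methods on the primitive integer types (stable since Rust 1.32) cover the same big-endian round-trips. A standalone sketch of the equivalence, using the term layout described above (4-byte field id followed by an 8-byte value):

```rust
fn main() {
    // Writing: was BigEndian::write_u32 / BigEndian::write_u64.
    let field_id: u32 = 1;
    let value: u64 = 3234;
    let mut buf = Vec::new();
    buf.extend_from_slice(&field_id.to_be_bytes());
    buf.extend_from_slice(&value.to_be_bytes());

    // Reading: was BigEndian::read_u32 / BigEndian::read_u64.
    let mut id_bytes = [0u8; 4];
    id_bytes.copy_from_slice(&buf[..4]);
    let mut value_bytes = [0u8; 8];
    value_bytes.copy_from_slice(&buf[4..12]);
    assert_eq!(u32::from_be_bytes(id_bytes), 1);
    assert_eq!(u64::from_be_bytes(value_bytes), 3234);
}
```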

View File

@@ -221,6 +221,12 @@ impl<'a> From<&'a str> for Value {
     }
 }

+impl<'a> From<&'a [u8]> for Value {
+    fn from(bytes: &'a [u8]) -> Value {
+        Value::Bytes(bytes.to_vec())
+    }
+}
+
 impl<'a> From<Facet> for Value {
     fn from(facet: Facet) -> Value {
         Value::Facet(facet)
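Hypothetical usage of the new conversion, assuming `tantivy::schema::Value` is in scope:

```rust
use tantivy::schema::Value;

fn main() {
    // A byte slice now converts into Value directly, copying into Value::Bytes.
    let value: Value = (&b"payload"[..]).into();
    assert!(matches!(value, Value::Bytes(ref bytes) if bytes.as_slice() == b"payload"));
}
```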

View File

@@ -221,7 +221,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
 /// # let text_field = schema_builder.add_text_field("text", TEXT);
 /// # let schema = schema_builder.build();
 /// # let index = Index::create_in_ram(schema);
-/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
+/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
 /// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
 /// #    Je ne me sentis plus guidé par les haleurs :
 /// #    Des Peaux-Rouges criards les avaient pris pour cibles,
@@ -506,7 +506,7 @@ Survey in 2016, 2017, and 2018."#;
         let index = Index::create_in_ram(schema);
         {
             // writing the segment
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(text_field => "a"));
             index_writer.add_document(doc!(text_field => "a"));
             index_writer.add_document(doc!(text_field => "a b"));
@@ -562,7 +562,7 @@ Survey in 2016, 2017, and 2018."#;
         let index = Index::create_in_ram(schema);
         {
             // writing the segment
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             {
                 let doc = doc!(text_field => TEST_TEXT);
                 index_writer.add_document(doc);
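This and the remaining test hunks below all make the same substitution: the repeated `writer_with_num_threads(1, 3_000_000)` boilerplate becomes the `writer_for_tests()` helper, so the test writer's thread count and memory budget live in one place. A hedged sketch of what such a helper amounts to (its exact configuration is not confirmed by this excerpt):

```rust
// Hypothetical stand-in: one function owning the "1 thread, small heap"
// configuration that every test previously spelled out inline.
fn test_writer(index: &tantivy::Index) -> tantivy::Result<tantivy::IndexWriter> {
    index.writer_with_num_threads(1, 3_000_000)
}
```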

View File

@@ -336,7 +336,7 @@ mod test {
         let index = Index::create_in_ram(schema.clone());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(name => 1u64));
             index_writer.add_document(doc!(name => 2u64));
             index_writer.add_document(doc!(name => 10u64));
@@ -374,7 +374,7 @@ mod test {
         let index = Index::create_in_ram(schema.clone());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(name => "hi"));
             index_writer.add_document(doc!(name => "this is a test"));
             index_writer.add_document(
@@ -414,7 +414,7 @@ mod test {
         let index = Index::create_in_ram(schema.clone());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(name => "hi"));
             index_writer.add_document(doc!(name => "this is a test"));
             index_writer.add_document(
@@ -453,7 +453,7 @@ mod test {
         let index = Index::create_in_ram(schema.clone());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(name => 1u64));
             index_writer.add_document(doc!(name => 2u64));
             index_writer.add_document(doc!(name => 3u64));

View File

@@ -68,19 +68,17 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
     }

     pub fn insert(&mut self, key: u64, dest: &T) -> io::Result<()> {
-        let mut layer_id = 0;
         let mut skip_pointer = self.data_layer.insert(key, dest)?;
-        loop {
-            skip_pointer = match skip_pointer {
-                Some((skip_doc_id, skip_offset)) => self
-                    .get_skip_layer(layer_id)
-                    .insert(skip_doc_id, &skip_offset)?,
-                None => {
-                    return Ok(());
-                }
-            };
-            layer_id += 1;
+        for layer_id in 0.. {
+            if let Some((skip_doc_id, skip_offset)) = skip_pointer {
+                skip_pointer = self
+                    .get_skip_layer(layer_id)
+                    .insert(skip_doc_id, &skip_offset)?;
+            } else {
+                break;
+            }
         }
+        Ok(())
     }

     pub fn write<W: Write>(self, output: &mut W) -> io::Result<()> {
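The `insert` refactor trades a manually incremented `layer_id` plus an early `return` for an open-ended `for layer_id in 0..` range and a `break`, leaving a single `Ok(())` exit. A standalone sketch of the same control flow, propagating a value up through layers until it runs out (the shrinking rule is made up for illustration):

```rust
fn main() {
    // Stand-in for the skip pointer bubbling up the skip-list layers.
    let mut carry: Option<u64> = Some(27);
    for layer_id in 0.. {
        if let Some(value) = carry {
            println!("layer {}: inserted {}", layer_id, value);
            // Each layer carries a smaller value upward until nothing remains.
            carry = if value >= 3 { Some(value / 3) } else { None };
        } else {
            break;
        }
    }
}
```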

View File

@@ -138,7 +138,7 @@ mod tests {
         let text_field = schema_builder.add_text_field("text", TEXT);
         let index = Index::create_in_ram(schema_builder.build());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             {
                 {
                     let mut doc = Document::default();