Compare commits

...

25 Commits
0.13.1 ... slog

Author SHA1 Message Date
Paul Masurel
1c81b8171f Switch to slog
Closes #111
2020-09-30 19:55:54 +09:00
Paul Masurel
838c476733 Hirevo move to thiserror (#889)
* Migrated from `failure` to `thiserror`

* Refactoring

Co-authored-by: Nicolas Polomack <nicolas@polomack.eu>
2020-09-30 16:34:10 +09:00
Paul Masurel
5f574348d1 Syntactic change. 2020-09-26 21:33:00 +09:00
Paul Masurel
19a02b2c30 Merge tag '0.13.1'
0.13.1 was published as a hotfix to accommodate tantivy-py.
2020-09-19 21:20:27 +09:00
Paul Masurel
70bae7ce4c Removing Term Vec allocation (#881) 2020-09-08 23:11:00 +09:00
Paul Masurel
ac2a7273e6 Re-added comment to Score. 2020-09-08 21:41:34 +09:00
Paul Masurel
4ce9517a82 fix unit test for bench. remove scoref64 feature. fixed test for lz4 feature. 2020-09-08 07:35:00 +09:00
Paul Masurel
73024a8af3 Fixing compilation of bench and doctests. 2020-09-08 07:18:43 +09:00
Paul Masurel
e70e605fc3 fix unit test (at least on linux) 2020-09-07 23:35:04 +09:00
Paul Masurel
439d6956a9 Returning Result in some of the API (#880)
* Returning Result in some of the API

* Introducing `.writer_for_test(..)`
2020-09-07 15:52:34 +09:00
Paul Masurel
6530bf0eae Make field types less strict when populating documents. 2020-09-06 10:24:03 +09:00
Paul Masurel
151498cbe7 Creating the tempfile for atomicwrites in the same directory as the MmapDirectory. (#878) 2020-09-05 23:06:29 +09:00
Paul Masurel
3a72b1cb98 Accept dash within field names. (#874)
Accept dash in field names and enforce field names constraint at the
creation of the schema.

Closes #796
2020-09-01 13:38:52 +09:00
Paul Masurel
2737822620 Fixing unit tests. (#868)
There was a unit test failing when notify was sending more
than one event on atomicwrites.

It was observed on MacOS CI.
2020-08-27 16:43:39 +09:00
b8591340
06c12ae221 Filter meta.json from validate_checksum (#872) 2020-08-27 07:54:37 +09:00
Paul Masurel
4e4400af7f Added cargo timing report to .gitignore 2020-08-23 16:15:28 +09:00
Paul Masurel
3f1ecf53ab Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-22 21:30:47 +09:00
Paul Masurel
0b583b8130 Plastic changes 2020-08-22 21:29:12 +09:00
Paul Masurel
31d18dca1c Removing dependency to atomicwrites (#866) 2020-08-21 21:37:05 +09:00
stephenlagree
5e06e7de5a Update basic_search.rs (#865)
Remove duplicated document entry.
2020-08-21 11:23:09 +09:00
Paul Masurel
8af53cbd36 Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-21 08:57:42 +09:00
Paul Masurel
4914076e8f Fixing release build 2020-08-21 08:57:27 +09:00
Paul Masurel
e04f47e922 Using block wand for term queries too. 2020-08-20 15:51:21 +09:00
Paul Masurel
f355695581 Code clean up 2020-08-20 15:42:50 +09:00
Paul Masurel
cbacdf0de8 Edited README. 2020-08-20 14:28:24 +09:00
71 changed files with 1119 additions and 1021 deletions

1
.gitignore vendored
View File

@@ -12,3 +12,4 @@ cpp/simdcomp/bitpackingbenchmark
*.bk
.idea
trace.dat
cargo-timing*

View File

@@ -1,9 +1,14 @@
Tantivy 0.14.0
=========================
- Removed the dependency on atomicwrites #833. (Implemented by @pmasurel upon suggestion and research from @asafigan.)
- Migrated tantivy error from the now deprecated `failure` crate to `thiserror` #760. (@hirevo)
- Switched to structured logging (via the `slog` crate). (@pmasurel)
Tantivy 0.13.1
======================
===================
Made `Query` and `Collector` `Send + Sync`.
Updated misc dependency versions.
Tantivy 0.13.0
======================
Tantivy 0.13 introduces a change in the index format that will require

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.13.1"
version = "0.14.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -22,9 +22,9 @@ tantivy-fst = "0.3"
memmap = {version = "0.7", optional=true}
lz4 = {version="1", optional=true}
snap = "1"
atomicwrites = {version="0.2", optional=true}
tempfile = "3"
log = "0.4"
tempfile = {version="3", optional=true}
slog = "2.5"
slog-stdlog = "4"
serde = {version="1", features=["derive"]}
serde_json = "1"
num_cpus = "1"
@@ -35,15 +35,15 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.7"
futures = {version = "0.3", features=["thread-pool"] }
owning_ref = "0.4"
tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
stable_deref_trait = "1"
rust-stemmers = "1"
downcast-rs = "1"
tantivy-query-grammar = { version="0.13", path="./query-grammar" }
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
census = "0.4"
fnv = "1"
owned-read = "0.4"
failure = "0.1"
thiserror = "1.0"
htmlescape = "0.3"
fail = "0.4"
murmurhash32 = "0.2"
@@ -75,12 +75,11 @@ overflow-checks = true
[features]
default = ["mmap"]
mmap = ["atomicwrites", "fs2", "memmap", "notify"]
mmap = ["fs2", "tempfile", "memmap", "notify"]
lz4-compression = ["lz4"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
scoref64 = [] # scores are f64 instead of f32. was introduced to debug blockwand.
[workspace]
members = ["query-grammar"]

View File

@@ -34,11 +34,6 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
performance for different types of queries / collections.
In general, Tantivy tends to be
- slower than Lucene on union with a Top-K due to Block-WAND optimization.
- faster than Lucene on intersection and phrase queries.
Your mileage WILL vary depending on the nature of queries and their load.
# Features

View File

@@ -112,18 +112,6 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.13.0"
version = "0.14.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -52,7 +52,7 @@ mod test {
use crate::Occur;
#[test]
fn test_Occur_compose() {
fn test_occur_compose() {
assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should);
assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
assert_eq!(

View File

@@ -9,8 +9,10 @@ use combine::{
fn field<'a>() -> impl Parser<&'a str, Output = String> {
(
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
(letter().or(char('_'))),
many(satisfy(|c: char| {
c.is_alphanumeric() || c == '_' || c == '-'
})),
)
.skip(char(':'))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
@@ -279,6 +281,8 @@ pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
#[cfg(test)]
mod test {
type TestParseResult = Result<(), StringStreamError>;
use super::*;
use combine::parser::Parser;
@@ -296,9 +300,10 @@ mod test {
}
#[test]
fn test_occur_symbol() {
assert_eq!(super::occur_symbol().parse("-"), Ok((Occur::MustNot, "")));
assert_eq!(super::occur_symbol().parse("+"), Ok((Occur::Must, "")));
fn test_occur_symbol() -> TestParseResult {
assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, ""));
assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, ""));
Ok(())
}
#[test]
@@ -410,6 +415,25 @@ mod test {
assert_eq!(format!("{:?}", ast), "\"abc\"");
}
#[test]
fn test_field_name() -> TestParseResult {
assert_eq!(
super::field().parse("my-field-name:a")?,
("my-field-name".to_string(), "a")
);
assert_eq!(
super::field().parse("my_field_name:a")?,
("my_field_name".to_string(), "a")
);
assert!(super::field().parse(":a").is_err());
assert!(super::field().parse("-my_field:a").is_err());
assert_eq!(
super::field().parse("_my_field:a")?,
("_my_field".to_string(), "a")
);
Ok(())
}
#[test]
fn test_range_parser() {
// testing the range() parser separately

View File

@@ -472,7 +472,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -531,7 +531,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facets");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/subjects/A/a"),
facet_field => Facet::from_text(&"/subjects/B/a"),
@@ -550,12 +550,12 @@ mod tests {
}
#[test]
fn test_doc_search_by_facet() {
fn test_doc_search_by_facet() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/A"),
));
@@ -568,8 +568,8 @@ mod tests {
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/D/C/A"),
));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 4);
@@ -586,17 +586,17 @@ mod tests {
assert_eq!(count_facet("/A/C"), 1);
assert_eq!(count_facet("/A/C/A"), 1);
assert_eq!(count_facet("/C/A"), 0);
let query_parser = QueryParser::for_index(&index, vec![]);
{
let query_parser = QueryParser::for_index(&index, vec![]);
{
let query = query_parser.parse_query("facet:/A/B").unwrap();
assert_eq!(1, searcher.search(&query, &Count).unwrap());
}
{
let query = query_parser.parse_query("facet:/A").unwrap();
assert_eq!(3, searcher.search(&query, &Count).unwrap());
}
let query = query_parser.parse_query("facet:/A/B")?;
assert_eq!(1, searcher.search(&query, &Count).unwrap());
}
{
let query = query_parser.parse_query("facet:/A")?;
assert_eq!(3, searcher.search(&query, &Count)?);
}
Ok(())
}
#[test]
@@ -631,7 +631,7 @@ mod tests {
.collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc);
}
@@ -684,7 +684,7 @@ mod bench {
// 40425 docs
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc);
}

View File

@@ -89,7 +89,7 @@ mod tests {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
for i in 0u64..10u64 {
index_writer.add_document(doc!(

View File

@@ -259,7 +259,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"));

View File

@@ -38,7 +38,7 @@ use std::fmt;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -123,7 +123,7 @@ impl TopDocs {
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -163,7 +163,7 @@ impl TopDocs {
/// # let schema = schema_builder.build();
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
@@ -264,7 +264,7 @@ impl TopDocs {
/// fn create_index() -> tantivy::Result<Index> {
/// let schema = create_schema();
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
@@ -371,7 +371,7 @@ impl TopDocs {
/// # fn main() -> tantivy::Result<()> {
/// # let schema = create_schema();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # let product_name = index.schema().get_field("product_name").unwrap();
/// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
@@ -561,7 +561,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.add_document(doc!(text_field=>"I like Droopy"));
@@ -821,7 +821,7 @@ mod tests {
) -> (Index, Box<dyn Query>) {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
doc_adder(&mut index_writer);
index_writer.commit().unwrap();
let query_parser = QueryParser::for_index(&index, vec![query_field]);

View File

@@ -1,5 +1,6 @@
use crossbeam::channel;
use rayon::{ThreadPool, ThreadPoolBuilder};
use slog::{error, Logger};
/// Search executor, whether search requests are run single-threaded or multithreaded.
///
@@ -43,6 +44,7 @@ impl Executor {
&self,
f: F,
args: AIterator,
logger: Logger,
) -> crate::Result<Vec<R>> {
match self {
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
@@ -57,7 +59,7 @@ impl Executor {
let (idx, arg) = arg_with_idx;
let fruit = f(arg);
if let Err(err) = fruit_sender.send((idx, fruit)) {
error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
error!(logger, "Failed to send search task. It probably means all search threads have panicked. {:?}", err);
}
});
}
@@ -87,17 +89,21 @@ impl Executor {
#[cfg(test)]
mod tests {
use slog::{o, Discard, Logger};
use super::Executor;
#[test]
#[should_panic(expected = "panic should propagate")]
fn test_panic_propagates_single_thread() {
let logger = Logger::root(Discard, o!());
let _result: Vec<usize> = Executor::single_thread()
.map(
|_| {
panic!("panic should propagate");
},
vec![0].into_iter(),
logger,
)
.unwrap();
}
@@ -105,6 +111,7 @@ mod tests {
#[test]
#[should_panic] //< unfortunately the panic message is not propagated
fn test_panic_propagates_multi_thread() {
let logger = Logger::root(Discard, o!());
let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
.unwrap()
.map(
@@ -112,14 +119,16 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
logger,
)
.unwrap();
}
#[test]
fn test_map_singlethread() {
let logger = Logger::root(Discard, o!());
let result: Vec<usize> = Executor::single_thread()
.map(|i| Ok(i * 2), 0..1_000)
.map(|i| Ok(i * 2), 0..1_000, logger)
.unwrap();
assert_eq!(result.len(), 1_000);
for i in 0..1_000 {
@@ -129,9 +138,10 @@ mod tests {
#[test]
fn test_map_multithread() {
let logger = Logger::root(Discard, o!());
let result: Vec<usize> = Executor::multi_thread(3, "search-test")
.unwrap()
.map(|i| Ok(i * 2), 0..10)
.map(|i| Ok(i * 2), 0..10, logger)
.unwrap();
assert_eq!(result.len(), 10);
for i in 0..10 {

View File

@@ -21,6 +21,7 @@ use crate::schema::FieldType;
use crate::schema::Schema;
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
use slog::Logger;
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::fmt;
@@ -57,7 +58,14 @@ pub struct Index {
}
impl Index {
/// Examines the director to see if it contains an index
pub(crate) fn logger(&self) -> &Logger {
self.directory.logger()
}
/// Examines the directory to see if it contains an index.
///
/// Effectively, it only checks for the presence of the `meta.json` file.
pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
dir.exists(&META_FILEPATH)
}
@@ -140,16 +148,18 @@ impl Index {
Index::create(mmap_directory, schema)
}
/// Creates a new index given an implementation of the trait `Directory`
/// Creates a new index given an implementation of the trait `Directory`.
///
/// If a directory previously existed, it will be erased.
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
let directory = ManagedDirectory::wrap(dir)?;
Index::from_directory(directory, schema)
Index::new_from_directory(directory, schema)
}
/// Create a new index from a directory.
///
/// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> crate::Result<Index> {
fn new_from_directory(mut directory: ManagedDirectory, schema: Schema) -> crate::Result<Index> {
save_new_metas(schema.clone(), directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas, SegmentMetaInventory::default())
@@ -240,6 +250,8 @@ impl Index {
/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
let logger: &Logger = directory.logger();
slog::info!(logger, "index-open"; "directory" => format!("{:?}", directory));
let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?;
@@ -300,6 +312,15 @@ impl Index {
)
}
/// Helper to create an index writer for tests.
///
/// That index writer simply has a single thread and a heap of 10 MB.
/// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)]
pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
self.writer_with_num_threads(1, 10_000_000)
}
/// Creates a multithreaded writer
///
/// Tantivy will automatically define the number of threads to use.
@@ -502,7 +523,7 @@ mod tests {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let mut index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut writer = index.writer_for_tests().unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
@@ -539,23 +560,33 @@ mod tests {
test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
}
}
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
let mut reader_index = reader.index();
let (sender, receiver) = crossbeam::channel::unbounded();
let _watch_handle = reader_index.directory_mut().watch(Box::new(move || {
let _ = sender.send(());
}));
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut writer = index.writer_for_tests().unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
assert!(receiver.recv().is_ok());
assert_eq!(reader.searcher().num_docs(), 1);
// We need a loop here because it is possible for notify to send more than
// one modify event. It was observed on CI on MacOS.
loop {
assert!(receiver.recv().is_ok());
if reader.searcher().num_docs() == 1 {
break;
}
}
writer.add_document(doc!(field=>2u64));
writer.commit().unwrap();
assert!(receiver.recv().is_ok());
assert_eq!(reader.searcher().num_docs(), 2);
// ... Same as above
loop {
assert!(receiver.recv().is_ok());
if reader.searcher().num_docs() == 2 {
break;
}
}
}
// This test will not pass on windows, because windows

View File

@@ -3,7 +3,6 @@ use crate::directory::ReadOnlySource;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::FieldType;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::termdict::TermDictionary;
@@ -54,10 +53,7 @@ impl InvertedIndexReader {
/// Creates an empty `InvertedIndexReader` object, which
/// contains no terms at all.
pub fn empty(field_type: &FieldType) -> InvertedIndexReader {
let record_option = field_type
.get_index_record_option()
.unwrap_or(IndexRecordOption::Basic);
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionary::empty(),
postings_source: ReadOnlySource::empty(),

View File

@@ -143,6 +143,7 @@ impl Searcher {
collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader)
},
segment_readers.iter().enumerate(),
self.index.logger().clone(),
)?;
collector.merge_fruits(fruits)
}

View File

@@ -21,6 +21,12 @@ use std::sync::atomic;
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(Uuid);
impl ToString for SegmentId {
fn to_string(&self) -> String {
self.short_uuid_string()
}
}
#[cfg(test)]
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default());

View File

@@ -9,14 +9,15 @@ use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
use crate::schema::{Field, IndexRecordOption};
use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::DocId;
use fail::fail_point;
use slog::{warn, Logger};
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
@@ -53,6 +54,7 @@ pub struct SegmentReader {
store_source: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>,
schema: Schema,
logger: Logger,
}
impl SegmentReader {
@@ -125,17 +127,15 @@ impl SegmentReader {
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
fieldnorm_reader
} else {
pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> {
self.fieldnorm_readers.get_field(field).ok_or_else(|| {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
"Field norm not found for field {:?}. Was it market as indexed during indexing.",
field_name
);
panic!(err_msg);
}
crate::TantivyError::SchemaError(err_msg)
})
}
/// Accessor to the segment's `StoreReader`.
@@ -202,6 +202,7 @@ impl SegmentReader {
positions_composite,
positions_idx_composite,
schema,
logger: segment.index().logger().clone(),
})
}
@@ -212,6 +213,11 @@ impl SegmentReader {
/// The field reader is in charge of iterating through the
/// term dictionary associated to a specific field,
/// and opening the posting list associated to any term.
///
/// If the field is not marked as indexed, a warning is logged and an empty
/// `InvertedIndexReader` is returned.
/// Similarly, if the field is marked as indexed but no term has been indexed for the given
/// index, an empty `InvertedIndexReader` is returned (but no warning is logged).
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
if let Some(inv_idx_reader) = self
.inv_idx_reader_cache
@@ -226,21 +232,25 @@ impl SegmentReader {
let record_option_opt = field_type.get_index_record_option();
if record_option_opt.is_none() {
panic!("Field {:?} does not seem indexed.", field_entry.name());
warn!(
self.logger,
"Field {:?} does not seem indexed.",
field_entry.name()
);
}
let record_option = record_option_opt.unwrap();
let postings_source_opt = self.postings_composite.open_read(field);
if postings_source_opt.is_none() {
if postings_source_opt.is_none() || record_option_opt.is_none() {
// no documents in the segment contained this field.
// As a result, no data is associated to the inverted index.
//
// Returns an empty inverted index.
return Arc::new(InvertedIndexReader::empty(field_type));
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
return Arc::new(InvertedIndexReader::empty(record_option));
}
let record_option = record_option_opt.unwrap();
let postings_source = postings_source_opt.unwrap();
let termdict_source = self.termdict_composite.open_read(field).expect(
@@ -339,7 +349,7 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));

View File

@@ -1,3 +1,5 @@
use slog::{error, Logger};
use crate::directory::directory_lock::Lock;
use crate::directory::error::LockError;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
@@ -64,7 +66,10 @@ impl<T: Send + Sync + 'static> From<Box<T>> for DirectoryLock {
impl Drop for DirectoryLockGuard {
fn drop(&mut self) {
if let Err(e) = self.directory.delete(&*self.path) {
error!("Failed to remove the lock file. {:?}", e);
error!(
self.directory.logger(),
"Failed to remove the lock file. {:?}", e
);
}
}
}
@@ -80,7 +85,7 @@ fn try_acquire_lock(
) -> Result<DirectoryLock, TryAcquireLockError> {
let mut write = directory.open_write(filepath).map_err(|e| match e {
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
OpenWriteError::IOError(io_error) => TryAcquireLockError::IOError(io_error.into()),
OpenWriteError::IOError { io_error, .. } => TryAcquireLockError::IOError(io_error),
})?;
write.flush().map_err(TryAcquireLockError::IOError)?;
Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
@@ -209,6 +214,9 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
/// `OnCommit` `ReloadPolicy` to work properly.
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>;
/// Returns the `slog::Logger` configured for the `Directory`.
fn logger(&self) -> &Logger;
}
/// DirectoryClone

View File

@@ -1,160 +1,60 @@
use crate::Version;
use std::error::Error as StdError;
use std::fmt;
use std::io;
use std::path::PathBuf;
/// Error while trying to acquire a directory lock.
#[derive(Debug, Fail)]
#[derive(Debug, Error)]
pub enum LockError {
/// Failed to acquire a lock as it is already held by another
/// client.
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
#[fail(
display = "Could not acquire lock as it is already held, possibly by a different process."
)]
#[error("Could not acquire lock as it is already held, possibly by a different process.")]
LockBusy,
/// Trying to acquire a lock failed with an `IOError`
#[fail(display = "Failed to acquire the lock due to an io:Error.")]
#[error("Failed to acquire the lock due to an io:Error.")]
IOError(io::Error),
}
/// General IO error with an optional path to the offending file.
#[derive(Debug)]
pub struct IOError {
path: Option<PathBuf>,
err: io::Error,
}
impl Into<io::Error> for IOError {
fn into(self) -> io::Error {
self.err
}
}
impl fmt::Display for IOError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.path {
Some(ref path) => write!(f, "io error occurred on path '{:?}': '{}'", path, self.err),
None => write!(f, "io error occurred: '{}'", self.err),
}
}
}
impl StdError for IOError {
fn description(&self) -> &str {
"io error occurred"
}
fn cause(&self) -> Option<&dyn StdError> {
Some(&self.err)
}
}
impl IOError {
pub(crate) fn with_path(path: PathBuf, err: io::Error) -> Self {
IOError {
path: Some(path),
err,
}
}
}
impl From<io::Error> for IOError {
fn from(err: io::Error) -> IOError {
IOError { path: None, err }
}
}
/// Error that may occur when opening a directory
#[derive(Debug)]
#[derive(Debug, Error)]
pub enum OpenDirectoryError {
/// The underlying directory does not exist.
#[error("Directory does not exist: '{0}'.")]
DoesNotExist(PathBuf),
/// The path exists but is not a directory.
#[error("Path exists but is not a directory: '{0}'.")]
NotADirectory(PathBuf),
/// Failed to create a temp directory.
#[error("Failed to create a temporary directory: '{0}'.")]
FailedToCreateTempDir(io::Error),
/// IoError
IoError(io::Error),
}
impl From<io::Error> for OpenDirectoryError {
fn from(io_err: io::Error) -> Self {
OpenDirectoryError::IoError(io_err)
}
}
impl fmt::Display for OpenDirectoryError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
OpenDirectoryError::DoesNotExist(ref path) => {
write!(f, "the underlying directory '{:?}' does not exist", path)
}
OpenDirectoryError::NotADirectory(ref path) => {
write!(f, "the path '{:?}' exists but is not a directory", path)
}
OpenDirectoryError::IoError(ref err) => write!(
f,
"IOError while trying to open/create the directory. {:?}",
err
),
}
}
}
impl StdError for OpenDirectoryError {
fn description(&self) -> &str {
"error occurred while opening a directory"
}
fn cause(&self) -> Option<&dyn StdError> {
None
}
#[error("IOError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
IoError {
/// underlying io Error.
io_error: io::Error,
/// directory we tried to open.
directory_path: PathBuf,
},
}
/// Error that may occur when starting to write in a file
#[derive(Debug)]
#[derive(Debug, Error)]
pub enum OpenWriteError {
/// Our directory is WORM, writing an existing file is forbidden.
/// Checkout the `Directory` documentation.
#[error("File already exists: '{0}'")]
FileAlreadyExists(PathBuf),
/// Any kind of IO error that happens when
/// writing in the underlying IO device.
IOError(IOError),
}
impl From<IOError> for OpenWriteError {
fn from(err: IOError) -> OpenWriteError {
OpenWriteError::IOError(err)
}
}
impl fmt::Display for OpenWriteError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
OpenWriteError::FileAlreadyExists(ref path) => {
write!(f, "the file '{:?}' already exists", path)
}
OpenWriteError::IOError(ref err) => write!(
f,
"an io error occurred while opening a file for writing: '{}'",
err
),
}
}
}
impl StdError for OpenWriteError {
fn description(&self) -> &str {
"error occurred while opening a file for writing"
}
fn cause(&self) -> Option<&dyn StdError> {
match *self {
OpenWriteError::FileAlreadyExists(_) => None,
OpenWriteError::IOError(ref err) => Some(err),
}
}
#[error("IOError '{io_error:?}' while opening file for write: '{filepath}'.")]
IOError {
/// The underlying `io::Error`.
io_error: io::Error,
/// File path of the file that tantivy failed to open for write.
filepath: PathBuf,
},
}
/// Type of index incompatibility between the library and the index found on disk
@@ -217,55 +117,41 @@ impl fmt::Debug for Incompatibility {
}
/// Error that may occur when accessing a file read
#[derive(Debug)]
#[derive(Debug, Error)]
pub enum OpenReadError {
/// The file does not exist.
#[error("Files does not exists: {0:?}")]
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(IOError),
/// This library doesn't support the index version found on disk
/// Any kind of io::Error.
#[error(
"IOError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
)]
IOError {
/// The underlying `io::Error`.
io_error: io::Error,
/// File path of the file that tantivy failed to open for read.
filepath: PathBuf,
},
/// This library does not support the index version found in file footer.
#[error("Index version unsupported: {0:?}")]
IncompatibleIndex(Incompatibility),
}
impl From<IOError> for OpenReadError {
fn from(err: IOError) -> OpenReadError {
OpenReadError::IOError(err)
}
}
impl fmt::Display for OpenReadError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
OpenReadError::FileDoesNotExist(ref path) => {
write!(f, "the file '{:?}' does not exist", path)
}
OpenReadError::IOError(ref err) => write!(
f,
"an io error occurred while opening a file for reading: '{}'",
err
),
OpenReadError::IncompatibleIndex(ref footer) => {
write!(f, "Incompatible index format: {:?}", footer)
}
}
}
}
/// Error that may occur when trying to delete a file
#[derive(Debug)]
#[derive(Debug, Error)]
pub enum DeleteError {
/// The file does not exist.
#[error("File does not exists: '{0}'.")]
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(IOError),
}
impl From<IOError> for DeleteError {
fn from(err: IOError) -> DeleteError {
DeleteError::IOError(err)
}
#[error("The following IO error happened while deleting file '{filepath}': '{io_error:?}'.")]
IOError {
/// The underlying `io::Error`.
io_error: io::Error,
/// File path of the file that tantivy failed to delete.
filepath: PathBuf,
},
}
impl From<Incompatibility> for OpenReadError {
@@ -273,29 +159,3 @@ impl From<Incompatibility> for OpenReadError {
OpenReadError::IncompatibleIndex(incompatibility)
}
}
impl fmt::Display for DeleteError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
DeleteError::FileDoesNotExist(ref path) => {
write!(f, "the file '{:?}' does not exist", path)
}
DeleteError::IOError(ref err) => {
write!(f, "an io error occurred while deleting a file: '{}'", err)
}
}
}
}
impl StdError for DeleteError {
fn description(&self) -> &str {
"error occurred while deleting a file"
}
fn cause(&self) -> Option<&dyn StdError> {
match *self {
DeleteError::FileDoesNotExist(_) => None,
DeleteError::IOError(ref err) => Some(err),
}
}
}

View File

@@ -271,7 +271,11 @@ mod tests {
let mut vec = Vec::new();
let footer_proxy = FooterProxy::new(&mut vec);
assert!(footer_proxy.terminate().is_ok());
assert_eq!(vec.len(), 167);
if crate::store::COMPRESSION == "lz4" {
assert_eq!(vec.len(), 158);
} else {
assert_eq!(vec.len(), 167);
}
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
assert!(matches!(
footer.versioned_footer,

View File

@@ -1,5 +1,5 @@
use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::DirectoryLock;
use crate::directory::GarbageCollectionResult;
@@ -11,9 +11,9 @@ use crate::error::DataCorruption;
use crate::Directory;
use crc32fast::Hasher;
use slog::{debug, error, info};
use std::collections::HashSet;
use std::io;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::result;
use std::sync::RwLockWriteGuard;
@@ -56,9 +56,9 @@ fn save_managed_paths(
directory: &mut dyn Directory,
wlock: &RwLockWriteGuard<'_, MetaInformation>,
) -> io::Result<()> {
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
writeln!(&mut w)?;
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
let mut managed_json = serde_json::to_string_pretty(&wlock.managed_paths)?;
managed_json.push_str("\n");
directory.atomic_write(&MANAGED_FILEPATH, managed_json.as_bytes())?;
Ok(())
}
@@ -86,7 +86,12 @@ impl ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::default(),
}),
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
Err(OpenReadError::IOError { io_error, filepath }) => {
Err(crate::TantivyError::OpenReadError(OpenReadError::IOError {
io_error,
filepath,
}))
}
Err(OpenReadError::IncompatibleIndex(incompatibility)) => {
// For the moment, this should never happen `meta.json`
// do not have any footer and cannot detect incompatibility.
@@ -113,7 +118,7 @@ impl ManagedDirectory {
&mut self,
get_living_files: L,
) -> crate::Result<GarbageCollectionResult> {
info!("Garbage collect");
info!(self.directory.logger(), "gc"; "stage"=>"start");
let mut files_to_delete = vec![];
// It is crucial to get the living files after acquiring the
@@ -148,7 +153,7 @@ impl ManagedDirectory {
}
}
Err(err) => {
error!("Failed to acquire lock for GC");
error!(self.logger(), "Failed to acquire lock for GC");
return Err(crate::TantivyError::from(err));
}
}
@@ -160,7 +165,7 @@ impl ManagedDirectory {
for file_to_delete in files_to_delete {
match self.delete(&file_to_delete) {
Ok(_) => {
info!("Deleted {:?}", file_to_delete);
debug!(self.logger(), "deleted-success"; "file"=>format!("{:?}", file_to_delete));
deleted_files.push(file_to_delete);
}
Err(file_error) => {
@@ -168,12 +173,12 @@ impl ManagedDirectory {
DeleteError::FileDoesNotExist(_) => {
deleted_files.push(file_to_delete.clone());
}
DeleteError::IOError(_) => {
DeleteError::IOError { .. } => {
failed_to_delete_files.push(file_to_delete.clone());
if !cfg!(target_os = "windows") {
// On windows, delete is expected to fail if the file
// is mmapped.
error!("Failed to delete {:?}", file_to_delete);
error!(self.logger(), "delete-file-fail"; "path"=>file_to_delete.to_str().unwrap_or("<invalid-utf8>"));
}
}
}
@@ -195,6 +200,10 @@ impl ManagedDirectory {
save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?;
}
info!(self.directory.logger(), "gc"; "stage"=>"end",
"num-sucess-file-deletes"=>deleted_files.len(),
"num-failed-file-deletes"=>failed_to_delete_files.len());
Ok(GarbageCollectionResult {
deleted_files,
failed_to_delete_files,
@@ -231,8 +240,11 @@ impl ManagedDirectory {
/// Verify checksum of a managed file
pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> {
let reader = self.directory.open_read(path)?;
let (footer, data) = Footer::extract_footer(reader)
.map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
let (footer, data) =
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IOError {
io_error,
filepath: path.to_path_buf(),
})?;
let mut hasher = Hasher::new();
hasher.update(data.as_slice());
let crc = hasher.finalize();
@@ -245,35 +257,46 @@ impl ManagedDirectory {
/// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut hashset = HashSet::new();
let managed_paths = self
let mut managed_paths = self
.meta_informations
.read()
.expect("Managed directory rlock poisoned in list damaged.")
.managed_paths
.clone();
for path in managed_paths.into_iter() {
managed_paths.remove(*META_FILEPATH);
let mut damaged_files = HashSet::new();
for path in managed_paths {
if !self.validate_checksum(&path)? {
hashset.insert(path);
damaged_files.insert(path);
}
}
Ok(hashset)
Ok(damaged_files)
}
}
impl Directory for ManagedDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
slog::debug!(self.logger(), "open-read"; "path" => path.to_str().unwrap_or("<invalid-utf8>"));
let read_only_source = self.directory.open_read(path)?;
let (footer, reader) = Footer::extract_footer(read_only_source)
.map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
let (footer, reader) = Footer::extract_footer(read_only_source).map_err(|io_error| {
OpenReadError::IOError {
io_error,
filepath: path.to_path_buf(),
}
})?;
footer.is_compatible()?;
Ok(reader)
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
slog::debug!(self.logger(), "open-write"; "path" => path.to_str().unwrap_or("<invalid-utf8>"));
self.register_file_as_managed(path)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
.map_err(|io_error| OpenWriteError::IOError {
io_error,
filepath: path.to_path_buf(),
})?;
Ok(io::BufWriter::new(Box::new(FooterProxy::new(
self.directory
.open_write(path)?
@@ -283,9 +306,11 @@ impl Directory for ManagedDirectory {
))))
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
let content_str = std::str::from_utf8(content).unwrap_or("<content-not-utf-8>");
slog::debug!(self.logger(), "Atomic write"; "path" => format!("{:?}", path), "content_length"=>content_str);
self.register_file_as_managed(path)?;
self.directory.atomic_write(path, data)
self.directory.atomic_write(path, content)
}
fn atomic_read(&self, path: &Path) -> result::Result<Vec<u8>, OpenReadError> {
@@ -307,6 +332,10 @@ impl Directory for ManagedDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
self.directory.watch(watch_callback)
}
/// Returns the logger to use for this directory.
///
/// The call is forwarded to the wrapped `self.directory`.
fn logger(&self) -> &slog::Logger {
self.directory.logger()
}
}
impl Clone for ManagedDirectory {

View File

@@ -1,8 +1,6 @@
use crate::core::META_FILEPATH;
use crate::directory::error::LockError;
use crate::directory::error::{
DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError,
};
use crate::directory::error::{DeleteError, OpenDirectoryError, OpenReadError, OpenWriteError};
use crate::directory::read_only_source::BoxedData;
use crate::directory::AntiCallToken;
use crate::directory::Directory;
@@ -19,6 +17,8 @@ use notify::RawEvent;
use notify::RecursiveMode;
use notify::Watcher;
use serde::{Deserialize, Serialize};
use slog::{debug, o, Drain, Logger};
use slog_stdlog::StdLog;
use std::collections::HashMap;
use std::convert::From;
use std::fmt;
@@ -36,11 +36,6 @@ use std::sync::Weak;
use std::thread;
use tempfile::TempDir;
/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped)
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
@@ -48,13 +43,17 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
OpenReadError::IOError {
io_error: e,
filepath: full_path.to_owned(),
}
}
})?;
let meta_data = file
.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
let meta_data = file.metadata().map_err(|e| OpenReadError::IOError {
io_error: e,
filepath: full_path.to_owned(),
})?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return None
@@ -64,7 +63,10 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
unsafe {
memmap::Mmap::map(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
.map_err(|e| OpenReadError::IOError {
io_error: e,
filepath: full_path.to_owned(),
})
}
}
@@ -144,7 +146,7 @@ struct WatcherWrapper {
}
impl WatcherWrapper {
pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
pub(crate) fn new(path: &Path, logger: Logger) -> Result<Self, OpenDirectoryError> {
let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
// We need to initialize the
let watcher = notify::raw_watcher(tx)
@@ -158,7 +160,8 @@ impl WatcherWrapper {
panic!("Unknown error while starting watching directory {:?}", path);
}
})?;
let watcher_router: Arc<WatchCallbackList> = Default::default();
let watcher_router: Arc<WatchCallbackList> =
Arc::new(WatchCallbackList::with_logger(logger));
let watcher_router_clone = watcher_router.clone();
thread::Builder::new()
.name("meta-file-watch-thread".to_string())
@@ -183,6 +186,10 @@ impl WatcherWrapper {
}
}
}
})
.map_err(|io_error| OpenDirectoryError::IoError {
io_error,
directory_path: path.to_path_buf(),
})?;
Ok(WatcherWrapper {
_watcher: Mutex::new(watcher),
@@ -217,15 +224,21 @@ struct MmapDirectoryInner {
mmap_cache: RwLock<MmapCache>,
_temp_directory: Option<TempDir>,
watcher: RwLock<Option<WatcherWrapper>>,
logger: Logger,
}
impl MmapDirectoryInner {
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
fn new(
root_path: PathBuf,
temp_directory: Option<TempDir>,
logger: Logger,
) -> MmapDirectoryInner {
MmapDirectoryInner {
root_path,
mmap_cache: Default::default(),
_temp_directory: temp_directory,
watcher: RwLock::new(None),
logger,
}
}
@@ -237,7 +250,7 @@ impl MmapDirectoryInner {
// The downside is that we might create a watch wrapper that is not useful.
let need_initialization = self.watcher.read().unwrap().is_none();
if need_initialization {
let watch_wrapper = WatcherWrapper::new(&self.root_path)?;
let watch_wrapper = WatcherWrapper::new(&self.root_path, self.logger.clone())?;
let mut watch_wlock = self.watcher.write().unwrap();
// the watcher could have been initialized when we released the lock, and
// we do not want to lose the watched files that were set.
@@ -260,8 +273,8 @@ impl fmt::Debug for MmapDirectory {
}
impl MmapDirectory {
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectory {
let inner = MmapDirectoryInner::new(root_path, temp_directory);
fn new(root_path: PathBuf, temp_directory: Option<TempDir>, logger: Logger) -> MmapDirectory {
let inner = MmapDirectoryInner::new(root_path, temp_directory, logger);
MmapDirectory {
inner: Arc::new(inner),
}
@@ -272,16 +285,19 @@ impl MmapDirectory {
/// This is mostly useful to test the MmapDirectory itself.
/// For your unit tests, prefer the RAMDirectory.
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
let tempdir = TempDir::new().map_err(OpenDirectoryError::IoError)?;
let tempdir_path = PathBuf::from(tempdir.path());
Ok(MmapDirectory::new(tempdir_path, Some(tempdir)))
let tempdir = TempDir::new().map_err(OpenDirectoryError::FailedToCreateTempDir)?;
let logger = Logger::root(StdLog.fuse(), o!());
Ok(MmapDirectory::new(tempdir.path().to_owned(), Some(tempdir), logger))
}
/// Opens a MmapDirectory in a directory.
///
/// Returns an error if the `directory_path` does not
/// exist or if it is not a directory.
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
pub fn open_with_logger<P: AsRef<Path>>(
directory_path: P,
logger: Logger,
) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref();
if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
@@ -292,10 +308,20 @@ impl MmapDirectory {
directory_path,
)))
} else {
Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
Ok(MmapDirectory::new(
PathBuf::from(directory_path),
None,
logger,
))
}
}
/// Opens the `MmapDirectory` rooted at `directory_path` with a default logger.
///
/// The logger is a root `slog` logger built on the `StdLog` drain, with no
/// extra key-value pairs. Call `open_with_logger` directly to supply a
/// different one.
///
/// The call is delegated to `open_with_logger`, so it returns the same
/// errors (e.g. when `directory_path` does not exist).
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let logger = Logger::root(StdLog.fuse(), o!());
Self::open_with_logger(directory_path, logger)
}
/// Joins a relative_path to the directory `root_path`
/// to create a proper complete `filepath`.
fn resolve_path(&self, relative_path: &Path) -> PathBuf {
@@ -355,11 +381,12 @@ impl MmapDirectory {
struct ReleaseLockFile {
// Open handle on the lock file; dropping this handle is what releases the lock.
_file: File,
// Path of the lock file, reported in the debug message logged on drop.
path: PathBuf,
// Logger that receives the "Releasing lock" debug message on drop.
logger: Logger,
}
impl Drop for ReleaseLockFile {
fn drop(&mut self) {
debug!("Releasing lock {:?}", self.path);
debug!(self.logger, "Releasing lock {:?}", self.path);
}
}
@@ -398,16 +425,18 @@ impl TerminatingWrite for SafeFileWriter {
impl Directory for MmapDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while reading {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
let io_error = io::Error::new(io::ErrorKind::Other, msg);
OpenReadError::IOError {
io_error,
filepath: path.to_owned(),
}
})?;
Ok(mmap_cache
.get_mmap(&full_path)?
@@ -420,14 +449,18 @@ impl Directory for MmapDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
let full_path = self.resolve_path(path);
match fs::remove_file(&full_path) {
Ok(_) => self
.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError {
io_error: e,
filepath: path.to_path_buf(),
}),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
Err(DeleteError::IOError {
io_error: e,
filepath: path.to_path_buf(),
})
}
}
}
@@ -439,9 +472,7 @@ impl Directory for MmapDirectory {
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
debug!("Open Write {:?}", path);
let full_path = self.resolve_path(path);
let open_res = OpenOptions::new()
.write(true)
.create_new(true)
@@ -451,18 +482,25 @@ impl Directory for MmapDirectory {
if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
OpenWriteError::IOError {
io_error: err,
filepath: path.to_owned(),
}
}
})?;
// making sure the file is created.
file.flush()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
file.flush().map_err(|io_error| OpenWriteError::IOError {
io_error,
filepath: path.to_owned(),
})?;
// Apparently, on some filesystems, syncing the parent
// directory is required.
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.sync_directory().map_err(|e| OpenWriteError::IOError {
io_error: e,
filepath: path.to_owned(),
})?;
let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer)))
@@ -474,24 +512,31 @@ impl Directory for MmapDirectory {
match File::open(&full_path) {
Ok(mut file) => {
file.read_to_end(&mut buffer)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
.map_err(|io_error| OpenReadError::IOError {
io_error,
filepath: path.to_owned(),
})?;
Ok(buffer)
}
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(io_error) => {
if io_error.kind() == io::ErrorKind::NotFound {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
Err(OpenReadError::IOError {
io_error,
filepath: path.to_owned(),
})
}
}
}
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
debug!("Atomic Write {:?}", path);
fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
let mut tempfile = tempfile::Builder::new().tempfile_in(&self.inner.root_path)?;
tempfile.write_all(content)?;
tempfile.flush()?;
let full_path = self.resolve_path(path);
let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
meta_file.write(|f| f.write_all(data))?;
tempfile.into_temp_path().persist(full_path)?;
Ok(())
}
@@ -508,16 +553,22 @@ impl Directory for MmapDirectory {
} else {
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
}
let logger = self.inner.logger.clone();
// dropping the file handle will release the lock.
Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
path: lock.filepath.clone(),
_file: file,
logger,
})))
}
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
self.inner.watch(watch_callback)
}
/// Returns the logger stored in the shared `inner` state of this directory.
fn logger(&self) -> &Logger {
&self.inner.logger
}
}
#[cfg(test)]
@@ -627,7 +678,8 @@ mod tests {
let counter_clone = counter.clone();
let tmp_dir = tempfile::TempDir::new().unwrap();
let tmp_dirpath = tmp_dir.path().to_owned();
let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
let logger = Logger::root(slog::Discard, o!());
let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath, logger).unwrap();
let tmp_file = tmp_dirpath.join(*META_FILEPATH);
let _handle = watch_wrapper.watch(Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
@@ -652,7 +704,7 @@ mod tests {
{
let index = Index::create(mmap_directory.clone(), schema).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_merge_size(3);
index_writer.set_merge_policy(Box::new(log_merge_policy));

View File

@@ -23,7 +23,8 @@ pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource;
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
pub(crate) use self::watch_event_router::WatchCallbackList;
pub use self::watch_event_router::{WatchCallback, WatchHandle};
use std::io::{self, BufWriter, Write};
use std::path::PathBuf;
/// Outcome of the Garbage collection

View File

@@ -5,6 +5,8 @@ use crate::directory::WatchCallbackList;
use crate::directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
use crate::directory::{TerminatingWrite, WritePtr};
use fail::fail_point;
use slog::{o, Drain, Logger};
use slog_stdlog::StdLog;
use std::collections::HashMap;
use std::fmt;
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
@@ -66,7 +68,7 @@ impl Write for VecWriter {
fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true;
let mut fs = self.shared_directory.fs.write().unwrap();
let mut fs = self.shared_directory.fs.inner_directory.write().unwrap();
fs.write(self.path.clone(), self.data.get_ref());
Ok(())
}
@@ -78,13 +80,19 @@ impl TerminatingWrite for VecWriter {
}
}
#[derive(Default)]
struct InnerDirectory {
fs: HashMap<PathBuf, ReadOnlySource>,
watch_router: WatchCallbackList,
}
impl InnerDirectory {
/// Creates an empty in-memory directory whose watch router logs to `logger`.
///
/// `logger` is received by value and has a single consumer here, so it is
/// moved into the `WatchCallbackList` instead of being cloned first.
fn with_logger(logger: Logger) -> Self {
    InnerDirectory {
        // No file has been written yet.
        fs: Default::default(),
        watch_router: WatchCallbackList::with_logger(logger),
    }
}
fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
let data = ReadOnlySource::new(Vec::from(data));
self.fs.insert(path, data).is_some()
@@ -117,20 +125,32 @@ impl InnerDirectory {
}
}
impl Default for RAMDirectory {
fn default() -> RAMDirectory {
let logger = Logger::root(StdLog.fuse(), o!());
Self::with_logger(logger)
}
}
impl fmt::Debug for RAMDirectory {
    /// Prints the fixed type name only; the stored files are not listed.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("RAMDirectory")
    }
}
/// State shared by a `RAMDirectory` and its clones (it is held behind an `Arc`).
struct Inner {
/// In-memory files and the watch callbacks, guarded by a read/write lock.
inner_directory: RwLock<InnerDirectory>,
/// Logger handed out by `RAMDirectory::logger`.
logger: Logger,
}
/// A Directory storing everything in anonymous memory.
///
/// It is mainly meant for unit testing.
/// Writes are only made visible upon flushing.
///
#[derive(Clone, Default)]
#[derive(Clone)]
pub struct RAMDirectory {
fs: Arc<RwLock<InnerDirectory>>,
fs: Arc<Inner>,
}
impl RAMDirectory {
@@ -139,10 +159,21 @@ impl RAMDirectory {
Self::default()
}
/// Create a `RAMDirectory` with a custom logger.
pub fn with_logger(logger: Logger) -> RAMDirectory {
let inner_directory = InnerDirectory::with_logger(logger.clone()).into();
RAMDirectory {
fs: Arc::new(Inner {
inner_directory,
logger,
}),
}
}
/// Returns the sum of the size of the different files
/// in the RAMDirectory.
pub fn total_mem_usage(&self) -> usize {
self.fs.read().unwrap().total_mem_usage()
self.fs.inner_directory.read().unwrap().total_mem_usage()
}
/// Write a copy of all of the files saved in the RAMDirectory in the target `Directory`.
@@ -152,7 +183,7 @@ impl RAMDirectory {
///
/// If an error is encountered, files may be persisted partially.
pub fn persist(&self, dest: &mut dyn Directory) -> crate::Result<()> {
let wlock = self.fs.write().unwrap();
let wlock = self.fs.inner_directory.write().unwrap();
for (path, source) in wlock.fs.iter() {
let mut dest_wrt = dest.open_write(path)?;
dest_wrt.write_all(source.as_slice())?;
@@ -164,24 +195,25 @@ impl RAMDirectory {
impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.fs.read().unwrap().open_read(path)
self.fs.inner_directory.read().unwrap().open_read(path)
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
fail_point!("RAMDirectory::delete", |_| {
use crate::directory::error::IOError;
let io_error = IOError::from(io::Error::from(io::ErrorKind::Other));
Err(DeleteError::from(io_error))
Err(DeleteError::IOError {
io_error: io::Error::from(io::ErrorKind::Other),
filepath: path.to_path_buf(),
})
});
self.fs.write().unwrap().delete(path)
self.fs.inner_directory.write().unwrap().delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.read().unwrap().exists(path)
self.fs.inner_directory.read().unwrap().exists(path)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
let mut fs = self.fs.write().unwrap();
let mut fs = self.fs.inner_directory.write().unwrap();
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.clone());
let exists = fs.write(path_buf.clone(), &[]);
@@ -205,19 +237,38 @@ impl Directory for RAMDirectory {
let path_buf = PathBuf::from(path);
// Reserve the path to prevent calls to .write() to succeed.
self.fs.write().unwrap().write(path_buf.clone(), &[]);
self.fs
.inner_directory
.write()
.unwrap()
.write(path_buf.clone(), &[]);
let mut vec_writer = VecWriter::new(path_buf, self.clone());
vec_writer.write_all(data)?;
vec_writer.flush()?;
if path == Path::new(&*META_FILEPATH) {
let _ = self.fs.write().unwrap().watch_router.broadcast();
let _ = self
.fs
.inner_directory
.write()
.unwrap()
.watch_router
.broadcast();
}
Ok(())
}
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
Ok(self.fs.write().unwrap().watch(watch_callback))
Ok(self
.fs
.inner_directory
.write()
.unwrap()
.watch(watch_callback))
}
/// Returns the logger stored in the shared `Inner` state (`self.fs`).
fn logger(&self) -> &Logger {
&self.fs.logger
}
}

View File

@@ -211,19 +211,19 @@ fn test_watch(directory: &mut dyn Directory) {
.unwrap();
for i in 0..10 {
assert_eq!(i, counter.load(SeqCst));
assert!(i <= counter.load(SeqCst));
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data_2")
.is_ok());
assert_eq!(receiver.recv_timeout(Duration::from_millis(500)), Ok(i));
assert_eq!(i + 1, counter.load(SeqCst));
assert!(i + 1 <= counter.load(SeqCst)); // notify can trigger more than once.
}
mem::drop(watch_handle);
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok());
assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok());
assert_eq!(10, counter.load(SeqCst));
assert!(10 <= counter.load(SeqCst));
}
fn test_lock_non_blocking(directory: &mut dyn Directory) {

View File

@@ -1,5 +1,6 @@
use futures::channel::oneshot;
use futures::{Future, TryFutureExt};
use slog::{error, Logger};
use std::sync::Arc;
use std::sync::RwLock;
use std::sync::Weak;
@@ -11,9 +12,9 @@ pub type WatchCallback = Box<dyn Fn() + Sync + Send>;
///
/// It registers callbacks (See `.subscribe(...)`) and
/// calls them upon calls to `.broadcast(...)`.
#[derive(Default)]
pub struct WatchCallbackList {
pub(crate) struct WatchCallbackList {
router: RwLock<Vec<Weak<WatchCallback>>>,
logger: Logger,
}
/// Controls how long a directory should watch for a file change.
@@ -32,6 +33,13 @@ impl WatchHandle {
}
impl WatchCallbackList {
pub fn with_logger(logger: Logger) -> Self {
WatchCallbackList {
logger,
router: Default::default(),
}
}
/// Subscribes a new callback and returns a handle that controls the lifetime of the callback.
pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
let watch_callback_arc = Arc::new(watch_callback);
@@ -74,8 +82,8 @@ impl WatchCallbackList {
});
if let Err(err) = spawn_res {
error!(
"Failed to spawn thread to call watch callbacks. Cause: {:?}",
err
self.logger,
"Failed to spawn thread to call watch callbacks. Cause: {:?}", err
);
}
result
@@ -86,13 +94,18 @@ impl WatchCallbackList {
mod tests {
use crate::directory::WatchCallbackList;
use futures::executor::block_on;
use slog::{o, Discard, Logger};
use std::mem;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
/// Test helper: a `WatchCallbackList` whose logger is built on the `Discard` drain.
fn default_watch_callback_list() -> WatchCallbackList {
    let discard_logger = Logger::root(Discard, o!());
    WatchCallbackList::with_logger(discard_logger)
}
#[test]
fn test_watch_event_router_simple() {
let watch_event_router = WatchCallbackList::default();
let watch_event_router = default_watch_callback_list();
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let inc_callback = Box::new(move || {
@@ -119,7 +132,7 @@ mod tests {
#[test]
fn test_watch_event_router_multiple_callback_same_key() {
let watch_event_router = WatchCallbackList::default();
let watch_event_router = default_watch_callback_list();
let counter: Arc<AtomicUsize> = Default::default();
let inc_callback = |inc: usize| {
let counter_clone = counter.clone();
@@ -148,7 +161,7 @@ mod tests {
#[test]
fn test_watch_event_router_multiple_callback_different_key() {
let watch_event_router = WatchCallbackList::default();
let watch_event_router = default_watch_callback_list();
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let inc_callback = Box::new(move || {

View File

@@ -2,11 +2,13 @@
use std::io;
use crate::directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use crate::directory::error::{Incompatibility, LockError};
use crate::fastfield::FastFieldNotAvailableError;
use crate::query;
use crate::schema;
use crate::{
directory::error::{OpenDirectoryError, OpenReadError, OpenWriteError},
schema,
};
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;
@@ -43,44 +45,47 @@ impl fmt::Debug for DataCorruption {
}
}
/// The library's failure based error enum
#[derive(Debug, Fail)]
/// The library's error enum
#[derive(Debug, Error)]
pub enum TantivyError {
/// Path does not exist.
#[fail(display = "Path does not exist: '{:?}'", _0)]
PathDoesNotExist(PathBuf),
/// File already exists, this is a problem when we try to write into a new file.
#[fail(display = "File already exists: '{:?}'", _0)]
FileAlreadyExists(PathBuf),
/// Failed to open the directory.
#[error("Failed to open the directory: '{0:?}'")]
OpenDirectoryError(#[from] OpenDirectoryError),
/// Failed to open a file for read.
#[error("Failed to open file for read: '{0:?}'")]
OpenReadError(#[from] OpenReadError),
/// Failed to open a file for write.
#[error("Failed to open file for write: '{0:?}'")]
OpenWriteError(#[from] OpenWriteError),
/// Index already exists in this directory
#[fail(display = "Index already exists")]
#[error("Index already exists")]
IndexAlreadyExists,
/// Failed to acquire file lock
#[fail(display = "Failed to acquire Lockfile: {:?}. {:?}", _0, _1)]
#[error("Failed to acquire Lockfile: {0:?}. {1:?}")]
LockFailure(LockError, Option<String>),
/// IO Error.
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
#[error("An IO error occurred: '{0}'")]
IOError(#[from] io::Error),
/// Data corruption.
#[fail(display = "{:?}", _0)]
#[error("Data corrupted: '{0:?}'")]
DataCorruption(DataCorruption),
/// A thread holding the lock panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
#[error("A thread holding the locked panicked and poisoned the lock")]
Poisoned,
/// Invalid argument was passed by the user.
#[fail(display = "An invalid argument was passed: '{}'", _0)]
#[error("An invalid argument was passed: '{0}'")]
InvalidArgument(String),
/// An error happened in one of the threads.
#[fail(display = "An error occurred in a thread: '{}'", _0)]
#[error("An error occurred in a thread: '{0}'")]
ErrorInThread(String),
/// An Error appeared related to the schema.
#[fail(display = "Schema error: '{}'", _0)]
#[error("Schema error: '{0}'")]
SchemaError(String),
/// System error. (e.g.: We failed spawning a new thread)
#[fail(display = "System error.'{}'", _0)]
#[error("System error.'{0}'")]
SystemError(String),
/// Index incompatible with current version of tantivy
#[fail(display = "{:?}", _0)]
#[error("{0:?}")]
IncompatibleIndex(Incompatibility),
}
@@ -89,31 +94,17 @@ impl From<DataCorruption> for TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}
impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::SchemaError(format!("{}", fastfield_error))
}
}
impl From<LockError> for TantivyError {
fn from(lock_error: LockError) -> TantivyError {
TantivyError::LockFailure(lock_error, None)
}
}
impl From<IOError> for TantivyError {
fn from(io_error: IOError) -> TantivyError {
TantivyError::IOError(io_error)
}
}
impl From<io::Error> for TantivyError {
fn from(io_error: io::Error) -> TantivyError {
TantivyError::IOError(io_error.into())
}
}
impl From<query::QueryParserError> for TantivyError {
fn from(parsing_error: query::QueryParserError) -> TantivyError {
TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
@@ -126,49 +117,12 @@ impl<Guard> From<PoisonError<Guard>> for TantivyError {
}
}
impl From<OpenReadError> for TantivyError {
fn from(error: OpenReadError) -> TantivyError {
match error {
OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath),
OpenReadError::IOError(io_error) => TantivyError::IOError(io_error),
OpenReadError::IncompatibleIndex(incompatibility) => {
TantivyError::IncompatibleIndex(incompatibility)
}
}
}
}
impl From<schema::DocParsingError> for TantivyError {
fn from(error: schema::DocParsingError) -> TantivyError {
TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error))
}
}
impl From<OpenWriteError> for TantivyError {
fn from(error: OpenWriteError) -> TantivyError {
match error {
OpenWriteError::FileAlreadyExists(filepath) => {
TantivyError::FileAlreadyExists(filepath)
}
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
}
}
}
impl From<OpenDirectoryError> for TantivyError {
fn from(error: OpenDirectoryError) -> TantivyError {
match error {
OpenDirectoryError::DoesNotExist(directory_path) => {
TantivyError::PathDoesNotExist(directory_path)
}
OpenDirectoryError::NotADirectory(directory_path) => {
TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
}
OpenDirectoryError::IoError(err) => TantivyError::IOError(IOError::from(err)),
}
}
}
impl From<serde_json::Error> for TantivyError {
fn from(error: serde_json::Error) -> TantivyError {
let io_err = io::Error::from(error);

View File

@@ -15,7 +15,7 @@ mod tests {
let field = schema_builder.add_bytes_field("bytesfield");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
index_writer.add_document(doc!(field=>vec![]));
index_writer.add_document(doc!(field=>vec![255u8]));

View File

@@ -4,8 +4,8 @@ use std::result;
/// `FastFieldNotAvailableError` is returned when the
/// user requests a fast field reader for a field that was not
/// defined in the schema as a fast field.
#[derive(Debug, Fail)]
#[fail(display = "Fast field not available: '{:?}'", field_name)]
#[derive(Debug, Error)]
#[error("Fast field not available: '{field_name:?}'")]
pub struct FastFieldNotAvailableError {
field_name: String,
}

View File

@@ -474,7 +474,7 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
index_writer.commit().unwrap();
@@ -511,7 +511,7 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(1i64.to_u64()),

View File

@@ -25,7 +25,7 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>1u64, field=>3u64));
index_writer.add_document(doc!());
index_writer.add_document(doc!(field=>4u64));
@@ -64,7 +64,7 @@ mod tests {
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let first_time_stamp = chrono::Utc::now();
index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
@@ -186,7 +186,7 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=> 1i64, field => 3i64));
index_writer.add_document(doc!());
index_writer.add_document(doc!(field=> -4i64));
@@ -221,7 +221,7 @@ mod tests {
let field = schema_builder.add_facet_field("facetfield");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..100_000 {
index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
}

View File

@@ -74,7 +74,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_with_num_threads(1, 30_000_000)
.writer_for_tests()
.expect("Failed to create index writer.");
index_writer.add_document(doc!(
facet_field => Facet::from("/category/cat2"),

View File

@@ -27,6 +27,7 @@ use crate::Opstamp;
use crossbeam::channel;
use futures::executor::block_on;
use futures::future::Future;
use slog::{error, info, Logger};
use smallvec::smallvec;
use smallvec::SmallVec;
use std::mem;
@@ -195,20 +196,21 @@ fn index_documents(
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
logger: &Logger,
) -> crate::Result<bool> {
let schema = segment.schema();
info!(logger, "segment-index"; "stage"=>"start");
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
let mut buffer_limit_reached = false;
for document_group in grouped_document_iterator {
for doc in document_group {
segment_writer.add_document(doc, &schema)?;
}
let mem_usage = segment_writer.mem_usage();
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
buffer_limit_reached = true;
break;
}
}
@@ -228,6 +230,14 @@ fn index_documents(
let segment_with_max_doc = segment.with_max_doc(max_doc);
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
info!(
logger,
"segment-index";
"stage" => "serialize",
"cause" => if buffer_limit_reached { "buffer-limit" } else { "commit" },
"maxdoc" => max_doc,
"last_docstamp" => last_docstamp
);
let delete_bitset_opt = apply_deletes(
&segment_with_max_doc,
@@ -241,7 +251,18 @@ fn index_documents(
delete_cursor,
delete_bitset_opt,
);
info!(
logger,
"segment-index";
"stage" => "publish",
);
block_on(segment_updater.schedule_add_segment(segment_entry))?;
info!(
logger,
"segment-index";
"stage" => "end",
);
Ok(true)
}
@@ -344,6 +365,10 @@ impl IndexWriter {
Ok(index_writer)
}
/// Returns the logger of the index this writer is attached to.
pub(crate) fn logger(&self) -> &Logger {
    let index = &self.index;
    index.logger()
}
fn drop_sender(&mut self) {
let (sender, _receiver) = channel::bounded(1);
self.operation_sender = sender;
@@ -352,6 +377,8 @@ impl IndexWriter {
/// If there are some merging threads, blocks until they all finish their work and
/// then drops the `IndexWriter`.
pub fn wait_merging_threads(mut self) -> crate::Result<()> {
info!(self.logger(), "wait-merge-threads"; "stage"=>"start");
// this will stop the indexing thread,
// dropping the last reference to the segment_updater.
self.drop_sender();
@@ -372,9 +399,9 @@ impl IndexWriter {
.map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
if let Err(ref e) = result {
error!("Some merging thread failed {:?}", e);
error!(self.logger(), "some merge thread failed"; "cause"=>e.to_string());
}
info!(self.logger(), "wait-merge-threads"; "stage"=>"stop");
result
}
@@ -434,12 +461,16 @@ impl IndexWriter {
return Ok(());
}
let segment = index.new_segment();
let segment_id = segment.id();
index_documents(
mem_budget,
segment,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone(),
&index
.logger()
.new(slog::o!("segment"=>segment_id.to_string())),
)?;
}
})?;
@@ -553,7 +584,10 @@ impl IndexWriter {
///
/// The opstamp at the last commit is returned.
pub fn rollback(&mut self) -> crate::Result<Opstamp> {
info!("Rolling back to opstamp {}", self.committed_opstamp);
info!(
self.logger(),
"Rolling back to opstamp {}", self.committed_opstamp
);
// marks the segment updater as killed. From now on, all
// segment updates will be ignored.
self.segment_updater.kill();
@@ -610,6 +644,8 @@ impl IndexWriter {
/// using this API.
/// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
let logger = self.logger().clone();
// Here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.
@@ -620,7 +656,10 @@ impl IndexWriter {
//
// This will move uncommitted segments to the state of
// committed segments.
info!("Preparing commit");
let commit_opstamp = self.stamper.stamp();
info!(logger, "prepare-commit"; "opstamp" => commit_opstamp);
// this will drop the current document channel
// and recreate a new one.
@@ -636,9 +675,8 @@ impl IndexWriter {
self.add_indexing_worker()?;
}
let commit_opstamp = self.stamper.stamp();
let prepared_commit = PreparedCommit::new(self, commit_opstamp);
info!("Prepared commit {}", commit_opstamp);
info!(logger, "Prepared commit {}", commit_opstamp);
Ok(prepared_commit)
}
@@ -800,7 +838,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let index_writer = index.writer_for_tests().unwrap();
let operations = vec![
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
@@ -815,7 +853,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "hello1"));
index_writer.add_document(doc!(text_field => "hello2"));
assert!(index_writer.commit().is_ok());
@@ -864,7 +902,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
@@ -926,8 +964,8 @@ mod tests {
fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
match index.writer_with_num_threads(1, 3_000_000) {
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`"));
@@ -1261,7 +1299,7 @@ mod tests {
let idfield = schema_builder.add_text_field("id", STRING);
schema_builder.add_text_field("optfield", STRING);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(idfield=>"myid"));
let commit = index_writer.commit();
assert!(commit.is_ok());

View File

@@ -25,14 +25,14 @@ use std::cmp;
use std::collections::HashMap;
use std::sync::Arc;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256];
for reader in readers {
if reader.has_deletes() {
// if there are deletes, then we use an approximation
// using the fieldnorm
let fieldnorms_reader = reader.get_fieldnorms_reader(field);
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1;
@@ -41,7 +41,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
total_tokens
Ok(total_tokens
+ count
.iter()
.cloned()
@@ -49,7 +49,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
})
.sum::<u64>()
.sum::<u64>())
}
pub struct IndexMerger {
@@ -175,7 +175,7 @@ impl IndexMerger {
for field in fields {
fieldnorms_data.clear();
for reader in &self.readers {
let fieldnorms_reader = reader.get_fieldnorms_reader(field);
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc_id in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id);
fieldnorms_data.push(fieldnorm_id);
@@ -541,7 +541,7 @@ impl IndexMerger {
// The total number of tokens will only be exact when there has been no deletes.
//
// Otherwise, we approximate by removing deleted documents proportionally.
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field);
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
// Create the total list of doc ids
// by stacking the doc ids from the different segment.
@@ -751,7 +751,7 @@ mod tests {
};
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
// writing the segment
{
@@ -803,7 +803,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
}
@@ -904,7 +904,7 @@ mod tests {
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let reader = index.reader().unwrap();
let search_term = |searcher: &Searcher, term: Term| {
let collector = FastFieldTestCollector::for_field(score_field);
@@ -1211,7 +1211,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
@@ -1276,7 +1276,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
@@ -1295,7 +1295,7 @@ mod tests {
// Deleting one term
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
@@ -1320,7 +1320,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
@@ -1349,7 +1349,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut doc = Document::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone());
@@ -1388,7 +1388,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default();
for &val in int_vals {
@@ -1462,7 +1462,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -1516,7 +1516,7 @@ mod tests {
let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
let mut writer = index.writer_for_tests()?;
// Make sure we'll attempt to merge every created segment
let mut policy = crate::indexer::LogMergePolicy::default();
@@ -1548,7 +1548,7 @@ mod tests {
let mut builder = schema::SchemaBuilder::new();
let text = builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
let mut writer = index.writer_for_tests()?;
let happy_term = Term::from_field_text(text, "happy");
let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs);
for _ in 0..62 {

View File

@@ -29,8 +29,9 @@ pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;
#[cfg(feature = "mmap")]
#[cfg(test)]
mod tests {
mod tests_mmap {
use crate::schema::{self, Schema};
use crate::{Index, Term};
@@ -39,7 +40,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b"));

View File

@@ -1,6 +1,7 @@
use super::IndexWriter;
use crate::Opstamp;
use futures::executor::block_on;
use slog::info;
/// A prepared commit
pub struct PreparedCommit<'a> {
@@ -31,7 +32,7 @@ impl<'a> PreparedCommit<'a> {
}
pub fn commit(self) -> crate::Result<Opstamp> {
info!("committing {}", self.opstamp);
info!(self.index_writer.logger(), "committing {}", self.opstamp);
let _ = block_on(
self.index_writer
.segment_updater()

View File

@@ -1,3 +1,5 @@
use slog::{warn, Logger};
use super::segment_register::SegmentRegister;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
@@ -42,9 +44,9 @@ impl SegmentRegisters {
///
/// It guarantees the atomicity of the
/// changes (merges especially)
#[derive(Default)]
pub struct SegmentManager {
registers: RwLock<SegmentRegisters>,
logger: Logger,
}
impl Debug for SegmentManager {
@@ -77,12 +79,14 @@ impl SegmentManager {
/// Builds a `SegmentManager` whose committed register is created from
/// `segment_metas` and `delete_cursor`, whose uncommitted register starts
/// out as the default register, and which keeps `logger` for its own
/// log output.
pub fn from_segments(
    segment_metas: Vec<SegmentMeta>,
    delete_cursor: &DeleteCursor,
    logger: Logger,
) -> SegmentManager {
    // Same construction order as the struct literal: uncommitted first, then committed.
    let uncommitted = SegmentRegister::default();
    let committed = SegmentRegister::new(segment_metas, delete_cursor);
    let registers = RwLock::new(SegmentRegisters {
        uncommitted,
        committed,
    });
    SegmentManager { registers, logger }
}
@@ -186,7 +190,7 @@ impl SegmentManager {
let segments_status = registers_lock
.segments_status(before_merge_segment_ids)
.ok_or_else(|| {
warn!("couldn't find segment in SegmentManager");
warn!(self.logger, "couldn't find segment in SegmentManager");
crate::TantivyError::InvalidArgument(
"The segments that were merged could not be found in the SegmentManager. \
This is not necessarily a bug, and can happen after a rollback for instance."

View File

@@ -23,9 +23,9 @@ use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::Future;
use futures::future::TryFutureExt;
use slog::{debug, error, info, warn};
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::io::Write;
use std::ops::Deref;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
@@ -65,12 +65,11 @@ pub fn save_new_metas(schema: Schema, directory: &mut dyn Directory) -> crate::R
///
/// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &mut dyn Directory) -> crate::Result<()> {
info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?;
let mut meta_json = serde_json::to_string_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
meta_json.push_str("\n");
debug!(directory.logger(), "save meta"; "content"=>&meta_json);
directory.atomic_write(&META_FILEPATH, meta_json.as_bytes())?;
Ok(())
}
@@ -97,7 +96,6 @@ impl Deref for SegmentUpdater {
async fn garbage_collect_files(
segment_updater: SegmentUpdater,
) -> crate::Result<GarbageCollectionResult> {
info!("Running garbage collection");
let mut index = segment_updater.index.clone();
index
.directory_mut()
@@ -107,14 +105,12 @@ async fn garbage_collect_files(
/// Merges the list of segments given in `segment_entries`.
/// This function happens in the calling thread and is computationally expensive.
fn merge(
merged_segment: Segment,
index: &Index,
mut segment_entries: Vec<SegmentEntry>,
target_opstamp: Opstamp,
) -> crate::Result<SegmentEntry> {
// first we need to apply deletes to our segment.
let merged_segment = index.new_segment();
// First we apply all of the delet to the merged segment, up to the target opstamp.
// First we apply all of the delete to the merged segment, up to the target opstamp.
for segment_entry in &mut segment_entries {
let segment = index.segment(segment_entry.meta().clone());
advance_deletes(segment, segment_entry, target_opstamp)?;
@@ -167,7 +163,8 @@ impl SegmentUpdater {
delete_cursor: &DeleteCursor,
) -> crate::Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
let segment_manager =
SegmentManager::from_segments(segments, delete_cursor, index.logger().clone());
let pool = ThreadPoolBuilder::new()
.name_prefix("segment_updater")
.pool_size(1)
@@ -387,7 +384,18 @@ impl SegmentUpdater {
.segment_manager
.start_merge(merge_operation.segment_ids())?;
info!("Starting merge - {:?}", merge_operation.segment_ids());
let segment_ids_str: String = merge_operation
.segment_ids()
.iter()
.map(|segment_id| segment_id.to_string())
.collect::<Vec<String>>()
.join(",");
let merged_segment = self.index.new_segment();
let logger = self.index.logger().new(slog::o!("segments"=>segment_ids_str, "merged-segment"=>merged_segment.id().to_string()));
let num_merges: usize = self.merge_operations.list().len();
slog::info!(&logger, "merge"; "stage"=>"start", "num-merges" => num_merges);
let (merging_future_send, merging_future_recv) =
oneshot::channel::<crate::Result<SegmentMeta>>();
@@ -398,22 +406,20 @@ impl SegmentUpdater {
// as well as which segment is currently in merge and therefore should not be
// candidate for another merge.
match merge(
merged_segment,
&segment_updater.index,
segment_entries,
merge_operation.target_opstamp(),
) {
Ok(after_merge_segment_entry) => {
info!(&logger, "merge"; "stage" => "end");
let segment_meta = segment_updater
.end_merge(merge_operation, after_merge_segment_entry)
.await;
let _send_result = merging_future_send.send(segment_meta);
}
Err(e) => {
warn!(
"Merge of {:?} was cancelled: {:?}",
merge_operation.segment_ids().to_vec(),
e
);
error!(&logger, "merge"; "stage" => "fail", "cause"=>e.to_string());
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
@@ -454,11 +460,12 @@ impl SegmentUpdater {
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
let logger = self.index.logger();
for merge_operation in merge_candidates {
if let Err(err) = self.start_merge(merge_operation) {
warn!(
"Starting the merge failed for the following reason. This is not fatal. {}",
err
logger,
"merge-start-fail (not fatal, not necessarily a problem)"; "reason" => format!("{}", err),
);
}
}
@@ -471,8 +478,11 @@ impl SegmentUpdater {
) -> impl Future<Output = crate::Result<SegmentMeta>> {
let segment_updater = self.clone();
let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
let logger = self.index.logger().new(
slog::o!("segment"=>after_merge_segment_meta.id().to_string(),
"delete-opstamp"=>after_merge_segment_meta.delete_opstamp()),
);
let end_merge_future = self.schedule_future(async move {
info!("End merge {:?}", after_merge_segment_entry.meta());
{
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
@@ -486,6 +496,7 @@ impl SegmentUpdater {
committed_opstamp,
) {
error!(
logger,
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
@@ -555,7 +566,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{
@@ -608,7 +619,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
for _ in 0..100 {
@@ -679,7 +690,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
for _ in 0..100 {

View File

@@ -1,5 +1,4 @@
use super::operation::AddOperation;
use crate::core::Segment;
use crate::core::SerializableSegment;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
@@ -15,9 +14,9 @@ use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp;
use crate::{core::Segment, tokenizer::MAX_TOKEN_LEN};
use crate::{DocId, SegmentComponent};
use std::io;
use std::str;
/// Computes the initial size of the hash table.
///
@@ -48,6 +47,7 @@ pub struct SegmentWriter {
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<TextAnalyzer>>,
term_buffer: Term,
}
impl SegmentWriter {
@@ -91,6 +91,7 @@ impl SegmentWriter {
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
tokenizers,
term_buffer: Term::new(),
})
}
@@ -128,24 +129,29 @@ impl SegmentWriter {
if !field_options.is_indexed() {
continue;
}
let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings);
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
let mut term = Term::for_field(field); // we set the Term
term_buffer.set_field(field);
let facets =
field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
});
for fake_str in facets {
let mut unordered_term_id_opt = None;
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
if token.text.len() > MAX_TOKEN_LEN {
return;
}
term_buffer.set_text(&token.text);
let unordered_term_id =
self.multifield_postings.subscribe(doc_id, &term);
multifield_postings.subscribe(doc_id, &term_buffer);
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
@@ -168,7 +174,6 @@ impl SegmentWriter {
if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to;
}
token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into());
}
@@ -178,7 +183,6 @@ impl SegmentWriter {
{
offsets.push(total_offset);
total_offset += text.len();
token_streams.push(tokenizer.token_stream(text));
}
}
@@ -190,8 +194,12 @@ impl SegmentWriter {
0
} else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
multifield_postings.index_text(
doc_id,
field,
&mut token_stream,
term_buffer,
)
};
self.fieldnorms_writer.record(doc_id, field, num_tokens);
@@ -199,44 +207,36 @@ impl SegmentWriter {
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u64(
field_value.field(),
field_value.value().u64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_u64(field_value.value().u64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().date_value().timestamp(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_i64(field_value.value().date_value().timestamp());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().i64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_i64(field_value.value().i64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::F64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_f64(
field_value.field(),
field_value.value().f64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_f64(field_value.value().f64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}

View File

@@ -102,10 +102,7 @@
extern crate serde_json;
#[macro_use]
extern crate log;
#[macro_use]
extern crate failure;
extern crate thiserror;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
@@ -148,6 +145,7 @@ pub mod schema;
pub mod space_usage;
pub mod store;
pub mod termdict;
pub use slog;
mod reader;
@@ -245,18 +243,10 @@ pub type DocId = u32;
/// with opstamp `n+1`.
pub type Opstamp = u64;
/// A Score that represents the relevance of the document to the query
///
/// This is modelled internally as a `f64`, because tantivy was compiled with the `scoref64`
/// feature. The larger the number, the more relevant the document is to the search query.
#[cfg(feature = "scoref64")]
pub type Score = f64;
/// A Score that represents the relevance of the document to the query
///
/// This is modelled internally as a `f32`. The larger the number, the more relevant
/// the document is to the search query.
#[cfg(not(feature = "scoref64"))]
pub type Score = f32;
/// A `SegmentLocalId` identifies a segment.
@@ -296,7 +286,6 @@ mod tests {
use crate::schema::*;
use crate::DocAddress;
use crate::Index;
use crate::IndexWriter;
use crate::Postings;
use crate::ReloadPolicy;
use rand::distributions::Bernoulli;
@@ -361,14 +350,14 @@ mod tests {
#[test]
#[cfg(feature = "mmap")]
fn test_indexing() {
fn test_indexing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
@@ -383,29 +372,30 @@ mod tests {
}
assert!(index_writer.commit().is_ok());
}
Ok(())
}
#[test]
fn test_docfreq1() {
fn test_docfreq1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
{
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3);
@@ -416,67 +406,50 @@ mod tests {
let term_d = Term::from_field_text(text_field, "d");
assert_eq!(searcher.doc_freq(&term_d), 0);
}
Ok(())
}
#[test]
fn test_fieldnorm_no_docs_with_field() {
fn test_fieldnorm_no_docs_with_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?;
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
}
{
let index_reader = index.reader().unwrap();
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
{
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
}
{
let fieldnorm_reader = reader.get_fieldnorms_reader(title_field);
assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
}
let fieldnorm_reader = reader.get_fieldnorms_reader(title_field)?;
assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
}
Ok(())
}
#[test]
fn test_fieldnorm() {
fn test_fieldnorm() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
}
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!());
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
Ok(())
}
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
@@ -491,7 +464,7 @@ mod tests {
}
#[test]
fn test_delete_postings1() {
fn test_delete_postings1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let term_abcd = Term::from_field_text(text_field, "abcd");
@@ -507,7 +480,7 @@ mod tests {
.unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
@@ -523,10 +496,10 @@ mod tests {
index_writer.add_document(doc!(text_field=>" b c"));
// 5
index_writer.add_document(doc!(text_field=>" a"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
reader.reload().unwrap();
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(text_field);
@@ -554,15 +527,15 @@ mod tests {
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback().unwrap();
index_writer.rollback()?;
}
{
reader.reload().unwrap();
reader.reload()?;
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inverted_index = seg_reader.inverted_index(term_abcd.field());
@@ -591,15 +564,15 @@ mod tests {
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback().unwrap();
index_writer.rollback()?;
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
reader.reload().unwrap();
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(term_abcd.field());
@@ -631,19 +604,20 @@ mod tests {
assert!(!advance_undeleted(&mut postings, segment_reader));
}
}
Ok(())
}
#[test]
fn test_indexed_u64() {
fn test_indexed_u64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
@@ -653,20 +627,21 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
Ok(())
}
#[test]
fn test_indexed_i64() {
fn test_indexed_i64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_i64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
@@ -676,20 +651,21 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
Ok(())
}
#[test]
fn test_indexed_f64() {
fn test_indexed_f64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_f64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_f64(value_field, val);
let mut postings = searcher
@@ -699,26 +675,29 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
Ok(())
}
#[test]
fn test_indexedfield_not_in_documents() {
fn test_indexedfield_not_in_documents() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
segment_reader.inverted_index(absent_field); //< should not panic
let inverted_index = segment_reader.inverted_index(absent_field); //< should not panic
assert_eq!(inverted_index.terms().num_terms(), 0);
Ok(())
}
#[test]
fn test_delete_postings2() {
fn test_delete_postings2() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -726,53 +705,40 @@ mod tests {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;
// writing the segment
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
let doc = doc!(text_field=>val);
index_writer.add_document(doc);
};
let remove_document = |index_writer: &mut IndexWriter, val: &'static str| {
let delterm = Term::from_field_text(text_field, val);
index_writer.delete_term(delterm);
};
add_document(&mut index_writer, "63");
add_document(&mut index_writer, "70");
add_document(&mut index_writer, "34");
add_document(&mut index_writer, "1");
add_document(&mut index_writer, "38");
add_document(&mut index_writer, "33");
add_document(&mut index_writer, "40");
add_document(&mut index_writer, "17");
remove_document(&mut index_writer, "38");
remove_document(&mut index_writer, "34");
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 6);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"));
index_writer.add_document(doc!(text_field=>"70"));
index_writer.add_document(doc!(text_field=>"34"));
index_writer.add_document(doc!(text_field=>"1"));
index_writer.add_document(doc!(text_field=>"38"));
index_writer.add_document(doc!(text_field=>"33"));
index_writer.add_document(doc!(text_field=>"40"));
index_writer.add_document(doc!(text_field=>"17"));
index_writer.delete_term(Term::from_field_text(text_field, "38"));
index_writer.delete_term(Term::from_field_text(text_field, "34"));
index_writer.commit()?;
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 6);
Ok(())
}
#[test]
fn test_termfreq() {
fn test_termfreq() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let doc = doc!(text_field=>"af af af bc bc");
index_writer.add_document(doc);
index_writer.commit().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"));
index_writer.commit()?;
}
{
let index_reader = index.reader().unwrap();
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
@@ -788,63 +754,63 @@ mod tests {
assert_eq!(postings.term_freq(), 3);
assert_eq!(postings.advance(), TERMINATED);
}
Ok(())
}
#[test]
fn test_searcher_1() {
fn test_searcher_1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index.reader().unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
let topdocs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap();
topdocs.docs().to_vec()
};
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![DocAddress(0, 0)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
]),
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
}
let reader = index.reader()?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?;
reader.reload()?;
let searcher = reader.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
searcher
.search(&query, &TEST_COLLECTOR_WITH_SCORE)
.map(|topdocs| topdocs.docs().to_vec())
};
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")])?,
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")])?,
vec![DocAddress(0, 0)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")])?,
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "c")])?,
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "d")])?,
vec![DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
])?,
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
Ok(())
}
#[test]
fn test_searcher_2() {
fn test_searcher_2() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -852,19 +818,17 @@ mod tests {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0u64);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit().unwrap();
}
reader.reload().unwrap();
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?;
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 3u64);
Ok(())
}
#[test]
@@ -886,7 +850,7 @@ mod tests {
}
#[test]
fn test_wrong_fast_field_type() {
fn test_wrong_fast_field_type() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
@@ -896,14 +860,14 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
{
let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document);
index_writer.commit().unwrap();
index_writer.commit()?;
}
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
@@ -942,11 +906,12 @@ mod tests {
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4f64)
}
Ok(())
}
// motivated by #729
#[test]
fn test_update_via_delete_insert() {
fn test_update_via_delete_insert() -> crate::Result<()> {
use crate::collector::Count;
use crate::indexer::NoMergePolicy;
use crate::query::AllQuery;
@@ -960,17 +925,17 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index_reader = index.reader().unwrap();
let index_reader = index.reader()?;
let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT {
index_writer.add_document(doc!(id => doc_id));
}
index_writer.commit().unwrap();
index_writer.commit()?;
index_reader.reload().unwrap();
index_reader.reload()?;
let searcher = index_reader.searcher();
assert_eq!(
@@ -981,12 +946,11 @@ mod tests {
// update the 10 elements by deleting and re-adding
for doc_id in 0u64..DOC_COUNT {
index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit().unwrap();
index_reader.reload().unwrap();
let doc = doc!(id => doc_id);
index_writer.add_document(doc);
index_writer.commit().unwrap();
index_reader.reload().unwrap();
index_writer.commit()?;
index_reader.reload()?;
index_writer.add_document(doc!(id => doc_id));
index_writer.commit()?;
index_reader.reload()?;
let searcher = index_reader.searcher();
// The number of documents should be stable.
assert_eq!(
@@ -995,7 +959,7 @@ mod tests {
);
}
index_reader.reload().unwrap();
index_reader.reload()?;
let searcher = index_reader.searcher();
let segment_ids: Vec<SegmentId> = searcher
.segment_readers()
@@ -1004,12 +968,18 @@ mod tests {
.collect();
block_on(index_writer.merge(&segment_ids)).unwrap();
index_reader.reload().unwrap();
index_reader.reload()?;
let searcher = index_reader.searcher();
assert_eq!(searcher.search(&AllQuery, &Count)?, DOC_COUNT as usize);
Ok(())
}
assert_eq!(
searcher.search(&AllQuery, &Count).unwrap(),
DOC_COUNT as usize
);
#[test]
fn test_validate_checksum() -> crate::Result<()> {
    // Build an index over an empty schema inside a fresh temporary directory,
    // then check that `validate_checksum` returns an empty result for it.
    let tmp_dir = tempfile::tempdir().expect("dir");
    let index = Index::create_in_dir(&tmp_dir, Schema::builder().build())?;
    let reported = index.validate_checksum()?;
    assert!(reported.is_empty());
    Ok(())
}
}

View File

@@ -455,7 +455,7 @@ mod tests {
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for _ in last_doc..doc {
@@ -496,7 +496,7 @@ mod tests {
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// create two postings lists, one containing even numbers,
// the other containing odd numbers.
for i in 0..6 {

View File

@@ -310,6 +310,7 @@ pub mod tests {
mod bench {
use super::*;
use crate::TERMINATED;
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
@@ -340,7 +341,7 @@ mod bench {
let mut encoder = BlockEncoder::new();
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
let mut decoder = BlockDecoder::default();
b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
});
@@ -375,9 +376,9 @@ mod bench {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
let mut decoder = BlockDecoder::default();
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT, TERMINATED);
});
}
}

View File

@@ -91,7 +91,7 @@ pub mod tests {
let title = schema_builder.add_text_field("title", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(title => r#"abc abc abc"#));
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
for _ in 0..1_000 {
@@ -176,7 +176,7 @@ pub mod tests {
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
{
index_writer.add_document(doc!(text_field=>exceeding_token_text));
@@ -205,7 +205,7 @@ pub mod tests {
}
#[test]
pub fn test_position_and_fieldnorm1() {
pub fn test_position_and_fieldnorm1() -> crate::Result<()> {
let mut positions = Vec::new();
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
@@ -217,42 +217,38 @@ pub mod tests {
let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values
doc.add_text(text_field, "a b a c a d a a.");
doc.add_text(text_field, "d d d d a");
let op = AddOperation {
opstamp: 0u64,
document: doc,
document: doc!(
text_field => "a b a c a d a a.",
text_field => "d d d d a"
),
};
segment_writer.add_document(op, &schema).unwrap();
segment_writer.add_document(op, &schema)?;
}
{
let mut doc = Document::default();
doc.add_text(text_field, "b a");
let op = AddOperation {
opstamp: 1u64,
document: doc,
document: doc!(text_field => "b a"),
};
segment_writer.add_document(op, &schema).unwrap();
}
for i in 2..1000 {
let mut doc = Document::default();
let mut text = iter::repeat("e ").take(i).collect::<String>();
let mut text: String = iter::repeat("e ").take(i).collect();
text.push_str(" a");
doc.add_text(text_field, &text);
let op = AddOperation {
opstamp: 2u64,
document: doc,
document: doc!(text_field => text),
};
segment_writer.add_document(op, &schema).unwrap();
}
segment_writer.finalize().unwrap();
segment_writer.finalize()?;
}
{
let segment_reader = SegmentReader::open(&segment).unwrap();
let segment_reader = SegmentReader::open(&segment)?;
{
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field);
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
for i in 2..1000 {
@@ -312,6 +308,7 @@ pub mod tests {
assert_eq!(postings_e.doc(), TERMINATED);
}
}
Ok(())
}
#[test]
@@ -322,7 +319,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "g b b d c g c"));
index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
assert!(index_writer.commit().is_ok());
@@ -354,7 +351,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc);
@@ -425,7 +422,7 @@ pub mod tests {
// delete some of the documents
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
@@ -479,7 +476,7 @@ pub mod tests {
// delete everything else
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
@@ -522,7 +519,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size {
let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) {
@@ -730,7 +727,7 @@ mod bench {
let mut s = 0u32;
while segment_postings.doc() != TERMINATED {
s += (segment_postings.doc() & n) % 1024;
segment_postings.advance()
segment_postings.advance();
}
s
});

View File

@@ -105,6 +105,7 @@ impl MultiFieldPostingsWriter {
doc: DocId,
field: Field,
token_stream: &mut dyn TokenStream,
term_buffer: &mut Term,
) -> u32 {
let postings_writer =
self.per_field_postings_writers[field.field_id() as usize].deref_mut();
@@ -114,6 +115,7 @@ impl MultiFieldPostingsWriter {
field,
token_stream,
&mut self.heap,
term_buffer,
)
}
@@ -220,21 +222,22 @@ pub trait PostingsWriter {
field: Field,
token_stream: &mut dyn TokenStream,
heap: &mut MemoryArena,
term_buffer: &mut Term,
) -> u32 {
let mut term = Term::for_field(field);
term_buffer.set_field(field);
let mut sink = |token: &Token| {
// We skip all tokens whose length exceeds MAX_TOKEN_LEN.
if token.text.len() <= MAX_TOKEN_LEN {
term.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
} else {
info!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
MAX_TOKEN_LEN in the documentation for more information.",
token.text.len(),
MAX_TOKEN_LEN
);
if token.text.len() > MAX_TOKEN_LEN {
return;
}
term_buffer.set_text(token.text.as_str());
self.subscribe(
term_index,
doc_id,
token.position as u32,
&term_buffer,
heap,
);
};
token_stream.process(&mut sink)
}

View File

@@ -114,7 +114,7 @@ impl SegmentPostings {
.iter()
.map(|&fieldnorm| fieldnorm as u64)
.sum::<u64>();
total_num_tokens as Score / fieldnorms.len() as f32
total_num_tokens as Score / fieldnorms.len() as Score
})
.unwrap_or(0.0);
let mut postings_serializer = PostingsSerializer::new(

View File

@@ -83,7 +83,7 @@ mod tests {
let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit().unwrap();

View File

@@ -5,7 +5,6 @@ use crate::query::{BitSetDocSet, Explanation};
use crate::query::{Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::Result;
use crate::TantivyError;
use crate::{DocId, Score};
use std::sync::Arc;
@@ -40,7 +39,7 @@ impl<A> Weight for AutomatonWeight<A>
where
A: Automaton + Send + Sync + 'static,
{
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field);
@@ -66,7 +65,7 @@ where
Ok(Box::new(const_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) == doc {
Ok(Explanation::new("AutomatonScorer", 1.0))
@@ -91,7 +90,7 @@ mod tests {
let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title=>"abc"));
index_writer.add_document(doc!(title=>"bcd"));
index_writer.add_document(doc!(title=>"abcd"));

View File

@@ -4,19 +4,6 @@ use crate::{DocId, DocSet, Score, TERMINATED};
use std::ops::Deref;
use std::ops::DerefMut;
/// Returns `true` iff the doc ids yielded by `it` are in non-decreasing order.
///
/// An empty iterator is reported as sorted, and equal consecutive doc ids
/// are accepted.
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
// A doc id smaller than its predecessor breaks the ordering.
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
/// Takes a slice of term scorers sorted by their current `doc()` and a threshold, and
/// returns `(before_pivot_len, pivot_len, pivot_doc)` defined as follows:
/// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score.
@@ -55,37 +42,12 @@ fn find_pivot_doc(
Some((before_pivot_len, pivot_len, pivot_doc))
}
/// Pairs a mutable borrow of a `TermScorer` with the value its `max_score()`
/// returned when the wrapper was built.
struct TermScorerWithMaxScore<'a> {
// The wrapped scorer; all `TermScorer` methods are reachable through `Deref`.
scorer: &'a mut TermScorer,
// Snapshot of `scorer.max_score()` taken in `From::from`.
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
/// Wraps `scorer`, calling `max_score()` once and storing the result.
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
/// Gives shared access to the wrapped `TermScorer`.
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
/// Gives mutable access to the wrapped `TermScorer`.
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
// Before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer(
scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_len: usize,
) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
let mut scorer_to_seek = pivot_len - 1;
let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
for scorer_ord in (0..pivot_len - 1).rev() {
@@ -102,6 +64,7 @@ fn block_max_was_too_low_advance_one_scorer(
}
scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
restore_ordering(scorers, scorer_to_seek);
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
}
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
@@ -177,64 +140,99 @@ pub fn block_wand(
.map(TermScorerWithMaxScore::from)
.collect();
scorers.sort_by_key(|scorer| scorer.doc());
loop {
// At this point we need to ensure that the scorers are sorted!
// At this point we need to ensure that the scorers are sorted!
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
find_pivot_doc(&scorers[..], threshold)
{
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
if let Some((before_pivot_len, pivot_len, pivot_doc)) =
find_pivot_doc(&scorers[..], threshold)
{
debug_assert_ne!(pivot_doc, TERMINATED);
debug_assert!(before_pivot_len < pivot_len);
debug_assert_ne!(pivot_doc, TERMINATED);
debug_assert!(before_pivot_len < pivot_len);
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.shallow_seek(pivot_doc);
scorer.block_max_score()
})
.sum();
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.shallow_seek(pivot_doc);
scorer.block_max_score()
})
.sum();
// Beware after shallow advance, skip readers can be in advance compared to
// the segment posting lists.
//
// `block_segment_postings.load_block()` needs to be called separately.
if block_max_score_upperbound <= threshold {
// Block max condition was not reached
// We could get away by simply advancing the scorers to DocId + 1 but it would
// be inefficient. The optimization requires proper explanation and was
// isolated in a different function.
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
continue;
}
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
} else {
return;
// Beware after shallow advance, skip readers can be in advance compared to
// the segment posting lists.
//
// `block_segment_postings.load_block()` needs to be called separately.
if block_max_score_upperbound <= threshold {
// Block max condition was not reached
// We could get away by simply advancing the scorers to DocId + 1 but it would
// be inefficient. The optimization requires proper explanation and was
// isolated in a different function.
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
continue;
}
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
}
}
/// A mutably borrowed `TermScorer` bundled with the result of its
/// `max_score()` call, computed once when the wrapper is created.
struct TermScorerWithMaxScore<'a> {
    /// The wrapped scorer; reachable transparently through `Deref`/`DerefMut`.
    scorer: &'a mut TermScorer,
    /// Value returned by `scorer.max_score()` at construction time.
    max_score: Score,
}

impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
    /// Wraps `scorer` and records its `max_score()`.
    fn from(scorer: &'a mut TermScorer) -> Self {
        // `max_score` is evaluated first, while `scorer` is still usable;
        // the borrow is moved into the struct afterwards.
        Self {
            max_score: scorer.max_score(),
            scorer,
        }
    }
}

impl<'a> Deref for TermScorerWithMaxScore<'a> {
    type Target = TermScorer;

    /// Shared access to the wrapped scorer.
    fn deref(&self) -> &Self::Target {
        &*self.scorer
    }
}

impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
    /// Mutable access to the wrapped scorer.
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut *self.scorer
    }
}
/// Checks that the doc ids produced by `it` never decrease.
///
/// Returns `true` for an empty iterator and when consecutive doc ids are
/// equal; returns `false` as soon as a doc id is smaller than the one
/// before it.
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
    let mut last_seen = match it.next() {
        Some(first_doc) => first_doc,
        // Nothing to compare: an empty sequence is sorted.
        None => return true,
    };
    // `all` stops at the first out-of-order doc id.
    it.all(|current| {
        let in_order = last_seen <= current;
        last_seen = current;
        in_order
    })
}
#[cfg(test)]
mod tests {
use crate::query::score_combiner::SumCombiner;
@@ -248,17 +246,21 @@ mod tests {
use std::iter;
struct Float(Score);
impl Eq for Float {}
impl PartialEq for Float {
fn eq(&self, other: &Self) -> bool {
self.cmp(&other) == Ordering::Equal
}
}
impl PartialOrd for Float {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Float {
fn cmp(&self, other: &Self) -> Ordering {
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)

View File

@@ -32,7 +32,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"));
@@ -224,7 +224,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c"));

View File

@@ -144,7 +144,7 @@ mod tests {
fn test_boost_query_explain() {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::new());
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();

View File

@@ -177,7 +177,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
country_field => "japan",
));

View File

@@ -24,7 +24,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc);
@@ -135,7 +135,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b c"));
assert!(index_writer.commit().is_ok());
}
@@ -186,7 +186,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"b a"));
@@ -217,7 +217,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
assert!(index_writer.commit().is_ok());
}

View File

@@ -9,8 +9,8 @@ use crate::query::Weight;
use crate::query::{EmptyScorer, Explanation};
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Score;
use crate::{DocId, DocSet};
use crate::{Result, Score};
pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
@@ -32,7 +32,7 @@ impl PhraseWeight {
}
}
fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader {
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
let field = self.phrase_terms[0].1.field();
reader.get_fieldnorms_reader(field)
}
@@ -41,9 +41,9 @@ impl PhraseWeight {
&self,
reader: &SegmentReader,
boost: Score,
) -> Result<Option<PhraseScorer<SegmentPostings>>> {
) -> crate::Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight = self.similarity_weight.boost_by(boost);
let fieldnorm_reader = self.fieldnorm_reader(reader);
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
if reader.has_deletes() {
let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
@@ -85,7 +85,7 @@ impl PhraseWeight {
}
impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer))
} else {
@@ -93,7 +93,7 @@ impl Weight for PhraseWeight {
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));
@@ -102,7 +102,7 @@ impl Weight for PhraseWeight {
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
let fieldnorm_reader = self.fieldnorm_reader(reader);
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());

View File

@@ -21,51 +21,48 @@ use std::str::FromStr;
use tantivy_query_grammar::{UserInputAST, UserInputBound, UserInputLeaf};
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq, Fail)]
#[derive(Debug, PartialEq, Eq, Error)]
pub enum QueryParserError {
/// Error in the query syntax
#[fail(display = "Syntax Error")]
#[error("Syntax Error")]
SyntaxError,
/// `FieldDoesNotExist(field_name: String)`
/// The query references a field that is not in the schema
#[fail(display = "File does not exists: '{:?}'", _0)]
#[error("File does not exists: '{0:?}'")]
FieldDoesNotExist(String),
/// The query contains a term for a `u64` or `i64`-field, but the value
/// is neither.
#[fail(display = "Expected a valid integer: '{:?}'", _0)]
#[error("Expected a valid integer: '{0:?}'")]
ExpectedInt(ParseIntError),
/// The query contains a term for a `f64`-field, but the value
/// is not a f64.
#[fail(display = "Invalid query: Only excluding terms given")]
#[error("Invalid query: Only excluding terms given")]
ExpectedFloat(ParseFloatError),
/// It is forbidden queries that are only "excluding". (e.g. -title:pop)
#[fail(display = "Invalid query: Only excluding terms given")]
#[error("Invalid query: Only excluding terms given")]
AllButQueryForbidden,
/// If no default field is declared, running a query without any
/// field specified is forbbidden.
#[fail(display = "No default field declared and no field specified in query")]
#[error("No default field declared and no field specified in query")]
NoDefaultFieldDeclared,
/// The field searched for is not declared
/// as indexed in the schema.
#[fail(display = "The field '{:?}' is not declared as indexed", _0)]
#[error("The field '{0:?}' is not declared as indexed")]
FieldNotIndexed(String),
/// A phrase query was requested for a field that does not
/// have any positions indexed.
#[fail(display = "The field '{:?}' does not have positions indexed", _0)]
#[error("The field '{0:?}' does not have positions indexed")]
FieldDoesNotHavePositionsIndexed(String),
/// The tokenizer for the given field is unknown
/// The two argument strings are the name of the field, the name of the tokenizer
#[fail(
display = "The tokenizer '{:?}' for the field '{:?}' is unknown",
_0, _1
)]
#[error("The tokenizer '{0:?}' for the field '{1:?}' is unknown")]
UnknownTokenizer(String, String),
/// The query contains a range query with a phrase as one of the bounds.
/// Only terms can be used as bounds.
#[fail(display = "A range query cannot have a phrase as one of the bounds")]
#[error("A range query cannot have a phrase as one of the bounds")]
RangeMustNotHavePhrase,
/// The format for the date field is not RFC 3339 compliant.
#[fail(display = "The date field has an invalid format")]
#[error("The date field has an invalid format")]
DateFormatError(chrono::ParseError),
}

View File

@@ -9,7 +9,6 @@ use crate::query::{Query, Scorer, Weight};
use crate::schema::Type;
use crate::schema::{Field, IndexRecordOption, Term};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::Result;
use crate::{DocId, Score};
use std::collections::Bound;
use std::ops::Range;
@@ -48,7 +47,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// let schema = schema_builder.build();
///
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// for year in 1950u64..2017u64 {
/// let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
/// for _ in 0..num_docs_within_year {
@@ -246,7 +245,11 @@ impl RangeQuery {
}
impl Query for RangeQuery {
fn weight(&self, searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<dyn Weight>> {
fn weight(
&self,
searcher: &Searcher,
_scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> {
let schema = searcher.schema();
let value_type = schema.get_field_entry(self.field).field_type().value_type();
if value_type != self.value_type {
@@ -289,7 +292,7 @@ impl RangeWeight {
}
impl Weight for RangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
@@ -315,7 +318,7 @@ impl Weight for RangeWeight {
Ok(Box::new(ConstScorer::new(doc_bitset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
@@ -342,7 +345,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for year in 1950u64..2017u64 {
let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
for _ in 0..num_docs_within_year {
@@ -485,7 +488,7 @@ mod tests {
schema_builder.add_i64_field("year", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
let mut index_writer = index.writer_for_tests()?;
let title = schema.get_field("title").unwrap();
let year = schema.get_field("year").unwrap();
index_writer.add_document(doc!(

View File

@@ -103,7 +103,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
country_field => "japan",
));

View File

@@ -25,7 +25,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
assert!(index_writer.commit().is_ok());
@@ -50,7 +50,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
let mut index_writer = index.writer_for_tests()?;
for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
@@ -86,7 +86,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde",
right_field => "right1 right2",
@@ -136,7 +136,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 5_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"a c"));
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -153,7 +153,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a"));
index_writer.commit()?;

View File

@@ -4,11 +4,10 @@ use crate::docset::DocSet;
use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight;
use crate::query::explanation::does_not_match;
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
use crate::query::weight::for_each_scorer;
use crate::query::Weight;
use crate::query::{Explanation, Scorer};
use crate::schema::IndexRecordOption;
use crate::Result;
use crate::Term;
use crate::{DocId, Score};
@@ -19,12 +18,12 @@ pub struct TermWeight {
}
impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let term_scorer = self.specialized_scorer(reader, boost)?;
Ok(Box::new(term_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.specialized_scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
@@ -32,7 +31,7 @@ impl Weight for TermWeight {
Ok(scorer.explain())
}
fn count(&self, reader: &SegmentReader) -> Result<u32> {
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
if let Some(delete_bitset) = reader.delete_bitset() {
Ok(self.scorer(reader, 1.0)?.count(delete_bitset))
} else {
@@ -73,8 +72,8 @@ impl Weight for TermWeight {
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let mut scorer = self.scorer(reader, 1.0)?;
for_each_pruning_scorer(&mut scorer, threshold, callback);
let scorer = self.specialized_scorer(reader, 1.0)?;
crate::query::boolean_query::block_wand(vec![scorer], threshold, callback);
Ok(())
}
}
@@ -96,10 +95,10 @@ impl TermWeight {
&self,
reader: &SegmentReader,
boost: Score,
) -> Result<TermScorer> {
) -> crate::Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
let similarity_weight = self.similarity_weight.boost_by(boost);
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option);

View File

@@ -398,9 +398,9 @@ mod bench {
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ConstScorer, Union, VecDocSet};
use crate::tests;
use crate::DocId;
use crate::DocSet;
use crate::{tests, TERMINATED};
use test::Bencher;
#[bench]
@@ -414,10 +414,12 @@ mod bench {
union_docset
.iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new)
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
);
while v.advance() {}
while v.doc() != TERMINATED {
v.advance();
}
});
}
#[bench]
@@ -432,10 +434,12 @@ mod bench {
union_docset
.iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new)
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
);
while v.advance() {}
while v.doc() != TERMINATED {
v.advance();
}
});
}
}

View File

@@ -1,5 +1,7 @@
mod pool;
use slog::error;
pub use self::pool::LeasedItem;
use self::pool::Pool;
use crate::core::Segment;
@@ -62,6 +64,7 @@ impl IndexReaderBuilder {
/// to open different segment readers. It may take hundreds of milliseconds
/// of time and it may return an error.
pub fn try_into(self) -> crate::Result<IndexReader> {
let logger = self.index.logger().clone();
let inner_reader = InnerIndexReader {
index: self.index,
num_searchers: self.num_searchers,
@@ -80,8 +83,8 @@ impl IndexReaderBuilder {
let callback = move || {
if let Err(err) = inner_reader_arc_clone.reload() {
error!(
"Error while loading searcher after commit was detected. {:?}",
err
logger,
"Error while loading searcher after commit was detected. {:?}", err
);
}
};
@@ -138,9 +141,11 @@ impl InnerIndexReader {
.collect::<crate::Result<_>>()?
};
let schema = self.index.schema();
let searchers = (0..self.num_searchers)
.map(|_| Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone()))
.collect();
let searchers = std::iter::repeat_with(|| {
Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
})
.take(self.num_searchers)
.collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())
}

View File

@@ -74,9 +74,8 @@ impl Document {
}
/// Add a text field.
pub fn add_text(&mut self, field: Field, text: &str) {
let value = Value::Str(String::from(text));
self.add(FieldValue::new(field, value));
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
self.add(FieldValue::new(field, Value::Str(text.to_string())));
}
/// Add a pre-tokenized text field.
@@ -110,8 +109,8 @@ impl Document {
}
/// Add a bytes field
pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
self.add(FieldValue::new(field, Value::Bytes(value)))
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add(FieldValue::new(field, Value::Bytes(value.into())))
}
/// Add a field value

View File

@@ -1,5 +1,5 @@
use crate::schema::IntOptions;
use crate::schema::TextOptions;
use crate::schema::{is_valid_field_name, IntOptions};
use crate::schema::FieldType;
use serde::de::{self, MapAccess, Visitor};
@@ -24,6 +24,7 @@ impl FieldEntry {
/// Creates a new text field entry in the schema, given
/// a name, and some options.
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::Str(text_options),
@@ -33,6 +34,7 @@ impl FieldEntry {
/// Creates a new u64 field entry in the schema, given
/// a name, and some options.
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::U64(field_type),
@@ -42,6 +44,7 @@ impl FieldEntry {
/// Creates a new i64 field entry in the schema, given
/// a name, and some options.
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::I64(field_type),
@@ -51,6 +54,7 @@ impl FieldEntry {
/// Creates a new f64 field entry in the schema, given
/// a name, and some options.
pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::F64(field_type),
@@ -60,6 +64,7 @@ impl FieldEntry {
/// Creates a new date field entry in the schema, given
/// a name, and some options.
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::Date(field_type),
@@ -68,6 +73,7 @@ impl FieldEntry {
/// Creates a field entry for a facet.
pub fn new_facet(field_name: String) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::HierarchicalFacet,
@@ -76,6 +82,7 @@ impl FieldEntry {
/// Creates a field entry for a bytes field
pub fn new_bytes(field_name: String) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::Bytes,
@@ -268,6 +275,12 @@ mod tests {
use crate::schema::TEXT;
use serde_json;
// A field name starting with a dash is not a valid field name, so building
// the entry is expected to panic.
#[test]
#[should_panic]
fn test_invalid_field_name_should_panic() {
    let invalid_name = String::from("-hello");
    FieldEntry::new_text(invalid_name, TEXT);
}
#[test]
fn test_json_serialization() {
let field_value = FieldEntry::new_text(String::from("title"), TEXT);

View File

@@ -149,14 +149,16 @@ pub use self::int_options::IntOptions;
use once_cell::sync::Lazy;
use regex::Regex;
/// Regular expression representing the restriction on valid field names.
pub const FIELD_NAME_PATTERN: &str = r#"^[_a-zA-Z][_\-a-zA-Z0-9]*$"#;
/// Validator for a potential `field_name`.
/// Returns true iff the name can be used as a field name.
///
/// A field name must start with a letter `[a-zA-Z]` or an underscore `_`.
/// The other characters can be any alphanumeric character `[a-zA-Z0-9]`, `_` or `-`.
pub fn is_valid_field_name(field_name: &str) -> bool {
static FIELD_NAME_PTN: Lazy<Regex> =
Lazy::new(|| Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap());
static FIELD_NAME_PTN: Lazy<Regex> = Lazy::new(|| Regex::new(FIELD_NAME_PATTERN).unwrap());
FIELD_NAME_PTN.is_match(field_name)
}
@@ -170,6 +172,11 @@ mod tests {
assert!(is_valid_field_name("text"));
assert!(is_valid_field_name("text0"));
assert!(!is_valid_field_name("0text"));
assert!(is_valid_field_name("field-name"));
assert!(is_valid_field_name("field_name"));
assert!(!is_valid_field_name("field!name"));
assert!(!is_valid_field_name("-fieldname"));
assert!(is_valid_field_name("_fieldname"));
assert!(!is_valid_field_name(""));
assert!(!is_valid_field_name("シャボン玉"));
assert!(is_valid_field_name("my_text_field"));

View File

@@ -381,19 +381,16 @@ impl<'de> Deserialize<'de> for Schema {
/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug, Fail, PartialEq)]
#[derive(Debug, Error, PartialEq)]
pub enum DocParsingError {
/// The payload given is not valid JSON.
#[fail(display = "The provided string is not valid JSON")]
#[error("The provided string is not valid JSON")]
NotJSON(String),
/// One of the value node could not be parsed.
#[fail(display = "The field '{:?}' could not be parsed: {:?}", _0, _1)]
#[error("The field '{0:?}' could not be parsed: {1:?}")]
ValueError(String, ValueParsingError),
/// The json-document contains a field that is not declared in the schema.
#[fail(
display = "The document contains a field that is not declared in the schema: {:?}",
_0
)]
#[error("The document contains a field that is not declared in the schema: {0:?}")]
NoSuchFieldInSchema(String),
}

View File

@@ -4,7 +4,6 @@ use super::Field;
use crate::common;
use crate::schema::Facet;
use crate::DateTime;
use byteorder::{BigEndian, ByteOrder};
use std::str;
/// Size (in bytes) of the buffer of a int field.
@@ -19,6 +18,10 @@ where
B: AsRef<[u8]>;
impl Term {
/// Creates a term backed by an empty buffer with room for 100 bytes.
///
/// No field id is written: the returned term holds no bytes at all.
pub(crate) fn new() -> Term {
    let buffer = Vec::with_capacity(100);
    Term(buffer)
}
/// Builds a term given a field, and a i64-value
///
/// Assuming the term has a field id of 1, and a i64 value of 3234,
@@ -93,6 +96,12 @@ impl Term {
term
}
/// Builds a term for `field` whose value is the given raw `bytes`.
///
/// The term is first created with `Term::for_field`, then its value is
/// set with `set_bytes`.
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
    let mut field_term = Term::for_field(field);
    field_term.set_bytes(bytes);
    field_term
}
/// Creates a new Term for a given field.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
@@ -100,12 +109,10 @@ impl Term {
term
}
/// Sets the field of the term.
///
/// The field id is written in big-endian order at the start of the buffer.
pub fn set_field(&mut self, field: Field) {
// `BigEndian` variant: make sure the buffer holds at least 4 bytes, then
// overwrite bytes 0..4 with the field id. Bytes past index 4 are left alone.
if self.0.len() < 4 {
self.0.resize(4, 0u8);
}
BigEndian::write_u32(&mut self.0[0..4], field.field_id());
pub(crate) fn set_field(&mut self, field: Field) {
// `to_be_bytes` variant: empty the buffer first, so any bytes stored
// previously are discarded, then append the field id in big-endian order.
self.0.clear();
self.0
.extend_from_slice(&field.field_id().to_be_bytes()[..]);
}
/// Sets a u64 value in the term.
@@ -116,7 +123,7 @@ impl Term {
/// the natural order of the values.
pub fn set_u64(&mut self, val: u64) {
// Bring the buffer to exactly `INT_TERM_LEN` bytes; bytes added by the
// resize are zero-filled.
self.0.resize(INT_TERM_LEN, 0u8);
// Both lines below store `val` in big-endian order starting at offset 4,
// i.e. after the field id. The `[4..12]` form needs the buffer to be at
// least 12 bytes long.
// NOTE(review): assumes `INT_TERM_LEN` >= 12; its definition is outside
// this excerpt, so confirm.
BigEndian::write_u64(&mut self.0[4..], val);
self.0[4..12].copy_from_slice(val.to_be_bytes().as_ref());
}
/// Sets a `i64` value in the term.
@@ -134,12 +141,6 @@ impl Term {
self.0.extend(bytes);
}
/// Builds a term for `field` with `Term::for_field`, then stores `bytes`
/// in it through `set_bytes`.
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
let mut term = Term::for_field(field);
term.set_bytes(bytes);
term
}
/// Set the texts only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) {
self.set_bytes(text.as_bytes());
@@ -157,7 +158,9 @@ where
/// Returns the field.
///
/// The field id is decoded from the first 4 bytes of the term, read in
/// big-endian order.
///
/// # Panics
///
/// Panics if the term holds fewer than 4 bytes, because of the `[..4]` slice.
pub fn field(&self) -> Field {
Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4]))
// Copy the 4 leading bytes into a fixed-size array so that
// `u32::from_be_bytes` can decode them.
let mut field_id_bytes = [0u8; 4];
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
}
/// Returns the `u64` value stored in a term.
@@ -166,7 +169,9 @@ where
/// ... or returns an invalid value
/// if the term is not a `u64` field.
pub fn get_u64(&self) -> u64 {
BigEndian::read_u64(&self.0.as_ref()[4..])
// NOTE(review): despite its name, `field_id_bytes` holds the 8 bytes of
// the value being decoded here, not a field id.
let mut field_id_bytes = [0u8; 8];
// `copy_from_slice` panics unless `value_bytes()` is exactly 8 bytes long.
field_id_bytes.copy_from_slice(self.value_bytes());
// Decode the value as a big-endian u64.
u64::from_be_bytes(field_id_bytes)
}
/// Returns the `i64` value stored in a term.
@@ -175,7 +180,7 @@ where
/// ... or returns an invalid value
/// if the term is not a `i64` field.
pub fn get_i64(&self) -> i64 {
// Read the stored u64 and map it to an i64 with `common::u64_to_i64`.
// The second line obtains the u64 through `get_u64` instead of reading
// the bytes itself.
common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
common::u64_to_i64(self.get_u64())
}
/// Returns the `f64` value stored in a term.
@@ -184,7 +189,7 @@ where
/// ... or returns an invalid value
/// if the term is not a `f64` field.
pub fn get_f64(&self) -> f64 {
// Read the stored u64 and map it to an f64 with `common::u64_to_f64`.
// The second line obtains the u64 through `get_u64` instead of reading
// the bytes itself.
common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..]))
common::u64_to_f64(self.get_u64())
}
/// Returns the text associated with the term.

View File

@@ -221,6 +221,12 @@ impl<'a> From<&'a str> for Value {
}
}
impl<'a> From<&'a [u8]> for Value {
/// Builds a `Value::Bytes` from a byte slice.
///
/// The slice is copied into a newly allocated `Vec<u8>`, so the returned
/// value does not borrow from `bytes`.
fn from(bytes: &'a [u8]) -> Value {
Value::Bytes(bytes.to_vec())
}
}
impl<'a> From<Facet> for Value {
fn from(facet: Facet) -> Value {
Value::Facet(facet)

View File

@@ -221,7 +221,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// # let text_field = schema_builder.add_text_field("text", TEXT);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
/// # Je ne me sentis plus guidé par les haleurs :
/// # Des Peaux-Rouges criards les avaient pris pour cibles,
@@ -506,7 +506,7 @@ Survey in 2016, 2017, and 2018."#;
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a"));
index_writer.add_document(doc!(text_field => "a"));
index_writer.add_document(doc!(text_field => "a b"));
@@ -562,7 +562,7 @@ Survey in 2016, 2017, and 2018."#;
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
let doc = doc ! (text_field => TEST_TEXT);
index_writer.add_document(doc);

View File

@@ -336,7 +336,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 10u64));
@@ -374,7 +374,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => "hi"));
index_writer.add_document(doc!(name => "this is a test"));
index_writer.add_document(
@@ -414,7 +414,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => "hi"));
index_writer.add_document(doc!(name => "this is a test"));
index_writer.add_document(
@@ -453,7 +453,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 3u64));

View File

@@ -68,19 +68,17 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
}
/// Inserts `key` / `dest` into the data layer and propagates skip pointers.
///
/// Whenever an insertion returns a skip pointer `(skip_doc_id, skip_offset)`,
/// that pair is inserted into the next skip layer, starting with layer 0 and
/// moving up one layer at a time. The walk stops as soon as an insertion
/// returns `None`.
///
/// # Errors
///
/// Any `io::Error` returned by a layer's `insert` is propagated to the caller.
pub fn insert(&mut self, key: u64, dest: &T) -> io::Result<()> {
let mut layer_id = 0;
let mut skip_pointer = self.data_layer.insert(key, dest)?;
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) => self
for layer_id in 0.. {
if let Some((skip_doc_id, skip_offset)) = skip_pointer {
skip_pointer = self
.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)?,
None => {
return Ok(());
}
};
layer_id += 1;
.insert(skip_doc_id, &skip_offset)?;
} else {
break;
}
}
Ok(())
}
pub fn write<W: Write>(self, output: &mut W) -> io::Result<()> {

View File

@@ -138,7 +138,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
{
let mut doc = Document::default();