Compare commits

..

2 Commits

Author          SHA1        Message          Date
Paul Masurel    6eb7c7f419  Added coveralls  2020-08-20 09:25:45 +09:00
Paul Masurel    e37ca8178a  githubactions    2020-08-19 22:52:20 +09:00
21 changed files with 235 additions and 199 deletions

.github/workflows/ci.yml (new file, 28 lines)

@@ -0,0 +1,28 @@
name: Tantivy CI

on: [push]

jobs:
  test:
    name: Test Suite
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true
      - uses: actions-rs/cargo@v1
        with:
          command: test
      - uses: actions-rs/cargo@v1
        with:
          command: fmt
          args: --all -- --check
      - run: rustup component add clippy
      - uses: actions-rs/cargo@v1
        with:
          command: clippy
          args: -- -D warnings

.github/workflows/coveralls.yml (new file, 66 lines)

@@ -0,0 +1,66 @@
on: [push]

name: Code coverage with grcov

jobs:
  grcov:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - ubuntu-latest
          #- macOS-latest
          #- windows-latest
    steps:
      - uses: actions/checkout@v2
      - name: Install toolchain
        uses: actions-rs/toolchain@v1
        with:
          toolchain: nightly
          override: true
          profile: minimal
      - name: Execute tests
        uses: actions-rs/cargo@v1
        with:
          command: test
          args: --all --lib
        env:
          CARGO_INCREMENTAL: 0
          RUSTFLAGS: "-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests"
      # Note that the `actions-rs/grcov` Action can install `grcov` too,
      # but it cannot use faster installation methods yet.
      # As a temporary experiment, the `actions-rs/install` Action is plugged in here.
      # Consider NOT copying this into your workflow; use `actions-rs/grcov` alone.
      - name: Pre-installing grcov
        uses: actions-rs/install@v0.1
        with:
          crate: grcov
          use-tool-cache: true
      - name: Gather coverage data
        id: coverage
        uses: actions-rs/grcov@v0.1
        with:
          coveralls-token: ${{ secrets.COVERALLS_TOKEN }}
      - name: Coveralls upload
        uses: coverallsapp/github-action@master
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          parallel: true
          path-to-lcov: ${{ steps.coverage.outputs.report }}
  grcov_finalize:
    runs-on: ubuntu-latest
    needs: grcov
    steps:
      - name: Coveralls finalization
        uses: coverallsapp/github-action@master
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          parallel-finished: true

.gitignore (1 line changed)

@@ -12,4 +12,3 @@ cpp/simdcomp/bitpackingbenchmark
*.bk
.idea
trace.dat
-cargo-timing*

CHANGELOG.md

@@ -1,7 +1,3 @@
-Tantivy 0.14.0
-=========================
-- Remove dependency to atomicwrites #833. (Implemented by @pmasurel upon suggestion and research from @asafigan.)
-
Tantivy 0.13.0
======================
Tantivy 0.13 introduces a change in the index format that will require

Cargo.toml

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
-version = "0.14.0-dev"
+version = "0.13.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -22,8 +22,8 @@ tantivy-fst = "0.3"
memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true}
snap = "1"
-tempfile = {version="3.0", optional=true}
-atomicwrites = "0.2"
+atomicwrites = {version="0.2.2", optional=true}
+tempfile = "3.0"
log = "0.4"
serde = {version="1.0", features=["derive"]}
serde_json = "1.0"
@@ -38,7 +38,7 @@ owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1.2"
downcast-rs = { version="1.0" }
-tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
+tantivy-query-grammar = { version="0.13", path="./query-grammar" }
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
census = "0.4"
fnv = "1.0.6"
@@ -75,7 +75,7 @@ overflow-checks = true
[features]
default = ["mmap"]
-mmap = ["fs2", "tempfile", "memmap", "notify"]
+mmap = ["atomicwrites", "fs2", "memmap", "notify"]
lz4-compression = ["lz4"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.

README.md

@@ -34,6 +34,11 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
performance for different types of queries / collections.
+In general, Tantivy tends to be
+- slower than Lucene on union with a Top-K due to Block-WAND optimization.
+- faster than Lucene on intersection and phrase queries.
+
Your mileage WILL vary depending on the nature of queries and their load.

# Features

examples/basic_search.rs

@@ -112,6 +112,18 @@ fn main() -> tantivy::Result<()> {
         limbs and branches that arch over the pool"
    ));
+    index_writer.add_document(doc!(
+    title => "Of Mice and Men",
+    body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
+            bank and runs deep and green. The water is warm too, for it has slipped twinkling \
+            over the yellow sands in the sunlight before reaching the narrow pool. On one \
+            side of the river the golden foothill slopes curve up to the strong and rocky \
+            Gabilan Mountains, but on the valley side the water is lined with trees—willows \
+            fresh and green with every spring, carrying in their lower leaf junctures the \
+            debris of the winter's flooding; and sycamores with mottled, white, recumbent \
+            limbs and branches that arch over the pool"
+    ));
    // Multivalued fields just need to be repeated.
    index_writer.add_document(doc!(
    title => "Frankenstein",

query-grammar/Cargo.toml

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
-version = "0.14.0-dev"
+version = "0.13.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

query-grammar/src/occur.rs

@@ -52,7 +52,7 @@ mod test {
    use crate::Occur;

    #[test]
-    fn test_occur_compose() {
+    fn test_Occur_compose() {
        assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should);
        assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
        assert_eq!(

query-grammar/src/query_grammar.rs

@@ -9,10 +9,8 @@ use combine::{
fn field<'a>() -> impl Parser<&'a str, Output = String> {
    (
-        (letter().or(char('_'))),
-        many(satisfy(|c: char| {
-            c.is_alphanumeric() || c == '_' || c == '-'
-        })),
+        letter(),
+        many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
    )
        .skip(char(':'))
        .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
@@ -281,8 +279,6 @@ pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
#[cfg(test)]
mod test {
-    type TestParseResult = Result<(), StringStreamError>;
-
    use super::*;
    use combine::parser::Parser;
@@ -300,10 +296,9 @@ mod test {
    }

    #[test]
-    fn test_occur_symbol() -> TestParseResult {
-        assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, ""));
-        assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, ""));
-        Ok(())
+    fn test_occur_symbol() {
+        assert_eq!(super::occur_symbol().parse("-"), Ok((Occur::MustNot, "")));
+        assert_eq!(super::occur_symbol().parse("+"), Ok((Occur::Must, "")));
    }

    #[test]
@@ -415,25 +410,6 @@ mod test {
        assert_eq!(format!("{:?}", ast), "\"abc\"");
    }

-    #[test]
-    fn test_field_name() -> TestParseResult {
-        assert_eq!(
-            super::field().parse("my-field-name:a")?,
-            ("my-field-name".to_string(), "a")
-        );
-        assert_eq!(
-            super::field().parse("my_field_name:a")?,
-            ("my_field_name".to_string(), "a")
-        );
-        assert!(super::field().parse(":a").is_err());
-        assert!(super::field().parse("-my_field:a").is_err());
-        assert_eq!(
-            super::field().parse("_my_field:a")?,
-            ("_my_field".to_string(), "a")
-        );
-        Ok(())
-    }
-
    #[test]
    fn test_range_parser() {
        // testing the range() parser separately
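
The hunk above is the crux of the field-name change: the base grammar accepts a leading `_` and `-` inside names, the head grammar does not. A self-contained sketch of the head-side parser and its behavior (combine 4.x import paths assumed; the assertions mirror the removed `test_field_name`):

    use combine::parser::char::{char, letter};
    use combine::{many, satisfy, Parser};

    // A field name is one letter followed by alphanumerics or '_', then ':'.
    fn field<'a>() -> impl Parser<&'a str, Output = String> {
        (
            letter(),
            many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
        )
            .skip(char(':'))
            .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
    }

    fn main() {
        assert_eq!(
            field().parse("my_field_name:a"),
            Ok(("my_field_name".to_string(), "a"))
        );
        // '-' stops the name early, so the parser never finds the ':' it expects.
        assert!(field().parse("my-field-name:a").is_err());
        assert!(field().parse("_my_field:a").is_err()); // leading '_' rejected too
    }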


@@ -539,6 +539,7 @@ mod tests {
        test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
    }
}
+
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
    let mut reader_index = reader.index();
    let (sender, receiver) = crossbeam::channel::unbounded();
@@ -549,23 +550,12 @@ mod tests {
    assert_eq!(reader.searcher().num_docs(), 0);
    writer.add_document(doc!(field=>1u64));
    writer.commit().unwrap();
-    // We need a loop here because it is possible for notify to send more than
-    // one modify event. It was observed on CI on MacOS.
-    loop {
-        assert!(receiver.recv().is_ok());
-        if reader.searcher().num_docs() == 1 {
-            break;
-        }
-    }
+    assert!(receiver.recv().is_ok());
+    assert_eq!(reader.searcher().num_docs(), 1);
    writer.add_document(doc!(field=>2u64));
    writer.commit().unwrap();
-    // ... Same as above
-    loop {
-        assert!(receiver.recv().is_ok());
-        if reader.searcher().num_docs() == 2 {
-            break;
-        }
-    }
+    assert!(receiver.recv().is_ok());
+    assert_eq!(reader.searcher().num_docs(), 2);
}

// This test will not pass on windows, because windows
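
The base-side loop exists because `notify` can deliver more than one modify event per commit (observed on macOS CI), so the test drains events until the expected document count is visible. A standalone sketch of that drain-until pattern (std `mpsc` stands in for the crossbeam channel used above):

    use std::sync::mpsc;

    // Consume watcher events until the observed state matches, instead of
    // assuming exactly one event per logical change.
    fn wait_until<T>(receiver: &mpsc::Receiver<T>, mut state_reached: impl FnMut() -> bool) {
        loop {
            receiver.recv().expect("watch channel closed");
            if state_reached() {
                return;
            }
        }
    }

    fn main() {
        let (sender, receiver) = mpsc::channel();
        // Simulate notify delivering two events for a single logical change.
        sender.send(()).unwrap();
        sender.send(()).unwrap();

        let mut observed_events = 0;
        // The target state becomes visible only once both events are drained.
        wait_until(&receiver, || {
            observed_events += 1;
            observed_events == 2
        });
        assert_eq!(observed_events, 2);
    }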

src/directory/managed_directory.rs

@@ -1,4 +1,4 @@
-use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
+use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::DirectoryLock;
@@ -246,15 +246,13 @@ impl ManagedDirectory {
    /// List files for which checksum does not match content
    pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
        let mut hashset = HashSet::new();
-        let mut managed_paths = self
+        let managed_paths = self
            .meta_informations
            .read()
            .expect("Managed directory rlock poisoned in list damaged.")
            .managed_paths
            .clone();
-        managed_paths.remove(*META_FILEPATH);

        for path in managed_paths.into_iter() {
            if !self.validate_checksum(&path)? {
                hashset.insert(path);

src/directory/mmap_directory.rs

@@ -1,5 +1,4 @@
use crate::core::META_FILEPATH;
-use atomicwrites;
use crate::directory::error::LockError;
use crate::directory::error::{
    DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError,
@@ -35,7 +34,6 @@ use std::sync::Mutex;
use std::sync::RwLock;
use std::sync::Weak;
use std::thread;
-use tempfile;
use tempfile::TempDir;

/// Create a default io error given a string.
@@ -489,11 +487,11 @@ impl Directory for MmapDirectory {
        }
    }

-    fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
+    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
        debug!("Atomic Write {:?}", path);
        let full_path = self.resolve_path(path);
        let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
-        meta_file.write(|f| f.write_all(content))?;
+        meta_file.write(|f| f.write_all(data))?;
        Ok(())
    }
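
For context, the `atomicwrites` call above follows the crate's standard write-then-rename pattern. A standalone sketch (path and payload are illustrative only):

    use std::io::Write;

    use atomicwrites::{AllowOverwrite, AtomicFile};

    fn main() -> std::io::Result<()> {
        // The closure writes to a temporary file; the subsequent rename is
        // atomic, so readers never observe a partially written meta.json.
        let file = AtomicFile::new("meta.json", AllowOverwrite);
        file.write(|f| f.write_all(b"{\"segments\": []}"))?;
        Ok(())
    }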

src/directory/tests.rs

@@ -211,18 +211,19 @@ fn test_watch(directory: &mut dyn Directory) {
        .unwrap();

    for i in 0..10 {
-        assert!(i <= counter.load(SeqCst));
+        assert_eq!(i, counter.load(SeqCst));
        assert!(directory
            .atomic_write(Path::new("meta.json"), b"random_test_data_2")
            .is_ok());
-        assert!(i + 1 <= counter.load(SeqCst)); // notify can trigger more than once.
+        assert_eq!(receiver.recv_timeout(Duration::from_millis(500)), Ok(i));
+        assert_eq!(i + 1, counter.load(SeqCst));
    }

    mem::drop(watch_handle);
    assert!(directory
        .atomic_write(Path::new("meta.json"), b"random_test_data")
        .is_ok());
    assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok());
-    assert!(10 <= counter.load(SeqCst));
+    assert_eq!(10, counter.load(SeqCst));
}

fn test_lock_non_blocking(directory: &mut dyn Directory) {

src/indexer/mod.rs

@@ -29,9 +29,8 @@ pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;

-#[cfg(feature = "mmap")]
#[cfg(test)]
-mod tests_mmap {
+mod tests {
    use crate::schema::{self, Schema};
    use crate::{Index, Term};


@@ -1012,12 +1012,4 @@ mod tests {
            DOC_COUNT as usize
        );
    }
-
-    #[test]
-    fn test_validate_checksum() {
-        let index_path = tempfile::tempdir().expect("dir");
-        let schema = Schema::builder().build();
-        let index = Index::create_in_dir(&index_path, schema).expect("index");
-        assert!(index.validate_checksum().unwrap().is_empty());
-    }
}

src/query/boolean_query/block_wand.rs

@@ -4,6 +4,19 @@ use crate::{DocId, DocSet, Score, TERMINATED};
use std::ops::Deref;
use std::ops::DerefMut;

+fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
+    if let Some(first) = it.next() {
+        let mut prev = first;
+        for doc in it {
+            if doc < prev {
+                return false;
+            }
+            prev = doc;
+        }
+    }
+    true
+}
+
/// Takes term_scorers sorted by their current doc() and a threshold.
/// Returns `(before_pivot_len, pivot_len, pivot_doc)` defined as follows:
/// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score.
@@ -42,12 +55,37 @@ fn find_pivot_doc(
    Some((before_pivot_len, pivot_len, pivot_doc))
}

+struct TermScorerWithMaxScore<'a> {
+    scorer: &'a mut TermScorer,
+    max_score: Score,
+}
+
+impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
+    fn from(scorer: &'a mut TermScorer) -> Self {
+        let max_score = scorer.max_score();
+        TermScorerWithMaxScore { scorer, max_score }
+    }
+}
+
+impl<'a> Deref for TermScorerWithMaxScore<'a> {
+    type Target = TermScorer;
+    fn deref(&self) -> &Self::Target {
+        self.scorer
+    }
+}
+
+impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.scorer
+    }
+}
+
// Before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer(
    scorers: &mut Vec<TermScorerWithMaxScore>,
    pivot_len: usize,
) {
+    debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
    let mut scorer_to_seek = pivot_len - 1;
    let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
    for scorer_ord in (0..pivot_len - 1).rev() {
@@ -64,7 +102,6 @@ fn block_max_was_too_low_advance_one_scorer(
    }
    scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
    restore_ordering(scorers, scorer_to_seek);
-    debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
}

// Given a list of term_scorers and an `ord`, and assuming that `term_scorers[ord]` is sorted
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted // Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
@@ -140,99 +177,64 @@ pub fn block_wand(
        .map(TermScorerWithMaxScore::from)
        .collect();
    scorers.sort_by_key(|scorer| scorer.doc());
-    // At this point we need to ensure that the scorers are sorted!
-    debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
-    while let Some((before_pivot_len, pivot_len, pivot_doc)) =
-        find_pivot_doc(&scorers[..], threshold)
-    {
-        debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
-        debug_assert_ne!(pivot_doc, TERMINATED);
-        debug_assert!(before_pivot_len < pivot_len);
+    loop {
+        // At this point we need to ensure that the scorers are sorted!
+        debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
+        if let Some((before_pivot_len, pivot_len, pivot_doc)) =
+            find_pivot_doc(&scorers[..], threshold)
+        {
+            debug_assert_ne!(pivot_doc, TERMINATED);
+            debug_assert!(before_pivot_len < pivot_len);
            let block_max_score_upperbound: Score = scorers[..pivot_len]
                .iter_mut()
                .map(|scorer| {
                    scorer.shallow_seek(pivot_doc);
                    scorer.block_max_score()
                })
                .sum();
            // Beware after shallow advance, skip readers can be in advance compared to
            // the segment posting lists.
            //
            // `block_segment_postings.load_block()` needs to be called separately.
            if block_max_score_upperbound <= threshold {
                // Block max condition was not reached.
                // We could get away by simply advancing the scorers to DocId + 1 but it would
                // be inefficient. The optimization requires proper explanation and was
                // isolated in a different function.
                block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
                continue;
            }
            // Block max condition is observed.
            //
            // Let's try and advance all scorers before the pivot to the pivot.
            if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
                // At least one of the scorers does not contain the pivot.
                //
                // Let's stop scoring this pivot and go through the pivot selection again.
                // Note that the current pivot is not necessarily a bad candidate and it
                // may be picked again.
                continue;
            }
            // At this point, all scorers are positioned on the doc.
            let score = scorers[..pivot_len]
                .iter_mut()
                .map(|scorer| scorer.score())
                .sum();
            if score > threshold {
                threshold = callback(pivot_doc, score);
            }
            // Let's advance all of the scorers that are currently positioned on the pivot.
            advance_all_scorers_on_pivot(&mut scorers, pivot_len);
+        } else {
+            return;
+        }
    }
}
-
-struct TermScorerWithMaxScore<'a> {
-    scorer: &'a mut TermScorer,
-    max_score: Score,
-}
-
-impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
-    fn from(scorer: &'a mut TermScorer) -> Self {
-        let max_score = scorer.max_score();
-        TermScorerWithMaxScore { scorer, max_score }
-    }
-}
-
-impl<'a> Deref for TermScorerWithMaxScore<'a> {
-    type Target = TermScorer;
-    fn deref(&self) -> &Self::Target {
-        self.scorer
-    }
-}
-
-impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.scorer
-    }
-}
-
-fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
-    if let Some(first) = it.next() {
-        let mut prev = first;
-        for doc in it {
-            if doc < prev {
-                return false;
-            }
-            prev = doc;
-        }
-    }
-    true
-}

#[cfg(test)]
mod tests {
    use crate::query::score_combiner::SumCombiner;
@@ -246,21 +248,17 @@ mod tests {
    use std::iter;

    struct Float(Score);

    impl Eq for Float {}

    impl PartialEq for Float {
        fn eq(&self, other: &Self) -> bool {
            self.cmp(&other) == Ordering::Equal
        }
    }

    impl PartialOrd for Float {
        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
            Some(self.cmp(other))
        }
    }

    impl Ord for Float {
        fn cmp(&self, other: &Self) -> Ordering {
            other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)
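
For readers new to Block-WAND: `find_pivot_doc` walks the scorers in doc order, accumulating per-term score upper bounds until the sum exceeds the current threshold; the doc where that happens is the pivot, the only candidate worth scoring fully. A simplified, self-contained sketch of that selection step (plain `(doc, max_score)` pairs stand in for `TermScorer`; an illustration, not tantivy's exact code):

    type DocId = u32;
    type Score = f32;

    // Scorers must be sorted by their current doc.
    fn find_pivot(scorers: &[(DocId, Score)], threshold: Score) -> Option<(usize, usize, DocId)> {
        let mut upper_bound: Score = 0.0;
        let mut pivot = None;
        for (ord, &(doc, max_score)) in scorers.iter().enumerate() {
            upper_bound += max_score;
            if upper_bound > threshold {
                pivot = Some((ord, doc));
                break;
            }
        }
        let (pivot_ord, pivot_doc) = pivot?;
        // Extend to include every scorer already positioned on the pivot doc.
        let mut pivot_len = pivot_ord + 1;
        while pivot_len < scorers.len() && scorers[pivot_len].0 == pivot_doc {
            pivot_len += 1;
        }
        // Scorers strictly before the pivot doc still need to be advanced to it.
        let before_pivot_len = scorers[..pivot_ord]
            .iter()
            .filter(|&&(doc, _)| doc < pivot_doc)
            .count();
        Some((before_pivot_len, pivot_len, pivot_doc))
    }

    fn main() {
        // Three term scorers on docs 3, 7, 7 with per-term score upper bounds.
        let scorers = [(3, 1.5), (7, 2.0), (7, 1.0)];
        // Doc 3 alone (1.5) cannot beat 3.0; 1.5 + 2.0 > 3.0, so doc 7 is the pivot.
        assert_eq!(find_pivot(&scorers, 3.0), Some((1, 3, 7)));
    }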

src/query/term_query/term_weight.rs

@@ -4,7 +4,7 @@ use crate::docset::DocSet;
use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight;
use crate::query::explanation::does_not_match;
-use crate::query::weight::for_each_scorer;
+use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
use crate::query::Weight;
use crate::query::{Explanation, Scorer};
use crate::schema::IndexRecordOption;
@@ -73,8 +73,8 @@ impl Weight for TermWeight {
        reader: &SegmentReader,
        callback: &mut dyn FnMut(DocId, Score) -> Score,
    ) -> crate::Result<()> {
-        let scorer = self.specialized_scorer(reader, 1.0)?;
-        crate::query::boolean_query::block_wand(vec![scorer], threshold, callback);
+        let mut scorer = self.scorer(reader, 1.0)?;
+        for_each_pruning_scorer(&mut scorer, threshold, callback);
        Ok(())
    }
}
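
On the head side, single-term top-docs goes through `for_each_pruning_scorer`, which exhaustively advances one scorer and lets the callback raise the pruning threshold as better hits come in; the base side routes it through `block_wand` instead. A self-contained sketch of the fallback's loop shape (the trait is a stand-in for tantivy's `Scorer`/`DocSet`, not its exact definition):

    type DocId = u32;
    type Score = f32;
    const TERMINATED: DocId = u32::MAX;

    // Stand-in for tantivy's Scorer: a cursor over (doc, score) hits.
    trait Scorer {
        fn doc(&self) -> DocId;
        fn advance(&mut self) -> DocId;
        fn score(&mut self) -> Score;
    }

    fn for_each_pruning_scorer(
        scorer: &mut dyn Scorer,
        mut threshold: Score,
        callback: &mut dyn FnMut(DocId, Score) -> Score,
    ) {
        let mut doc = scorer.doc();
        while doc != TERMINATED {
            let score = scorer.score();
            if score > threshold {
                // The callback (e.g. a top-K collector) returns the new bar to beat.
                threshold = callback(doc, score);
            }
            doc = scorer.advance();
        }
    }

    struct VecScorer {
        hits: Vec<(DocId, Score)>,
        cursor: usize,
    }

    impl Scorer for VecScorer {
        fn doc(&self) -> DocId {
            self.hits.get(self.cursor).map_or(TERMINATED, |hit| hit.0)
        }
        fn advance(&mut self) -> DocId {
            self.cursor += 1;
            self.doc()
        }
        fn score(&mut self) -> Score {
            self.hits[self.cursor].1
        }
    }

    fn main() {
        let mut scorer = VecScorer {
            hits: vec![(1, 0.5), (4, 2.0), (9, 1.0)],
            cursor: 0,
        };
        let mut collected = Vec::new();
        for_each_pruning_scorer(&mut scorer, 0.8, &mut |doc, score| {
            collected.push((doc, score));
            score // raise the bar: only strictly better docs are collected
        });
        assert_eq!(collected, vec![(4, 2.0)]);
    }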

src/reader/mod.rs

@@ -138,11 +138,9 @@ impl InnerIndexReader {
                .collect::<crate::Result<_>>()?
        };
        let schema = self.index.schema();
-        let searchers = std::iter::repeat_with(|| {
-            Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
-        })
-        .take(self.num_searchers)
-        .collect();
+        let searchers = (0..self.num_searchers)
+            .map(|_| Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone()))
+            .collect();
        self.searcher_pool.publish_new_generation(searchers);
        Ok(())
    }
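
Both variants build `num_searchers` fresh `Searcher` values from cloned handles; the change is purely stylistic. A tiny illustration of the idiom swap:

    fn main() {
        let n = 3;
        let template = String::from("searcher");

        // Base side: an infinite lazy generator, truncated to n items.
        let a: Vec<String> = std::iter::repeat_with(|| template.clone()).take(n).collect();

        // Head side: a counted range mapped to n items.
        let b: Vec<String> = (0..n).map(|_| template.clone()).collect();

        assert_eq!(a, b);
    }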

src/schema/field_entry.rs

@@ -1,5 +1,5 @@
+use crate::schema::IntOptions;
use crate::schema::TextOptions;
-use crate::schema::{is_valid_field_name, IntOptions};
use crate::schema::FieldType;

use serde::de::{self, MapAccess, Visitor};
@@ -24,7 +24,6 @@ impl FieldEntry {
    /// Creates a new text field entry in the schema, given
    /// a name, and some options.
    pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
        FieldEntry {
            name: field_name,
            field_type: FieldType::Str(text_options),
@@ -34,7 +33,6 @@ impl FieldEntry {
    /// Creates a new u64 field entry in the schema, given
    /// a name, and some options.
    pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
        FieldEntry {
            name: field_name,
            field_type: FieldType::U64(field_type),
@@ -44,7 +42,6 @@ impl FieldEntry {
    /// Creates a new i64 field entry in the schema, given
    /// a name, and some options.
    pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
        FieldEntry {
            name: field_name,
            field_type: FieldType::I64(field_type),
@@ -54,7 +51,6 @@ impl FieldEntry {
    /// Creates a new f64 field entry in the schema, given
    /// a name, and some options.
    pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
        FieldEntry {
            name: field_name,
            field_type: FieldType::F64(field_type),
@@ -64,7 +60,6 @@ impl FieldEntry {
    /// Creates a new date field entry in the schema, given
    /// a name, and some options.
    pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
        FieldEntry {
            name: field_name,
            field_type: FieldType::Date(field_type),
@@ -73,7 +68,6 @@ impl FieldEntry {
    /// Creates a field entry for a facet.
    pub fn new_facet(field_name: String) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
        FieldEntry {
            name: field_name,
            field_type: FieldType::HierarchicalFacet,
@@ -82,7 +76,6 @@ impl FieldEntry {
    /// Creates a field entry for a bytes field
    pub fn new_bytes(field_name: String) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
        FieldEntry {
            name: field_name,
            field_type: FieldType::Bytes,
@@ -275,12 +268,6 @@ mod tests {
    use crate::schema::TEXT;
    use serde_json;

-    #[test]
-    #[should_panic]
-    fn test_invalid_field_name_should_panic() {
-        FieldEntry::new_text("-hello".to_string(), TEXT);
-    }
-
    #[test]
    fn test_json_serialization() {
        let field_value = FieldEntry::new_text(String::from("title"), TEXT);

src/schema/mod.rs

@@ -149,16 +149,14 @@ pub use self::int_options::IntOptions;
use once_cell::sync::Lazy;
use regex::Regex;

-/// Regular expression representing the restriction on valid field names.
-pub const FIELD_NAME_PATTERN: &'static str = r#"^[_a-zA-Z][_\-a-zA-Z0-9]*$"#;
-
/// Validator for a potential `field_name`.
/// Returns true iff the name can be used as a field name.
///
/// A field name must start with a letter `[a-zA-Z]`.
/// The other characters can be any alphanumeric character `[a-zA-Z0-9]` or `_`.
pub fn is_valid_field_name(field_name: &str) -> bool {
-    static FIELD_NAME_PTN: Lazy<Regex> = Lazy::new(|| Regex::new(FIELD_NAME_PATTERN).unwrap());
+    static FIELD_NAME_PTN: Lazy<Regex> =
+        Lazy::new(|| Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap());
    FIELD_NAME_PTN.is_match(field_name)
}
@@ -172,11 +170,6 @@ mod tests {
        assert!(is_valid_field_name("text"));
        assert!(is_valid_field_name("text0"));
        assert!(!is_valid_field_name("0text"));
-        assert!(is_valid_field_name("field-name"));
-        assert!(is_valid_field_name("field_name"));
-        assert!(!is_valid_field_name("field!name"));
-        assert!(!is_valid_field_name("-fieldname"));
-        assert!(is_valid_field_name("_fieldname"));
        assert!(!is_valid_field_name(""));
        assert!(!is_valid_field_name("シャボン玉"));
        assert!(is_valid_field_name("my_text_field"));