mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 13:02:55 +00:00
Compare commits
4 Commits
issue/866b
...
quickfix-e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7720d21265 | ||
|
|
c339b05789 | ||
|
|
2d3c657f9d | ||
|
|
07f9b828ae |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -12,4 +12,3 @@ cpp/simdcomp/bitpackingbenchmark
|
||||
*.bk
|
||||
.idea
|
||||
trace.dat
|
||||
cargo-timing*
|
||||
|
||||
13
CHANGELOG.md
13
CHANGELOG.md
@@ -1,6 +1,13 @@
|
||||
Tantivy 0.14.0
|
||||
=========================
|
||||
- Remove dependency to atomicwrites #833 .Implemented by @pmasurel upon suggestion and research from @asafigan).
|
||||
Tantivy 0.13.2
|
||||
===================
|
||||
Bugfix. Acquiring a facet reader on a segment that does not contain any
|
||||
doc with this facet returns `None`. (#896)
|
||||
|
||||
Tantivy 0.13.1
|
||||
======================
|
||||
Made `Query` and `Collector` `Send + Sync`.
|
||||
Updated misc dependency versions.
|
||||
|
||||
|
||||
Tantivy 0.13.0
|
||||
======================
|
||||
|
||||
40
Cargo.toml
40
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.14.0-dev"
|
||||
version = "0.13.2"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -13,21 +13,21 @@ keywords = ["search", "information", "retrieval"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.12.0"
|
||||
byteorder = "1.0"
|
||||
crc32fast = "1.2.0"
|
||||
once_cell = "1.0"
|
||||
regex ={version = "1.3.0", default-features = false, features = ["std"]}
|
||||
base64 = "0.12"
|
||||
byteorder = "1"
|
||||
crc32fast = "1"
|
||||
once_cell = "1"
|
||||
regex ={version = "1", default-features = false, features = ["std"]}
|
||||
tantivy-fst = "0.3"
|
||||
memmap = {version = "0.7", optional=true}
|
||||
lz4 = {version="1.20", optional=true}
|
||||
lz4 = {version="1", optional=true}
|
||||
snap = "1"
|
||||
tempfile = {version="3.0", optional=true}
|
||||
atomicwrites = "0.2"
|
||||
atomicwrites = {version="0.2", optional=true}
|
||||
tempfile = "3"
|
||||
log = "0.4"
|
||||
serde = {version="1.0", features=["derive"]}
|
||||
serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
serde = {version="1", features=["derive"]}
|
||||
serde_json = "1"
|
||||
num_cpus = "1"
|
||||
fs2={version="0.4", optional=true}
|
||||
levenshtein_automata = "0.2"
|
||||
notify = {version="4", optional=true}
|
||||
@@ -35,20 +35,20 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
|
||||
crossbeam = "0.7"
|
||||
futures = {version = "0.3", features=["thread-pool"] }
|
||||
owning_ref = "0.4"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "1.2"
|
||||
downcast-rs = { version="1.0" }
|
||||
tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
|
||||
stable_deref_trait = "1"
|
||||
rust-stemmers = "1"
|
||||
downcast-rs = "1"
|
||||
tantivy-query-grammar = { version="0.13", path="./query-grammar" }
|
||||
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
|
||||
census = "0.4"
|
||||
fnv = "1.0.6"
|
||||
fnv = "1"
|
||||
owned-read = "0.4"
|
||||
failure = "0.1"
|
||||
htmlescape = "0.3.1"
|
||||
htmlescape = "0.3"
|
||||
fail = "0.4"
|
||||
murmurhash32 = "0.2"
|
||||
chrono = "0.4"
|
||||
smallvec = "1.0"
|
||||
smallvec = "1"
|
||||
rayon = "1"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
@@ -75,7 +75,7 @@ overflow-checks = true
|
||||
|
||||
[features]
|
||||
default = ["mmap"]
|
||||
mmap = ["fs2", "tempfile", "memmap", "notify"]
|
||||
mmap = ["atomicwrites", "fs2", "memmap", "notify"]
|
||||
lz4-compression = ["lz4"]
|
||||
failpoints = ["fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
|
||||
@@ -34,6 +34,11 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) break downs
|
||||
performance for different type of queries / collection.
|
||||
|
||||
|
||||
In general, Tantivy tends to be
|
||||
- slower than Lucene on union with a Top-K due to Block-WAND optimization.
|
||||
- faster than Lucene on intersection and phrase queries.
|
||||
|
||||
Your mileage WILL vary depending on the nature of queries and their load.
|
||||
|
||||
# Features
|
||||
|
||||
@@ -112,6 +112,18 @@ fn main() -> tantivy::Result<()> {
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
// Multivalued field just need to be repeated.
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
|
||||
@@ -56,7 +56,7 @@ fn main() -> tantivy::Result<()> {
|
||||
);
|
||||
let top_docs_by_custom_score =
|
||||
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
|
||||
let mut ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
|
||||
let ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
|
||||
let facet_dict = ingredient_reader.facet_dict();
|
||||
|
||||
let query_ords: HashSet<u64> = facets
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.14.0-dev"
|
||||
version = "0.13.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
|
||||
@@ -52,7 +52,7 @@ mod test {
|
||||
use crate::Occur;
|
||||
|
||||
#[test]
|
||||
fn test_occur_compose() {
|
||||
fn test_Occur_compose() {
|
||||
assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should);
|
||||
assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
|
||||
assert_eq!(
|
||||
|
||||
@@ -9,10 +9,8 @@ use combine::{
|
||||
|
||||
fn field<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
(
|
||||
(letter().or(char('_'))),
|
||||
many(satisfy(|c: char| {
|
||||
c.is_alphanumeric() || c == '_' || c == '-'
|
||||
})),
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
)
|
||||
.skip(char(':'))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
@@ -281,8 +279,6 @@ pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
type TestParseResult = Result<(), StringStreamError>;
|
||||
|
||||
use super::*;
|
||||
use combine::parser::Parser;
|
||||
|
||||
@@ -300,10 +296,9 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_occur_symbol() -> TestParseResult {
|
||||
assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, ""));
|
||||
assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, ""));
|
||||
Ok(())
|
||||
fn test_occur_symbol() {
|
||||
assert_eq!(super::occur_symbol().parse("-"), Ok((Occur::MustNot, "")));
|
||||
assert_eq!(super::occur_symbol().parse("+"), Ok((Occur::Must, "")));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -415,25 +410,6 @@ mod test {
|
||||
assert_eq!(format!("{:?}", ast), "\"abc\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_field_name() -> TestParseResult {
|
||||
assert_eq!(
|
||||
super::field().parse("my-field-name:a")?,
|
||||
("my-field-name".to_string(), "a")
|
||||
);
|
||||
assert_eq!(
|
||||
super::field().parse("my_field_name:a")?,
|
||||
("my_field_name".to_string(), "a")
|
||||
);
|
||||
assert!(super::field().parse(":a").is_err());
|
||||
assert!(super::field().parse("-my_field:a").is_err());
|
||||
assert_eq!(
|
||||
super::field().parse("_my_field:a")?,
|
||||
("_my_field".to_string(), "a")
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_parser() {
|
||||
// testing the range() parser separately
|
||||
|
||||
@@ -46,7 +46,7 @@ pub trait CustomScorer<TScore>: Sync {
|
||||
|
||||
impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
|
||||
where
|
||||
TCustomScorer: CustomScorer<TScore>,
|
||||
TCustomScorer: CustomScorer<TScore> + Send + Sync,
|
||||
TScore: 'static + PartialOrd + Clone + Send + Sync,
|
||||
{
|
||||
type Fruit = Vec<(TScore, DocAddress)>;
|
||||
|
||||
@@ -133,7 +133,7 @@ impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
|
||||
/// The collection logic itself is in the `SegmentCollector`.
|
||||
///
|
||||
/// Segments are not guaranteed to be visited in any specific order.
|
||||
pub trait Collector: Sync {
|
||||
pub trait Collector: Sync + Send {
|
||||
/// `Fruit` is the type for the result of our collection.
|
||||
/// e.g. `usize` for the `Count` collector.
|
||||
type Fruit: Fruit;
|
||||
|
||||
@@ -324,7 +324,7 @@ impl TopDocs {
|
||||
where
|
||||
TScore: 'static + Send + Sync + Clone + PartialOrd,
|
||||
TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
|
||||
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
|
||||
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker> + Send + Sync,
|
||||
{
|
||||
TweakedScoreTopCollector::new(score_tweaker, self.0.into_tscore())
|
||||
}
|
||||
@@ -438,7 +438,7 @@ impl TopDocs {
|
||||
where
|
||||
TScore: 'static + Send + Sync + Clone + PartialOrd,
|
||||
TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
|
||||
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
|
||||
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer> + Send + Sync,
|
||||
{
|
||||
CustomScoreTopCollector::new(custom_score, self.0.into_tscore())
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ pub trait ScoreTweaker<TScore>: Sync {
|
||||
|
||||
impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore>
|
||||
where
|
||||
TScoreTweaker: ScoreTweaker<TScore>,
|
||||
TScoreTweaker: ScoreTweaker<TScore> + Send + Sync,
|
||||
TScore: 'static + PartialOrd + Clone + Send + Sync,
|
||||
{
|
||||
type Fruit = Vec<(TScore, DocAddress)>;
|
||||
|
||||
@@ -539,6 +539,7 @@ mod tests {
|
||||
test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
|
||||
let mut reader_index = reader.index();
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
@@ -549,23 +550,12 @@ mod tests {
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64));
|
||||
writer.commit().unwrap();
|
||||
// We need a loop here because it is possible for notify to send more than
|
||||
// one modify event. It was observed on CI on MacOS.
|
||||
loop {
|
||||
assert!(receiver.recv().is_ok());
|
||||
if reader.searcher().num_docs() == 1 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert!(receiver.recv().is_ok());
|
||||
assert_eq!(reader.searcher().num_docs(), 1);
|
||||
writer.add_document(doc!(field=>2u64));
|
||||
writer.commit().unwrap();
|
||||
// ... Same as above
|
||||
loop {
|
||||
assert!(receiver.recv().is_ok());
|
||||
if reader.searcher().num_docs() == 2 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert!(receiver.recv().is_ok());
|
||||
assert_eq!(reader.searcher().num_docs(), 2);
|
||||
}
|
||||
|
||||
// This test will not pass on windows, because windows
|
||||
|
||||
@@ -112,8 +112,10 @@ impl SegmentReader {
|
||||
return None;
|
||||
}
|
||||
let term_ords_reader = self.fast_fields().u64s(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field)?;
|
||||
let termdict = TermDictionary::from_source(&termdict_source);
|
||||
let termdict = self.termdict_composite
|
||||
.open_read(field)
|
||||
.map(|source| TermDictionary::from_source(&source))
|
||||
.unwrap_or_else(TermDictionary::empty);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Some(facet_reader)
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
|
||||
use crate::core::MANAGED_FILEPATH;
|
||||
use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::footer::{Footer, FooterProxy};
|
||||
use crate::directory::DirectoryLock;
|
||||
@@ -246,15 +246,13 @@ impl ManagedDirectory {
|
||||
/// List files for which checksum does not match content
|
||||
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
|
||||
let mut hashset = HashSet::new();
|
||||
let mut managed_paths = self
|
||||
let managed_paths = self
|
||||
.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in list damaged.")
|
||||
.managed_paths
|
||||
.clone();
|
||||
|
||||
managed_paths.remove(*META_FILEPATH);
|
||||
|
||||
for path in managed_paths.into_iter() {
|
||||
if !self.validate_checksum(&path)? {
|
||||
hashset.insert(path);
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::core::META_FILEPATH;
|
||||
use atomicwrites;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::directory::error::{
|
||||
DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError,
|
||||
@@ -35,7 +34,6 @@ use std::sync::Mutex;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::Weak;
|
||||
use std::thread;
|
||||
use tempfile;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Create a default io error given a string.
|
||||
@@ -489,11 +487,11 @@ impl Directory for MmapDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
debug!("Atomic Write {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
|
||||
meta_file.write(|f| f.write_all(content))?;
|
||||
meta_file.write(|f| f.write_all(data))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -211,18 +211,19 @@ fn test_watch(directory: &mut dyn Directory) {
|
||||
.unwrap();
|
||||
|
||||
for i in 0..10 {
|
||||
assert!(i <= counter.load(SeqCst));
|
||||
assert_eq!(i, counter.load(SeqCst));
|
||||
assert!(directory
|
||||
.atomic_write(Path::new("meta.json"), b"random_test_data_2")
|
||||
.is_ok());
|
||||
assert!(i + 1 <= counter.load(SeqCst)); // notify can trigger more than once.
|
||||
assert_eq!(receiver.recv_timeout(Duration::from_millis(500)), Ok(i));
|
||||
assert_eq!(i + 1, counter.load(SeqCst));
|
||||
}
|
||||
mem::drop(watch_handle);
|
||||
assert!(directory
|
||||
.atomic_write(Path::new("meta.json"), b"random_test_data")
|
||||
.is_ok());
|
||||
assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok());
|
||||
assert!(10 <= counter.load(SeqCst));
|
||||
assert_eq!(10, counter.load(SeqCst));
|
||||
}
|
||||
|
||||
fn test_lock_non_blocking(directory: &mut dyn Directory) {
|
||||
|
||||
@@ -73,7 +73,52 @@ impl FacetReader {
|
||||
}
|
||||
|
||||
/// Return the list of facet ordinals associated to a document.
|
||||
pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec<u64>) {
|
||||
pub fn facet_ords(&self, doc: DocId, output: &mut Vec<u64>) {
|
||||
self.term_ords.get_vals(doc, output);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{Document, schema::{Facet, SchemaBuilder}};
|
||||
use crate::Index;
|
||||
|
||||
#[test]
|
||||
fn test_facet_not_populated_for_all_docs() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(Document::default());
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher.segment_reader(0u32).facet_reader(facet_field).unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
facet_reader.facet_ords(1u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_facet_not_populated_for_any_docs() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
index_writer.add_document(Document::default());
|
||||
index_writer.add_document(Document::default());
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher.segment_reader(0u32).facet_reader(facet_field).unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
facet_reader.facet_ords(1u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -126,6 +126,7 @@ impl FastFieldsWriter {
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
}
|
||||
|
||||
for field_writer in &self.multi_values_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field))?;
|
||||
|
||||
@@ -29,9 +29,8 @@ pub use self::segment_writer::SegmentWriter;
|
||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||
pub type DefaultMergePolicy = LogMergePolicy;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(test)]
|
||||
mod tests_mmap {
|
||||
mod tests {
|
||||
use crate::schema::{self, Schema};
|
||||
use crate::{Index, Term};
|
||||
|
||||
|
||||
@@ -151,7 +151,7 @@ impl SegmentWriter {
|
||||
if let Some(unordered_term_id) = unordered_term_id_opt {
|
||||
self.fast_field_writers
|
||||
.get_multivalue_writer(field)
|
||||
.expect("multified writer for facet missing")
|
||||
.expect("writer for facet missing")
|
||||
.add_val(unordered_term_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1012,12 +1012,4 @@ mod tests {
|
||||
DOC_COUNT as usize
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_validate_checksum() {
|
||||
let index_path = tempfile::tempdir().expect("dir");
|
||||
let schema = Schema::builder().build();
|
||||
let index = Index::create_in_dir(&index_path, schema).expect("index");
|
||||
assert!(index.validate_checksum().unwrap().is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,19 @@ use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
use std::ops::Deref;
|
||||
use std::ops::DerefMut;
|
||||
|
||||
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
|
||||
if let Some(first) = it.next() {
|
||||
let mut prev = first;
|
||||
for doc in it {
|
||||
if doc < prev {
|
||||
return false;
|
||||
}
|
||||
prev = doc;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Takes a term_scorers sorted by their current doc() and a threshold and returns
|
||||
/// Returns (pivot_len, pivot_ord) defined as follows:
|
||||
/// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score.
|
||||
@@ -42,12 +55,37 @@ fn find_pivot_doc(
|
||||
Some((before_pivot_len, pivot_len, pivot_doc))
|
||||
}
|
||||
|
||||
struct TermScorerWithMaxScore<'a> {
|
||||
scorer: &'a mut TermScorer,
|
||||
max_score: Score,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
|
||||
fn from(scorer: &'a mut TermScorer) -> Self {
|
||||
let max_score = scorer.max_score();
|
||||
TermScorerWithMaxScore { scorer, max_score }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for TermScorerWithMaxScore<'a> {
|
||||
type Target = TermScorer;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.scorer
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.scorer
|
||||
}
|
||||
}
|
||||
|
||||
// Before and after calling this method, scorers need to be sorted by their `.doc()`.
|
||||
fn block_max_was_too_low_advance_one_scorer(
|
||||
scorers: &mut Vec<TermScorerWithMaxScore>,
|
||||
pivot_len: usize,
|
||||
) {
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
let mut scorer_to_seek = pivot_len - 1;
|
||||
let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
|
||||
for scorer_ord in (0..pivot_len - 1).rev() {
|
||||
@@ -64,7 +102,6 @@ fn block_max_was_too_low_advance_one_scorer(
|
||||
}
|
||||
scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
|
||||
restore_ordering(scorers, scorer_to_seek);
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
}
|
||||
|
||||
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
|
||||
@@ -140,99 +177,64 @@ pub fn block_wand(
|
||||
.map(TermScorerWithMaxScore::from)
|
||||
.collect();
|
||||
scorers.sort_by_key(|scorer| scorer.doc());
|
||||
// At this point we need to ensure that the scorers are sorted!
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
|
||||
find_pivot_doc(&scorers[..], threshold)
|
||||
{
|
||||
loop {
|
||||
// At this point we need to ensure that the scorers are sorted!
|
||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||
debug_assert_ne!(pivot_doc, TERMINATED);
|
||||
debug_assert!(before_pivot_len < pivot_len);
|
||||
if let Some((before_pivot_len, pivot_len, pivot_doc)) =
|
||||
find_pivot_doc(&scorers[..], threshold)
|
||||
{
|
||||
debug_assert_ne!(pivot_doc, TERMINATED);
|
||||
debug_assert!(before_pivot_len < pivot_len);
|
||||
|
||||
let block_max_score_upperbound: Score = scorers[..pivot_len]
|
||||
.iter_mut()
|
||||
.map(|scorer| {
|
||||
scorer.shallow_seek(pivot_doc);
|
||||
scorer.block_max_score()
|
||||
})
|
||||
.sum();
|
||||
let block_max_score_upperbound: Score = scorers[..pivot_len]
|
||||
.iter_mut()
|
||||
.map(|scorer| {
|
||||
scorer.shallow_seek(pivot_doc);
|
||||
scorer.block_max_score()
|
||||
})
|
||||
.sum();
|
||||
|
||||
// Beware after shallow advance, skip readers can be in advance compared to
|
||||
// the segment posting lists.
|
||||
//
|
||||
// `block_segment_postings.load_block()` need to be called separately.
|
||||
if block_max_score_upperbound <= threshold {
|
||||
// Block max condition was not reached
|
||||
// We could get away by simply advancing the scorers to DocId + 1 but it would
|
||||
// be inefficient. The optimization requires proper explanation and was
|
||||
// isolated in a different function.
|
||||
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Block max condition is observed.
|
||||
//
|
||||
// Let's try and advance all scorers before the pivot to the pivot.
|
||||
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
|
||||
// At least of the scorer does not contain the pivot.
|
||||
// Beware after shallow advance, skip readers can be in advance compared to
|
||||
// the segment posting lists.
|
||||
//
|
||||
// Let's stop scoring this pivot and go through the pivot selection again.
|
||||
// Note that the current pivot is not necessarily a bad candidate and it
|
||||
// may be picked again.
|
||||
continue;
|
||||
}
|
||||
|
||||
// At this point, all scorers are positioned on the doc.
|
||||
let score = scorers[..pivot_len]
|
||||
.iter_mut()
|
||||
.map(|scorer| scorer.score())
|
||||
.sum();
|
||||
if score > threshold {
|
||||
threshold = callback(pivot_doc, score);
|
||||
}
|
||||
// let's advance all of the scorers that are currently positioned on the pivot.
|
||||
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
|
||||
}
|
||||
}
|
||||
|
||||
struct TermScorerWithMaxScore<'a> {
|
||||
scorer: &'a mut TermScorer,
|
||||
max_score: Score,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
|
||||
fn from(scorer: &'a mut TermScorer) -> Self {
|
||||
let max_score = scorer.max_score();
|
||||
TermScorerWithMaxScore { scorer, max_score }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for TermScorerWithMaxScore<'a> {
|
||||
type Target = TermScorer;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.scorer
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.scorer
|
||||
}
|
||||
}
|
||||
|
||||
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
|
||||
if let Some(first) = it.next() {
|
||||
let mut prev = first;
|
||||
for doc in it {
|
||||
if doc < prev {
|
||||
return false;
|
||||
// `block_segment_postings.load_block()` need to be called separately.
|
||||
if block_max_score_upperbound <= threshold {
|
||||
// Block max condition was not reached
|
||||
// We could get away by simply advancing the scorers to DocId + 1 but it would
|
||||
// be inefficient. The optimization requires proper explanation and was
|
||||
// isolated in a different function.
|
||||
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
|
||||
continue;
|
||||
}
|
||||
prev = doc;
|
||||
|
||||
// Block max condition is observed.
|
||||
//
|
||||
// Let's try and advance all scorers before the pivot to the pivot.
|
||||
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
|
||||
// At least of the scorer does not contain the pivot.
|
||||
//
|
||||
// Let's stop scoring this pivot and go through the pivot selection again.
|
||||
// Note that the current pivot is not necessarily a bad candidate and it
|
||||
// may be picked again.
|
||||
continue;
|
||||
}
|
||||
|
||||
// At this point, all scorers are positioned on the doc.
|
||||
let score = scorers[..pivot_len]
|
||||
.iter_mut()
|
||||
.map(|scorer| scorer.score())
|
||||
.sum();
|
||||
if score > threshold {
|
||||
threshold = callback(pivot_doc, score);
|
||||
}
|
||||
// let's advance all of the scorers that are currently positioned on the pivot.
|
||||
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::query::score_combiner::SumCombiner;
|
||||
@@ -246,21 +248,17 @@ mod tests {
|
||||
use std::iter;
|
||||
|
||||
struct Float(Score);
|
||||
|
||||
impl Eq for Float {}
|
||||
|
||||
impl PartialEq for Float {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(&other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Float {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Float {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)
|
||||
|
||||
@@ -40,7 +40,7 @@ use std::fmt;
|
||||
///
|
||||
/// When implementing a new type of `Query`, it is normal to implement a
|
||||
/// dedicated `Query`, `Weight` and `Scorer`.
|
||||
pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
|
||||
pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
|
||||
/// Create the weight associated to a query.
|
||||
///
|
||||
/// If scoring is not required, setting `scoring_enabled` to `false`
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::docset::DocSet;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::weight::for_each_scorer;
|
||||
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
|
||||
use crate::query::Weight;
|
||||
use crate::query::{Explanation, Scorer};
|
||||
use crate::schema::IndexRecordOption;
|
||||
@@ -73,8 +73,8 @@ impl Weight for TermWeight {
|
||||
reader: &SegmentReader,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) -> crate::Result<()> {
|
||||
let scorer = self.specialized_scorer(reader, 1.0)?;
|
||||
crate::query::boolean_query::block_wand(vec![scorer], threshold, callback);
|
||||
let mut scorer = self.scorer(reader, 1.0)?;
|
||||
for_each_pruning_scorer(&mut scorer, threshold, callback);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,11 +138,9 @@ impl InnerIndexReader {
|
||||
.collect::<crate::Result<_>>()?
|
||||
};
|
||||
let schema = self.index.schema();
|
||||
let searchers = std::iter::repeat_with(|| {
|
||||
Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
|
||||
})
|
||||
.take(self.num_searchers)
|
||||
.collect();
|
||||
let searchers = (0..self.num_searchers)
|
||||
.map(|_| Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone()))
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::TextOptions;
|
||||
use crate::schema::{is_valid_field_name, IntOptions};
|
||||
|
||||
use crate::schema::FieldType;
|
||||
use serde::de::{self, MapAccess, Visitor};
|
||||
@@ -24,7 +24,6 @@ impl FieldEntry {
|
||||
/// Creates a new u64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Str(text_options),
|
||||
@@ -34,7 +33,6 @@ impl FieldEntry {
|
||||
/// Creates a new u64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::U64(field_type),
|
||||
@@ -44,7 +42,6 @@ impl FieldEntry {
|
||||
/// Creates a new i64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::I64(field_type),
|
||||
@@ -54,7 +51,6 @@ impl FieldEntry {
|
||||
/// Creates a new f64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::F64(field_type),
|
||||
@@ -64,7 +60,6 @@ impl FieldEntry {
|
||||
/// Creates a new date field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Date(field_type),
|
||||
@@ -73,7 +68,6 @@ impl FieldEntry {
|
||||
|
||||
/// Creates a field entry for a facet.
|
||||
pub fn new_facet(field_name: String) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::HierarchicalFacet,
|
||||
@@ -82,7 +76,6 @@ impl FieldEntry {
|
||||
|
||||
/// Creates a field entry for a bytes field
|
||||
pub fn new_bytes(field_name: String) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Bytes,
|
||||
@@ -275,12 +268,6 @@ mod tests {
|
||||
use crate::schema::TEXT;
|
||||
use serde_json;
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_invalid_field_name_should_panic() {
|
||||
FieldEntry::new_text("-hello".to_string(), TEXT);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_serialization() {
|
||||
let field_value = FieldEntry::new_text(String::from("title"), TEXT);
|
||||
|
||||
@@ -149,16 +149,14 @@ pub use self::int_options::IntOptions;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
/// Regular expression representing the restriction on a valid field names.
|
||||
pub const FIELD_NAME_PATTERN: &'static str = r#"^[_a-zA-Z][_\-a-zA-Z0-9]*$"#;
|
||||
|
||||
/// Validator for a potential `field_name`.
|
||||
/// Returns true iff the name can be use for a field name.
|
||||
///
|
||||
/// A field name must start by a letter `[a-zA-Z]`.
|
||||
/// The other characters can be any alphanumic character `[a-ZA-Z0-9]` or `_`.
|
||||
pub fn is_valid_field_name(field_name: &str) -> bool {
|
||||
static FIELD_NAME_PTN: Lazy<Regex> = Lazy::new(|| Regex::new(FIELD_NAME_PATTERN).unwrap());
|
||||
static FIELD_NAME_PTN: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap());
|
||||
FIELD_NAME_PTN.is_match(field_name)
|
||||
}
|
||||
|
||||
@@ -172,11 +170,6 @@ mod tests {
|
||||
assert!(is_valid_field_name("text"));
|
||||
assert!(is_valid_field_name("text0"));
|
||||
assert!(!is_valid_field_name("0text"));
|
||||
assert!(is_valid_field_name("field-name"));
|
||||
assert!(is_valid_field_name("field_name"));
|
||||
assert!(!is_valid_field_name("field!name"));
|
||||
assert!(!is_valid_field_name("-fieldname"));
|
||||
assert!(is_valid_field_name("_fieldname"));
|
||||
assert!(!is_valid_field_name(""));
|
||||
assert!(!is_valid_field_name("シャボン玉"));
|
||||
assert!(is_valid_field_name("my_text_field"));
|
||||
|
||||
Reference in New Issue
Block a user