Compare commits

..

15 Commits

Author SHA1 Message Date
Paul Masurel
37e7af322d Reverting atomic_write to the atomic_writes in order to address #866 2020-09-19 10:39:43 +09:00
Paul Masurel
151498cbe7 Creating the tempfile for atomicwrites in the same directory as the MmapDirectory. (#878) 2020-09-05 23:06:29 +09:00
Paul Masurel
3a72b1cb98 Accept dash within field names. (#874)
Accept dash in field names and enforce field names constraint at the
creation of the schema.

Closes #796
2020-09-01 13:38:52 +09:00
Paul Masurel
2737822620 Fixing unit tests. (#868)
There was a unit test failing when notify was sending more
than one event on atomicwrites.

It was observed on MacOS CI.
2020-08-27 16:43:39 +09:00
b8591340
06c12ae221 Filter meta.json from validate_checksum (#872) 2020-08-27 07:54:37 +09:00
Paul Masurel
4e4400af7f Added cargo timing report to .gitignore 2020-08-23 16:15:28 +09:00
Paul Masurel
3f1ecf53ab Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-22 21:30:47 +09:00
Paul Masurel
0b583b8130 Plastic changes 2020-08-22 21:29:12 +09:00
Paul Masurel
31d18dca1c Removing dependency to atomicwrites (#866) 2020-08-21 21:37:05 +09:00
stephenlagree
5e06e7de5a Update basic_search.rs (#865)
Remove duplicated document entry.
2020-08-21 11:23:09 +09:00
Paul Masurel
8af53cbd36 Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-21 08:57:42 +09:00
Paul Masurel
4914076e8f Fixing release build 2020-08-21 08:57:27 +09:00
Paul Masurel
e04f47e922 Using block wand for term queries too. 2020-08-20 15:51:21 +09:00
Paul Masurel
f355695581 Code clean up 2020-08-20 15:42:50 +09:00
Paul Masurel
cbacdf0de8 Edited README. 2020-08-20 14:28:24 +09:00
29 changed files with 225 additions and 226 deletions

1
.gitignore vendored
View File

@@ -12,3 +12,4 @@ cpp/simdcomp/bitpackingbenchmark
*.bk *.bk
.idea .idea
trace.dat trace.dat
cargo-timing*

View File

@@ -1,13 +1,6 @@
Tantivy 0.13.2 Tantivy 0.14.0
=================== =========================
Bugfix. Acquiring a facet reader on a segment that does not contain any - Remove dependency to atomicwrites #833. (Implemented by @pmasurel upon suggestion and research from @asafigan).
doc with this facet returns `None`. (#896)
Tantivy 0.13.1
======================
Made `Query` and `Collector` `Send + Sync`.
Updated misc dependency versions.
Tantivy 0.13.0 Tantivy 0.13.0
====================== ======================

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.13.2" version = "0.14.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
@@ -13,21 +13,21 @@ keywords = ["search", "information", "retrieval"]
edition = "2018" edition = "2018"
[dependencies] [dependencies]
base64 = "0.12" base64 = "0.12.0"
byteorder = "1" byteorder = "1.0"
crc32fast = "1" crc32fast = "1.2.0"
once_cell = "1" once_cell = "1.0"
regex ={version = "1", default-features = false, features = ["std"]} regex ={version = "1.3.0", default-features = false, features = ["std"]}
tantivy-fst = "0.3" tantivy-fst = "0.3"
memmap = {version = "0.7", optional=true} memmap = {version = "0.7", optional=true}
lz4 = {version="1", optional=true} lz4 = {version="1.20", optional=true}
snap = "1" snap = "1"
atomicwrites = {version="0.2", optional=true} tempfile = {version="3.0", optional=true}
tempfile = "3" atomicwrites = "0.2"
log = "0.4" log = "0.4"
serde = {version="1", features=["derive"]} serde = {version="1.0", features=["derive"]}
serde_json = "1" serde_json = "1.0"
num_cpus = "1" num_cpus = "1.2"
fs2={version="0.4", optional=true} fs2={version="0.4", optional=true}
levenshtein_automata = "0.2" levenshtein_automata = "0.2"
notify = {version="4", optional=true} notify = {version="4", optional=true}
@@ -35,20 +35,20 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.7" crossbeam = "0.7"
futures = {version = "0.3", features=["thread-pool"] } futures = {version = "0.3", features=["thread-pool"] }
owning_ref = "0.4" owning_ref = "0.4"
stable_deref_trait = "1" stable_deref_trait = "1.0.0"
rust-stemmers = "1" rust-stemmers = "1.2"
downcast-rs = "1" downcast-rs = { version="1.0" }
tantivy-query-grammar = { version="0.13", path="./query-grammar" } tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]} bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
census = "0.4" census = "0.4"
fnv = "1" fnv = "1.0.6"
owned-read = "0.4" owned-read = "0.4"
failure = "0.1" failure = "0.1"
htmlescape = "0.3" htmlescape = "0.3.1"
fail = "0.4" fail = "0.4"
murmurhash32 = "0.2" murmurhash32 = "0.2"
chrono = "0.4" chrono = "0.4"
smallvec = "1" smallvec = "1.0"
rayon = "1" rayon = "1"
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
@@ -75,7 +75,7 @@ overflow-checks = true
[features] [features]
default = ["mmap"] default = ["mmap"]
mmap = ["atomicwrites", "fs2", "memmap", "notify"] mmap = ["fs2", "tempfile", "memmap", "notify"]
lz4-compression = ["lz4"] lz4-compression = ["lz4"]
failpoints = ["fail/failpoints"] failpoints = ["fail/failpoints"]
unstable = [] # useful for benches. unstable = [] # useful for benches.

View File

@@ -34,11 +34,6 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
The following [benchmark](https://tantivy-search.github.io/bench/) break downs The following [benchmark](https://tantivy-search.github.io/bench/) break downs
performance for different type of queries / collection. performance for different type of queries / collection.
In general, Tantivy tends to be
- slower than Lucene on union with a Top-K due to Block-WAND optimization.
- faster than Lucene on intersection and phrase queries.
Your mileage WILL vary depending on the nature of queries and their load. Your mileage WILL vary depending on the nature of queries and their load.
# Features # Features

View File

@@ -112,18 +112,6 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
)); ));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// Multivalued field just need to be repeated. // Multivalued field just need to be repeated.
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",

View File

@@ -56,7 +56,7 @@ fn main() -> tantivy::Result<()> {
); );
let top_docs_by_custom_score = let top_docs_by_custom_score =
TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| { TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
let ingredient_reader = segment_reader.facet_reader(ingredient).unwrap(); let mut ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
let facet_dict = ingredient_reader.facet_dict(); let facet_dict = ingredient_reader.facet_dict();
let query_ords: HashSet<u64> = facets let query_ords: HashSet<u64> = facets

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy-query-grammar" name = "tantivy-query-grammar"
version = "0.13.0" version = "0.14.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]

View File

@@ -52,7 +52,7 @@ mod test {
use crate::Occur; use crate::Occur;
#[test] #[test]
fn test_Occur_compose() { fn test_occur_compose() {
assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should); assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should);
assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must); assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
assert_eq!( assert_eq!(

View File

@@ -9,8 +9,10 @@ use combine::{
fn field<'a>() -> impl Parser<&'a str, Output = String> { fn field<'a>() -> impl Parser<&'a str, Output = String> {
( (
letter(), (letter().or(char('_'))),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')), many(satisfy(|c: char| {
c.is_alphanumeric() || c == '_' || c == '-'
})),
) )
.skip(char(':')) .skip(char(':'))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)) .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
@@ -279,6 +281,8 @@ pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
type TestParseResult = Result<(), StringStreamError>;
use super::*; use super::*;
use combine::parser::Parser; use combine::parser::Parser;
@@ -296,9 +300,10 @@ mod test {
} }
#[test] #[test]
fn test_occur_symbol() { fn test_occur_symbol() -> TestParseResult {
assert_eq!(super::occur_symbol().parse("-"), Ok((Occur::MustNot, ""))); assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, ""));
assert_eq!(super::occur_symbol().parse("+"), Ok((Occur::Must, ""))); assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, ""));
Ok(())
} }
#[test] #[test]
@@ -410,6 +415,25 @@ mod test {
assert_eq!(format!("{:?}", ast), "\"abc\""); assert_eq!(format!("{:?}", ast), "\"abc\"");
} }
#[test]
fn test_field_name() -> TestParseResult {
assert_eq!(
super::field().parse("my-field-name:a")?,
("my-field-name".to_string(), "a")
);
assert_eq!(
super::field().parse("my_field_name:a")?,
("my_field_name".to_string(), "a")
);
assert!(super::field().parse(":a").is_err());
assert!(super::field().parse("-my_field:a").is_err());
assert_eq!(
super::field().parse("_my_field:a")?,
("_my_field".to_string(), "a")
);
Ok(())
}
#[test] #[test]
fn test_range_parser() { fn test_range_parser() {
// testing the range() parser separately // testing the range() parser separately

View File

@@ -46,7 +46,7 @@ pub trait CustomScorer<TScore>: Sync {
impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore> impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
where where
TCustomScorer: CustomScorer<TScore> + Send + Sync, TCustomScorer: CustomScorer<TScore>,
TScore: 'static + PartialOrd + Clone + Send + Sync, TScore: 'static + PartialOrd + Clone + Send + Sync,
{ {
type Fruit = Vec<(TScore, DocAddress)>; type Fruit = Vec<(TScore, DocAddress)>;

View File

@@ -133,7 +133,7 @@ impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
/// The collection logic itself is in the `SegmentCollector`. /// The collection logic itself is in the `SegmentCollector`.
/// ///
/// Segments are not guaranteed to be visited in any specific order. /// Segments are not guaranteed to be visited in any specific order.
pub trait Collector: Sync + Send { pub trait Collector: Sync {
/// `Fruit` is the type for the result of our collection. /// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector. /// e.g. `usize` for the `Count` collector.
type Fruit: Fruit; type Fruit: Fruit;

View File

@@ -324,7 +324,7 @@ impl TopDocs {
where where
TScore: 'static + Send + Sync + Clone + PartialOrd, TScore: 'static + Send + Sync + Clone + PartialOrd,
TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static, TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker> + Send + Sync, TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
{ {
TweakedScoreTopCollector::new(score_tweaker, self.0.into_tscore()) TweakedScoreTopCollector::new(score_tweaker, self.0.into_tscore())
} }
@@ -438,7 +438,7 @@ impl TopDocs {
where where
TScore: 'static + Send + Sync + Clone + PartialOrd, TScore: 'static + Send + Sync + Clone + PartialOrd,
TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static, TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer> + Send + Sync, TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
{ {
CustomScoreTopCollector::new(custom_score, self.0.into_tscore()) CustomScoreTopCollector::new(custom_score, self.0.into_tscore())
} }

View File

@@ -49,7 +49,7 @@ pub trait ScoreTweaker<TScore>: Sync {
impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore> impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore>
where where
TScoreTweaker: ScoreTweaker<TScore> + Send + Sync, TScoreTweaker: ScoreTweaker<TScore>,
TScore: 'static + PartialOrd + Clone + Send + Sync, TScore: 'static + PartialOrd + Clone + Send + Sync,
{ {
type Fruit = Vec<(TScore, DocAddress)>; type Fruit = Vec<(TScore, DocAddress)>;

View File

@@ -539,7 +539,6 @@ mod tests {
test_index_on_commit_reload_policy_aux(field, &write_index, &reader); test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
} }
} }
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) { fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
let mut reader_index = reader.index(); let mut reader_index = reader.index();
let (sender, receiver) = crossbeam::channel::unbounded(); let (sender, receiver) = crossbeam::channel::unbounded();
@@ -550,12 +549,23 @@ mod tests {
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64)); writer.add_document(doc!(field=>1u64));
writer.commit().unwrap(); writer.commit().unwrap();
assert!(receiver.recv().is_ok()); // We need a loop here because it is possible for notify to send more than
assert_eq!(reader.searcher().num_docs(), 1); // one modify event. It was observed on CI on MacOS.
loop {
assert!(receiver.recv().is_ok());
if reader.searcher().num_docs() == 1 {
break;
}
}
writer.add_document(doc!(field=>2u64)); writer.add_document(doc!(field=>2u64));
writer.commit().unwrap(); writer.commit().unwrap();
assert!(receiver.recv().is_ok()); // ... Same as above
assert_eq!(reader.searcher().num_docs(), 2); loop {
assert!(receiver.recv().is_ok());
if reader.searcher().num_docs() == 2 {
break;
}
}
} }
// This test will not pass on windows, because windows // This test will not pass on windows, because windows

View File

@@ -112,10 +112,8 @@ impl SegmentReader {
return None; return None;
} }
let term_ords_reader = self.fast_fields().u64s(field)?; let term_ords_reader = self.fast_fields().u64s(field)?;
let termdict = self.termdict_composite let termdict_source = self.termdict_composite.open_read(field)?;
.open_read(field) let termdict = TermDictionary::from_source(&termdict_source);
.map(|source| TermDictionary::from_source(&source))
.unwrap_or_else(TermDictionary::empty);
let facet_reader = FacetReader::new(term_ords_reader, termdict); let facet_reader = FacetReader::new(term_ords_reader, termdict);
Some(facet_reader) Some(facet_reader)
} }

View File

@@ -1,4 +1,4 @@
use crate::core::MANAGED_FILEPATH; use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError}; use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy}; use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::DirectoryLock; use crate::directory::DirectoryLock;
@@ -246,13 +246,15 @@ impl ManagedDirectory {
/// List files for which checksum does not match content /// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> { pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut hashset = HashSet::new(); let mut hashset = HashSet::new();
let managed_paths = self let mut managed_paths = self
.meta_informations .meta_informations
.read() .read()
.expect("Managed directory rlock poisoned in list damaged.") .expect("Managed directory rlock poisoned in list damaged.")
.managed_paths .managed_paths
.clone(); .clone();
managed_paths.remove(*META_FILEPATH);
for path in managed_paths.into_iter() { for path in managed_paths.into_iter() {
if !self.validate_checksum(&path)? { if !self.validate_checksum(&path)? {
hashset.insert(path); hashset.insert(path);

View File

@@ -1,4 +1,5 @@
use crate::core::META_FILEPATH; use crate::core::META_FILEPATH;
use atomicwrites;
use crate::directory::error::LockError; use crate::directory::error::LockError;
use crate::directory::error::{ use crate::directory::error::{
DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError, DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError,
@@ -34,6 +35,7 @@ use std::sync::Mutex;
use std::sync::RwLock; use std::sync::RwLock;
use std::sync::Weak; use std::sync::Weak;
use std::thread; use std::thread;
use tempfile;
use tempfile::TempDir; use tempfile::TempDir;
/// Create a default io error given a string. /// Create a default io error given a string.
@@ -487,11 +489,11 @@ impl Directory for MmapDirectory {
} }
} }
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
debug!("Atomic Write {:?}", path); debug!("Atomic Write {:?}", path);
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite); let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
meta_file.write(|f| f.write_all(data))?; meta_file.write(|f| f.write_all(content))?;
Ok(()) Ok(())
} }

View File

@@ -211,19 +211,18 @@ fn test_watch(directory: &mut dyn Directory) {
.unwrap(); .unwrap();
for i in 0..10 { for i in 0..10 {
assert_eq!(i, counter.load(SeqCst)); assert!(i <= counter.load(SeqCst));
assert!(directory assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data_2") .atomic_write(Path::new("meta.json"), b"random_test_data_2")
.is_ok()); .is_ok());
assert_eq!(receiver.recv_timeout(Duration::from_millis(500)), Ok(i)); assert!(i + 1 <= counter.load(SeqCst)); // notify can trigger more than once.
assert_eq!(i + 1, counter.load(SeqCst));
} }
mem::drop(watch_handle); mem::drop(watch_handle);
assert!(directory assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data") .atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok()); .is_ok());
assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok()); assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok());
assert_eq!(10, counter.load(SeqCst)); assert!(10 <= counter.load(SeqCst));
} }
fn test_lock_non_blocking(directory: &mut dyn Directory) { fn test_lock_non_blocking(directory: &mut dyn Directory) {

View File

@@ -73,52 +73,7 @@ impl FacetReader {
} }
/// Return the list of facet ordinals associated to a document. /// Return the list of facet ordinals associated to a document.
pub fn facet_ords(&self, doc: DocId, output: &mut Vec<u64>) { pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec<u64>) {
self.term_ords.get_vals(doc, output); self.term_ords.get_vals(doc, output);
} }
} }
#[cfg(test)]
mod tests {
use crate::{Document, schema::{Facet, SchemaBuilder}};
use crate::Index;
#[test]
fn test_facet_not_populated_for_all_docs() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
index_writer.add_document(Document::default());
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader(facet_field).unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]);
facet_reader.facet_ords(1u32, &mut facet_ords);
assert!(facet_ords.is_empty());
Ok(())
}
#[test]
fn test_facet_not_populated_for_any_docs() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(Document::default());
index_writer.add_document(Document::default());
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader(facet_field).unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert!(facet_ords.is_empty());
facet_reader.facet_ords(1u32, &mut facet_ords);
assert!(facet_ords.is_empty());
Ok(())
}
}

View File

@@ -126,7 +126,6 @@ impl FastFieldsWriter {
for field_writer in &self.single_value_writers { for field_writer in &self.single_value_writers {
field_writer.serialize(serializer)?; field_writer.serialize(serializer)?;
} }
for field_writer in &self.multi_values_writers { for field_writer in &self.multi_values_writers {
let field = field_writer.field(); let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field))?; field_writer.serialize(serializer, mapping.get(&field))?;

View File

@@ -29,8 +29,9 @@ pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`. /// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy; pub type DefaultMergePolicy = LogMergePolicy;
#[cfg(feature = "mmap")]
#[cfg(test)] #[cfg(test)]
mod tests { mod tests_mmap {
use crate::schema::{self, Schema}; use crate::schema::{self, Schema};
use crate::{Index, Term}; use crate::{Index, Term};

View File

@@ -151,7 +151,7 @@ impl SegmentWriter {
if let Some(unordered_term_id) = unordered_term_id_opt { if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers self.fast_field_writers
.get_multivalue_writer(field) .get_multivalue_writer(field)
.expect("writer for facet missing") .expect("multified writer for facet missing")
.add_val(unordered_term_id); .add_val(unordered_term_id);
} }
} }

View File

@@ -1012,4 +1012,12 @@ mod tests {
DOC_COUNT as usize DOC_COUNT as usize
); );
} }
#[test]
fn test_validate_checksum() {
let index_path = tempfile::tempdir().expect("dir");
let schema = Schema::builder().build();
let index = Index::create_in_dir(&index_path, schema).expect("index");
assert!(index.validate_checksum().unwrap().is_empty());
}
} }

View File

@@ -4,19 +4,6 @@ use crate::{DocId, DocSet, Score, TERMINATED};
use std::ops::Deref; use std::ops::Deref;
use std::ops::DerefMut; use std::ops::DerefMut;
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
/// Takes a term_scorers sorted by their current doc() and a threshold and returns /// Takes a term_scorers sorted by their current doc() and a threshold and returns
/// Returns (pivot_len, pivot_ord) defined as follows: /// Returns (pivot_len, pivot_ord) defined as follows:
/// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score. /// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score.
@@ -55,37 +42,12 @@ fn find_pivot_doc(
Some((before_pivot_len, pivot_len, pivot_doc)) Some((before_pivot_len, pivot_len, pivot_doc))
} }
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
// Before and after calling this method, scorers need to be sorted by their `.doc()`. // Before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer( fn block_max_was_too_low_advance_one_scorer(
scorers: &mut Vec<TermScorerWithMaxScore>, scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_len: usize, pivot_len: usize,
) { ) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
let mut scorer_to_seek = pivot_len - 1; let mut scorer_to_seek = pivot_len - 1;
let mut doc_to_seek_after = scorers[scorer_to_seek].doc(); let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
for scorer_ord in (0..pivot_len - 1).rev() { for scorer_ord in (0..pivot_len - 1).rev() {
@@ -102,6 +64,7 @@ fn block_max_was_too_low_advance_one_scorer(
} }
scorers[scorer_to_seek].seek(doc_to_seek_after + 1); scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
restore_ordering(scorers, scorer_to_seek); restore_ordering(scorers, scorer_to_seek);
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
} }
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted // Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
@@ -177,64 +140,99 @@ pub fn block_wand(
.map(TermScorerWithMaxScore::from) .map(TermScorerWithMaxScore::from)
.collect(); .collect();
scorers.sort_by_key(|scorer| scorer.doc()); scorers.sort_by_key(|scorer| scorer.doc());
loop { // At this point we need to ensure that the scorers are sorted!
// At this point we need to ensure that the scorers are sorted! debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
find_pivot_doc(&scorers[..], threshold)
{
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
if let Some((before_pivot_len, pivot_len, pivot_doc)) = debug_assert_ne!(pivot_doc, TERMINATED);
find_pivot_doc(&scorers[..], threshold) debug_assert!(before_pivot_len < pivot_len);
{
debug_assert_ne!(pivot_doc, TERMINATED);
debug_assert!(before_pivot_len < pivot_len);
let block_max_score_upperbound: Score = scorers[..pivot_len] let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut() .iter_mut()
.map(|scorer| { .map(|scorer| {
scorer.shallow_seek(pivot_doc); scorer.shallow_seek(pivot_doc);
scorer.block_max_score() scorer.block_max_score()
}) })
.sum(); .sum();
// Beware after shallow advance, skip readers can be in advance compared to // Beware after shallow advance, skip readers can be in advance compared to
// the segment posting lists. // the segment posting lists.
// //
// `block_segment_postings.load_block()` need to be called separately. // `block_segment_postings.load_block()` need to be called separately.
if block_max_score_upperbound <= threshold { if block_max_score_upperbound <= threshold {
// Block max condition was not reached // Block max condition was not reached
// We could get away by simply advancing the scorers to DocId + 1 but it would // We could get away by simply advancing the scorers to DocId + 1 but it would
// be inefficient. The optimization requires proper explanation and was // be inefficient. The optimization requires proper explanation and was
// isolated in a different function. // isolated in a different function.
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len); block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
continue; continue;
}
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least of the scorer does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
} else {
return;
} }
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least of the scorer does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
} }
} }
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::query::score_combiner::SumCombiner; use crate::query::score_combiner::SumCombiner;
@@ -248,17 +246,21 @@ mod tests {
use std::iter; use std::iter;
struct Float(Score); struct Float(Score);
impl Eq for Float {} impl Eq for Float {}
impl PartialEq for Float { impl PartialEq for Float {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.cmp(&other) == Ordering::Equal self.cmp(&other) == Ordering::Equal
} }
} }
impl PartialOrd for Float { impl PartialOrd for Float {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl Ord for Float { impl Ord for Float {
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal) other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)

View File

@@ -40,7 +40,7 @@ use std::fmt;
/// ///
/// When implementing a new type of `Query`, it is normal to implement a /// When implementing a new type of `Query`, it is normal to implement a
/// dedicated `Query`, `Weight` and `Scorer`. /// dedicated `Query`, `Weight` and `Scorer`.
pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug { pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
/// Create the weight associated to a query. /// Create the weight associated to a query.
/// ///
/// If scoring is not required, setting `scoring_enabled` to `false` /// If scoring is not required, setting `scoring_enabled` to `false`

View File

@@ -4,7 +4,7 @@ use crate::docset::DocSet;
use crate::postings::SegmentPostings; use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight; use crate::query::bm25::BM25Weight;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer}; use crate::query::weight::for_each_scorer;
use crate::query::Weight; use crate::query::Weight;
use crate::query::{Explanation, Scorer}; use crate::query::{Explanation, Scorer};
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
@@ -73,8 +73,8 @@ impl Weight for TermWeight {
reader: &SegmentReader, reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score, callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> { ) -> crate::Result<()> {
let mut scorer = self.scorer(reader, 1.0)?; let scorer = self.specialized_scorer(reader, 1.0)?;
for_each_pruning_scorer(&mut scorer, threshold, callback); crate::query::boolean_query::block_wand(vec![scorer], threshold, callback);
Ok(()) Ok(())
} }
} }

View File

@@ -138,9 +138,11 @@ impl InnerIndexReader {
.collect::<crate::Result<_>>()? .collect::<crate::Result<_>>()?
}; };
let schema = self.index.schema(); let schema = self.index.schema();
let searchers = (0..self.num_searchers) let searchers = std::iter::repeat_with(|| {
.map(|_| Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())) Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
.collect(); })
.take(self.num_searchers)
.collect();
self.searcher_pool.publish_new_generation(searchers); self.searcher_pool.publish_new_generation(searchers);
Ok(()) Ok(())
} }

View File

@@ -1,5 +1,5 @@
use crate::schema::IntOptions;
use crate::schema::TextOptions; use crate::schema::TextOptions;
use crate::schema::{is_valid_field_name, IntOptions};
use crate::schema::FieldType; use crate::schema::FieldType;
use serde::de::{self, MapAccess, Visitor}; use serde::de::{self, MapAccess, Visitor};
@@ -24,6 +24,7 @@ impl FieldEntry {
/// Creates a new u64 field entry in the schema, given /// Creates a new u64 field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry { pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::Str(text_options), field_type: FieldType::Str(text_options),
@@ -33,6 +34,7 @@ impl FieldEntry {
/// Creates a new u64 field entry in the schema, given /// Creates a new u64 field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry { pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::U64(field_type), field_type: FieldType::U64(field_type),
@@ -42,6 +44,7 @@ impl FieldEntry {
/// Creates a new i64 field entry in the schema, given /// Creates a new i64 field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry { pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::I64(field_type), field_type: FieldType::I64(field_type),
@@ -51,6 +54,7 @@ impl FieldEntry {
/// Creates a new f64 field entry in the schema, given /// Creates a new f64 field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry { pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::F64(field_type), field_type: FieldType::F64(field_type),
@@ -60,6 +64,7 @@ impl FieldEntry {
/// Creates a new date field entry in the schema, given /// Creates a new date field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry { pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::Date(field_type), field_type: FieldType::Date(field_type),
@@ -68,6 +73,7 @@ impl FieldEntry {
/// Creates a field entry for a facet. /// Creates a field entry for a facet.
pub fn new_facet(field_name: String) -> FieldEntry { pub fn new_facet(field_name: String) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::HierarchicalFacet, field_type: FieldType::HierarchicalFacet,
@@ -76,6 +82,7 @@ impl FieldEntry {
/// Creates a field entry for a bytes field /// Creates a field entry for a bytes field
pub fn new_bytes(field_name: String) -> FieldEntry { pub fn new_bytes(field_name: String) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::Bytes, field_type: FieldType::Bytes,
@@ -268,6 +275,12 @@ mod tests {
use crate::schema::TEXT; use crate::schema::TEXT;
use serde_json; use serde_json;
#[test]
#[should_panic]
fn test_invalid_field_name_should_panic() {
FieldEntry::new_text("-hello".to_string(), TEXT);
}
#[test] #[test]
fn test_json_serialization() { fn test_json_serialization() {
let field_value = FieldEntry::new_text(String::from("title"), TEXT); let field_value = FieldEntry::new_text(String::from("title"), TEXT);

View File

@@ -149,14 +149,16 @@ pub use self::int_options::IntOptions;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use regex::Regex; use regex::Regex;
/// Regular expression representing the restriction on a valid field names.
pub const FIELD_NAME_PATTERN: &'static str = r#"^[_a-zA-Z][_\-a-zA-Z0-9]*$"#;
/// Validator for a potential `field_name`. /// Validator for a potential `field_name`.
/// Returns true iff the name can be used for a field name. /// Returns true iff the name can be used for a field name.
/// ///
/// A field name must start by a letter `[a-zA-Z]`. /// A field name must start by a letter `[a-zA-Z]`.
/// The other characters can be any alphanumeric character `[a-zA-Z0-9]` or `_`. /// The other characters can be any alphanumeric character `[a-zA-Z0-9]` or `_`.
pub fn is_valid_field_name(field_name: &str) -> bool { pub fn is_valid_field_name(field_name: &str) -> bool {
static FIELD_NAME_PTN: Lazy<Regex> = static FIELD_NAME_PTN: Lazy<Regex> = Lazy::new(|| Regex::new(FIELD_NAME_PATTERN).unwrap());
Lazy::new(|| Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap());
FIELD_NAME_PTN.is_match(field_name) FIELD_NAME_PTN.is_match(field_name)
} }
@@ -170,6 +172,11 @@ mod tests {
assert!(is_valid_field_name("text")); assert!(is_valid_field_name("text"));
assert!(is_valid_field_name("text0")); assert!(is_valid_field_name("text0"));
assert!(!is_valid_field_name("0text")); assert!(!is_valid_field_name("0text"));
assert!(is_valid_field_name("field-name"));
assert!(is_valid_field_name("field_name"));
assert!(!is_valid_field_name("field!name"));
assert!(!is_valid_field_name("-fieldname"));
assert!(is_valid_field_name("_fieldname"));
assert!(!is_valid_field_name("")); assert!(!is_valid_field_name(""));
assert!(!is_valid_field_name("シャボン玉")); assert!(!is_valid_field_name("シャボン玉"));
assert!(is_valid_field_name("my_text_field")); assert!(is_valid_field_name("my_text_field"));