Compare commits


24 Commits

Author SHA1 Message Date
Paul Masurel
9eb87e91cc TermInfo contains the end_offset of the postings.
We slice the ReadOnlySource tightly.
2020-09-21 00:42:15 +09:00
Paul Masurel
36f43da4d8 Added Field stats to remove total num tokens from the beginning of posting list files 2020-09-19 23:23:03 +09:00
Paul Masurel
19a02b2c30 Merge tag '0.13.1'
0.13.1 was published as a hotfix to accommodate tantivy-py.
2020-09-19 21:20:27 +09:00
Paul Masurel
70bae7ce4c Removing Term Vec allocation (#881) 2020-09-08 23:11:00 +09:00
Paul Masurel
ac2a7273e6 Re-added comment to Score. 2020-09-08 21:41:34 +09:00
Paul Masurel
4ce9517a82 fix unit test for bench. remove scoref64 feature. fixed test for lz4 feature. 2020-09-08 07:35:00 +09:00
Paul Masurel
73024a8af3 Fixing compilation of bench and doctests. 2020-09-08 07:18:43 +09:00
Paul Masurel
e70e605fc3 fix unit test (at least on linux) 2020-09-07 23:35:04 +09:00
Paul Masurel
439d6956a9 Returning Result in some of the API (#880)
* Returning Result in some of the API

* Introducing `.writer_for_test(..)`
2020-09-07 15:52:34 +09:00
Paul Masurel
6530bf0eae Make field types less strict when populating documents. 2020-09-06 10:24:03 +09:00
Paul Masurel
151498cbe7 Creating the tempfile for atomicwrites in the same directory as the MmapDirectory. (#878) 2020-09-05 23:06:29 +09:00
Paul Masurel
3a72b1cb98 Accept dash within field names. (#874)
Accept dash in field names and enforce field names constraint at the
creation of the schema.

Closes #796
2020-09-01 13:38:52 +09:00
Paul Masurel
2737822620 Fixing unit tests. (#868)
There was a unit test failing when notify was sending more
than one event on atomicwrites.

It was observed on MacOS CI.
2020-08-27 16:43:39 +09:00
b8591340
06c12ae221 Filter meta.json from validate_checksum (#872) 2020-08-27 07:54:37 +09:00
Paul Masurel
4e4400af7f Added cargo timing report to .gitignore 2020-08-23 16:15:28 +09:00
Paul Masurel
3f1ecf53ab Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-22 21:30:47 +09:00
Paul Masurel
0b583b8130 Plastic changes 2020-08-22 21:29:12 +09:00
Paul Masurel
31d18dca1c Removing dependency to atomicwrites (#866) 2020-08-21 21:37:05 +09:00
stephenlagree
5e06e7de5a Update basic_search.rs (#865)
Remove duplicated document entry.
2020-08-21 11:23:09 +09:00
Paul Masurel
8af53cbd36 Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-21 08:57:42 +09:00
Paul Masurel
4914076e8f Fixing release build 2020-08-21 08:57:27 +09:00
Paul Masurel
e04f47e922 Using block wand for term queries too. 2020-08-20 15:51:21 +09:00
Paul Masurel
f355695581 Code clean up 2020-08-20 15:42:50 +09:00
Paul Masurel
cbacdf0de8 Edited README. 2020-08-20 14:28:24 +09:00
65 changed files with 824 additions and 701 deletions

.gitignore

@@ -12,3 +12,4 @@ cpp/simdcomp/bitpackingbenchmark
 *.bk
 .idea
 trace.dat
+cargo-timing*

@@ -1,14 +1,12 @@
-Tantivy 0.13.2
-===================
-Bugfix. Acquiring a facet reader on a segment that does not contain any
-doc with this facet returns `None`. (#896)
+Tantivy 0.14.0
+=========================
+- Remove dependency to atomicwrites #833. (Implemented by @pmasurel upon suggestion and research from @asafigan.)
 Tantivy 0.13.1
-======================
+===================
 Made `Query` and `Collector` `Send + Sync`.
 Updated misc dependency versions.
 Tantivy 0.13.0
 ======================
 Tantivy 0.13 introduce a change in the index format that will require

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.13.2"
+version = "0.14.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -22,8 +22,7 @@ tantivy-fst = "0.3"
 memmap = {version = "0.7", optional=true}
 lz4 = {version="1", optional=true}
 snap = "1"
-atomicwrites = {version="0.2", optional=true}
-tempfile = "3"
+tempfile = {version="3", optional=true}
 log = "0.4"
 serde = {version="1", features=["derive"]}
 serde_json = "1"
@@ -35,10 +34,10 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
 crossbeam = "0.7"
 futures = {version = "0.3", features=["thread-pool"] }
 owning_ref = "0.4"
+tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
 stable_deref_trait = "1"
 rust-stemmers = "1"
 downcast-rs = "1"
-tantivy-query-grammar = { version="0.13", path="./query-grammar" }
 bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
 census = "0.4"
 fnv = "1"
@@ -75,12 +74,11 @@ overflow-checks = true
 [features]
 default = ["mmap"]
-mmap = ["atomicwrites", "fs2", "memmap", "notify"]
+mmap = ["fs2", "tempfile", "memmap", "notify"]
 lz4-compression = ["lz4"]
 failpoints = ["fail/failpoints"]
 unstable = [] # useful for benches.
 wasm-bindgen = ["uuid/wasm-bindgen"]
-scoref64 = [] # scores are f64 instead of f32. was introduced to debug blockwand.
 [workspace]
 members = ["query-grammar"]

@@ -34,11 +34,6 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
 The following [benchmark](https://tantivy-search.github.io/bench/) break downs
 performance for different type of queries / collection.
-In general, Tantivy tends to be
-- slower than Lucene on union with a Top-K due to Block-WAND optimization.
-- faster than Lucene on intersection and phrase queries.
 Your mileage WILL vary depending on the nature of queries and their load.
 # Features

doc/src/index-format.md (new file)

@@ -0,0 +1,50 @@
# Managed files
+----------+-----------+-------------------+
| content | footer | footer_len: u32 |
+----------+-----------+-------------------+
# Term Dictionary (Composite File)
+---------+---------------------------+------------------------+
| fst | term_info_store | footer_len: u64 |
+---------+---------------------------+------------------------+
During a merge, the term info store needs to fit in memory.
It has a cost of n bytes per term.
# term_info_store
+-------------------+---------------------------+------------------------+
| len_block_meta | block_meta | term_infos |
+-------------------+---------------------------+------------------------+
# inverted_index
+------------------------+---------------------------+------------------------+
| total_num_tokens: u64 | posting_lists.. | term_infos |
+------------------------+---------------------------+------------------------+
# postings lists
+------------------------+---------------------------+------------------------+
|
+
# composite file
+----------------+-----+----------------+-----------------------+-----------------+
| field file 1   | ... | field file n   | composite file footer | footer len: u32 |
+----------------+-----+----------------+-----------------------+-----------------+
# composite file footer
+-----------------+---------------------------------------+
|num fields: vint | (file_addr, offset_delta: vint) []... |
+-----------------+---------------------------------------+
# FileAddr
+--------------+--------------+
| field: u32 | idx: VInt |
+--------------+--------------+
# Posting lists
+-----------------------------------------+
| skip_reader                             |
+-----------------------------------------+

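To make the layout above concrete, here is a rough sketch of decoding the composite-file footer. The LEB128-style varint (low seven bits per byte, high bit set while more bytes follow) and the exact ordering of the FileAddr fields are assumptions read off the diagrams, not tantivy's actual `CompositeFile` code:

// A rough sketch, not tantivy's real implementation. No error handling:
// the input slices are assumed to be well formed.
fn read_vint(bytes: &mut &[u8]) -> u64 {
    let mut value = 0u64;
    let mut shift = 0u32;
    loop {
        let byte = bytes[0];
        *bytes = &bytes[1..];
        value |= u64::from(byte & 0x7f) << shift;
        if byte & 0x80 == 0 {
            return value;
        }
        shift += 7;
    }
}

fn read_u32_le(bytes: &mut &[u8]) -> u32 {
    let (head, tail) = bytes.split_at(4);
    *bytes = tail;
    u32::from_le_bytes([head[0], head[1], head[2], head[3]])
}

// Returns (field, idx, end_offset) for every sub-file of the composite file.
// Offsets are stored as deltas, so they are accumulated while reading.
fn parse_composite_footer(mut footer: &[u8]) -> Vec<(u32, u64, u64)> {
    let num_fields = read_vint(&mut footer);
    let mut offset = 0u64;
    let mut entries = Vec::with_capacity(num_fields as usize);
    for _ in 0..num_fields {
        let field = read_u32_le(&mut footer); // FileAddr.field: u32
        let idx = read_vint(&mut footer);     // FileAddr.idx: VInt
        offset += read_vint(&mut footer);     // offset_delta: vint
        entries.push((field, idx, offset));
    }
    entries
}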
@@ -112,18 +112,6 @@ fn main() -> tantivy::Result<()> {
     limbs and branches that arch over the pool"
     ));
-    index_writer.add_document(doc!(
-    title => "Of Mice and Men",
-    body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
-    bank and runs deep and green. The water is warm too, for it has slipped twinkling \
-    over the yellow sands in the sunlight before reaching the narrow pool. On one \
-    side of the river the golden foothill slopes curve up to the strong and rocky \
-    Gabilan Mountains, but on the valley side the water is lined with trees—willows \
-    fresh and green with every spring, carrying in their lower leaf junctures the \
-    debris of the winters flooding; and sycamores with mottled, white, recumbent \
-    limbs and branches that arch over the pool"
-    ));
     // Multivalued field just need to be repeated.
     index_writer.add_document(doc!(
     title => "Frankenstein",

@@ -56,7 +56,7 @@ fn main() -> tantivy::Result<()> {
     );
     let top_docs_by_custom_score =
         TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
-            let ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
+            let mut ingredient_reader = segment_reader.facet_reader(ingredient).unwrap();
             let facet_dict = ingredient_reader.facet_dict();
             let query_ords: HashSet<u64> = facets

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-query-grammar"
-version = "0.13.0"
+version = "0.14.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]

@@ -52,7 +52,7 @@ mod test {
     use crate::Occur;
     #[test]
-    fn test_Occur_compose() {
+    fn test_occur_compose() {
         assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should);
         assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
         assert_eq!(

@@ -9,8 +9,10 @@ use combine::{
 fn field<'a>() -> impl Parser<&'a str, Output = String> {
     (
-        letter(),
-        many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
+        (letter().or(char('_'))),
+        many(satisfy(|c: char| {
+            c.is_alphanumeric() || c == '_' || c == '-'
+        })),
     )
     .skip(char(':'))
     .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
@@ -279,6 +281,8 @@ pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
 #[cfg(test)]
 mod test {
+    type TestParseResult = Result<(), StringStreamError>;
+
     use super::*;
     use combine::parser::Parser;
@@ -296,9 +300,10 @@ mod test {
     }
     #[test]
-    fn test_occur_symbol() {
-        assert_eq!(super::occur_symbol().parse("-"), Ok((Occur::MustNot, "")));
-        assert_eq!(super::occur_symbol().parse("+"), Ok((Occur::Must, "")));
+    fn test_occur_symbol() -> TestParseResult {
+        assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, ""));
+        assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, ""));
+        Ok(())
     }
     #[test]
@@ -410,6 +415,25 @@ mod test {
         assert_eq!(format!("{:?}", ast), "\"abc\"");
     }
+    #[test]
+    fn test_field_name() -> TestParseResult {
+        assert_eq!(
+            super::field().parse("my-field-name:a")?,
+            ("my-field-name".to_string(), "a")
+        );
+        assert_eq!(
+            super::field().parse("my_field_name:a")?,
+            ("my_field_name".to_string(), "a")
+        );
+        assert!(super::field().parse(":a").is_err());
+        assert!(super::field().parse("-my_field:a").is_err());
+        assert_eq!(
+            super::field().parse("_my_field:a")?,
+            ("_my_field".to_string(), "a")
+        );
+        Ok(())
+    }
     #[test]
     fn test_range_parser() {
         // testing the range() parser separately

@@ -472,7 +472,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         let num_facets: usize = 3 * 4 * 5;
         let facets: Vec<Facet> = (0..num_facets)
             .map(|mut n| {
@@ -531,7 +531,7 @@ mod tests {
         let facet_field = schema_builder.add_facet_field("facets");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/subjects/A/a"),
             facet_field => Facet::from_text(&"/subjects/B/a"),
@@ -550,12 +550,12 @@
     }
     #[test]
-    fn test_doc_search_by_facet() {
+    fn test_doc_search_by_facet() -> crate::Result<()> {
         let mut schema_builder = Schema::builder();
         let facet_field = schema_builder.add_facet_field("facet");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests()?;
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/A/A"),
         ));
@@ -568,8 +568,8 @@
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/D/C/A"),
         ));
-        index_writer.commit().unwrap();
-        let reader = index.reader().unwrap();
+        index_writer.commit()?;
+        let reader = index.reader()?;
         let searcher = reader.searcher();
         assert_eq!(searcher.num_docs(), 4);
@@ -586,17 +586,17 @@
         assert_eq!(count_facet("/A/C"), 1);
         assert_eq!(count_facet("/A/C/A"), 1);
         assert_eq!(count_facet("/C/A"), 0);
-        {
-            let query_parser = QueryParser::for_index(&index, vec![]);
-            {
-                let query = query_parser.parse_query("facet:/A/B").unwrap();
-                assert_eq!(1, searcher.search(&query, &Count).unwrap());
-            }
-            {
-                let query = query_parser.parse_query("facet:/A").unwrap();
-                assert_eq!(3, searcher.search(&query, &Count).unwrap());
-            }
-        }
+        let query_parser = QueryParser::for_index(&index, vec![]);
+        {
+            let query = query_parser.parse_query("facet:/A/B")?;
+            assert_eq!(1, searcher.search(&query, &Count).unwrap());
+        }
+        {
+            let query = query_parser.parse_query("facet:/A")?;
+            assert_eq!(3, searcher.search(&query, &Count)?);
+        }
+        Ok(())
     }
     #[test]
@@ -631,7 +631,7 @@ mod tests {
             .collect();
         docs[..].shuffle(&mut thread_rng());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         for doc in docs {
             index_writer.add_document(doc);
         }
@@ -684,7 +684,7 @@ mod bench {
         // 40425 docs
         docs[..].shuffle(&mut thread_rng());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
        for doc in docs {
            index_writer.add_document(doc);
        }

@@ -89,7 +89,7 @@ mod tests {
         let index = Index::create_in_ram(schema.clone());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             {
                 for i in 0u64..10u64 {
                     index_writer.add_document(doc!(

@@ -259,7 +259,7 @@ mod tests {
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(text=>"abc"));
             index_writer.add_document(doc!(text=>"abc abc abc"));
             index_writer.add_document(doc!(text=>"abc abc"));

@@ -38,7 +38,7 @@ use std::fmt;
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);
 ///
-/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
 /// index_writer.add_document(doc!(title => "The Name of the Wind"));
 /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
 /// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -123,7 +123,7 @@ impl TopDocs {
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);
 ///
-/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
 /// index_writer.add_document(doc!(title => "The Name of the Wind"));
 /// index_writer.add_document(doc!(title => "The Diary of Muadib"));
 /// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -163,7 +163,7 @@ impl TopDocs {
 /// # let schema = schema_builder.build();
 /// #
 /// # let index = Index::create_in_ram(schema);
-/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
 /// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
 /// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
 /// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
@@ -264,7 +264,7 @@ impl TopDocs {
 /// fn create_index() -> tantivy::Result<Index> {
 ///   let schema = create_schema();
 ///   let index = Index::create_in_ram(schema);
-///   let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+///   let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
 ///   let product_name = index.schema().get_field("product_name").unwrap();
 ///   let popularity: Field = index.schema().get_field("popularity").unwrap();
 ///   index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
@@ -371,7 +371,7 @@ impl TopDocs {
 /// # fn main() -> tantivy::Result<()> {
 /// #   let schema = create_schema();
 /// #   let index = Index::create_in_ram(schema);
-/// #   let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+/// #   let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
 /// #   let product_name = index.schema().get_field("product_name").unwrap();
 /// #
 /// let popularity: Field = index.schema().get_field("popularity").unwrap();
@@ -561,7 +561,7 @@ mod tests {
         let index = Index::create_in_ram(schema);
         {
             // writing the segment
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
             index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
             index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
             index_writer.add_document(doc!(text_field=>"I like Droopy"));
@@ -821,7 +821,7 @@ mod tests {
     ) -> (Index, Box<dyn Query>) {
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
         doc_adder(&mut index_writer);
         index_writer.commit().unwrap();
         let query_parser = QueryParser::for_index(&index, vec![query_field]);

@@ -300,6 +300,15 @@ impl Index {
         )
     }
+    /// Helper to create an index writer for tests.
+    ///
+    /// That index writer has a single thread and a heap of 10 MB.
+    /// Using a single thread gives us a deterministic allocation of DocId.
+    #[cfg(test)]
+    pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
+        self.writer_with_num_threads(1, 10_000_000)
+    }
     /// Creates a multithreaded writer
     ///
     /// Tantivy will automatically define the number of threads to use.
@@ -502,7 +511,7 @@ mod tests {
         let schema = throw_away_schema();
         let field = schema.get_field("num_likes").unwrap();
         let mut index = Index::create_from_tempdir(schema).unwrap();
-        let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut writer = index.writer_for_tests().unwrap();
         writer.commit().unwrap();
         let reader = index
             .reader_builder()
@@ -539,23 +548,33 @@ mod tests {
             test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
         }
     }
     fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
         let mut reader_index = reader.index();
         let (sender, receiver) = crossbeam::channel::unbounded();
         let _watch_handle = reader_index.directory_mut().watch(Box::new(move || {
             let _ = sender.send(());
         }));
-        let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut writer = index.writer_for_tests().unwrap();
         assert_eq!(reader.searcher().num_docs(), 0);
         writer.add_document(doc!(field=>1u64));
         writer.commit().unwrap();
-        assert!(receiver.recv().is_ok());
-        assert_eq!(reader.searcher().num_docs(), 1);
+        // We need a loop here because it is possible for notify to send more
+        // than one modify event. It was observed on CI on MacOS.
+        loop {
+            assert!(receiver.recv().is_ok());
+            if reader.searcher().num_docs() == 1 {
+                break;
+            }
+        }
         writer.add_document(doc!(field=>2u64));
         writer.commit().unwrap();
-        assert!(receiver.recv().is_ok());
-        assert_eq!(reader.searcher().num_docs(), 2);
+        // ... Same as above
+        loop {
+            assert!(receiver.recv().is_ok());
+            if reader.searcher().num_docs() == 2 {
+                break;
+            }
+        }
     }
     // This test will not pass on windows, because windows

@@ -116,6 +116,7 @@ impl SegmentMeta {
             SegmentComponent::FASTFIELDS => ".fast".to_string(),
             SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
             SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
+            SegmentComponent::FIELDSTATS => ".fieldstats".to_string(),
         });
         PathBuf::from(path)
     }

@@ -1,9 +1,7 @@
-use crate::common::BinarySerializable;
 use crate::directory::ReadOnlySource;
 use crate::positions::PositionReader;
 use crate::postings::TermInfo;
 use crate::postings::{BlockSegmentPostings, SegmentPostings};
-use crate::schema::FieldType;
 use crate::schema::IndexRecordOption;
 use crate::schema::Term;
 use crate::termdict::TermDictionary;
@@ -37,14 +35,12 @@ impl InvertedIndexReader {
         postings_source: ReadOnlySource,
         positions_source: ReadOnlySource,
         positions_idx_source: ReadOnlySource,
+        total_num_tokens: u64,
         record_option: IndexRecordOption,
     ) -> InvertedIndexReader {
-        let total_num_tokens_data = postings_source.slice(0, 8);
-        let mut total_num_tokens_cursor = total_num_tokens_data.as_slice();
-        let total_num_tokens = u64::deserialize(&mut total_num_tokens_cursor).unwrap_or(0u64);
         InvertedIndexReader {
             termdict,
-            postings_source: postings_source.slice_from(8),
+            postings_source,
             positions_source,
             positions_idx_source,
             record_option,
@@ -54,10 +50,7 @@ impl InvertedIndexReader {
     /// Creates an empty `InvertedIndexReader` object, which
     /// contains no terms at all.
-    pub fn empty(field_type: &FieldType) -> InvertedIndexReader {
-        let record_option = field_type
-            .get_index_record_option()
-            .unwrap_or(IndexRecordOption::Basic);
+    pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
         InvertedIndexReader {
             termdict: TermDictionary::empty(),
             postings_source: ReadOnlySource::empty(),
@@ -93,7 +86,7 @@ impl InvertedIndexReader {
         term_info: &TermInfo,
         block_postings: &mut BlockSegmentPostings,
     ) {
-        let offset = term_info.postings_offset as usize;
+        let offset = term_info.postings_start_offset as usize;
         let end_source = self.postings_source.len();
         let postings_slice = self.postings_source.slice(offset, end_source);
         block_postings.reset(term_info.doc_freq, postings_slice);
@@ -121,8 +114,10 @@ impl InvertedIndexReader {
         term_info: &TermInfo,
         requested_option: IndexRecordOption,
     ) -> BlockSegmentPostings {
-        let offset = term_info.postings_offset as usize;
-        let postings_data = self.postings_source.slice_from(offset);
+        let postings_data = self.postings_source.slice(
+            term_info.postings_start_offset as usize,
+            term_info.postings_end_offset as usize,
+        );
         BlockSegmentPostings::from_data(
             term_info.doc_freq,
             postings_data,

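The hunks above are the heart of the "TermInfo contains the end_offset of the postings" commit: once a term's info records where its postings end as well as where they begin, the reader can slice the postings source tightly instead of handing out an open-ended suffix of the file. A minimal sketch of the idea, with hypothetical types that merely mirror the names in the diff:

// Hypothetical mirror of the fields used in the diff, not tantivy's
// actual definitions.
struct TermInfo {
    doc_freq: u32,
    postings_start_offset: u64,
    postings_end_offset: u64,
}

// With both offsets known, the postings bytes of one term are an exact
// slice; previously only the start offset was stored, so the reader had
// to take everything from start_offset to the end of the file.
fn postings_bytes<'a>(postings_file: &'a [u8], info: &TermInfo) -> &'a [u8] {
    &postings_file[info.postings_start_offset as usize..info.postings_end_offset as usize]
}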
@@ -24,14 +24,17 @@ pub enum SegmentComponent {
     /// Accessing a document from the store is relatively slow, as it
     /// requires to decompress the entire block it belongs to.
     STORE,
     /// Bitset describing which document of the segment is deleted.
     DELETE,
+    FIELDSTATS,
 }
 impl SegmentComponent {
     /// Iterates through the components.
     pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
-        static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
+        static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [
             SegmentComponent::POSTINGS,
             SegmentComponent::POSITIONS,
             SegmentComponent::POSITIONSSKIP,
@@ -40,6 +43,7 @@ impl SegmentComponent {
             SegmentComponent::TERMS,
             SegmentComponent::STORE,
             SegmentComponent::DELETE,
+            SegmentComponent::FIELDSTATS,
         ];
         SEGMENT_COMPONENTS.iter()
     }

@@ -1,4 +1,3 @@
-use crate::common::CompositeFile;
 use crate::common::HasLen;
 use crate::core::InvertedIndexReader;
 use crate::core::Segment;
@@ -9,13 +8,14 @@ use crate::fastfield::DeleteBitSet;
 use crate::fastfield::FacetReader;
 use crate::fastfield::FastFieldReaders;
 use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
-use crate::schema::Field;
 use crate::schema::FieldType;
 use crate::schema::Schema;
+use crate::schema::{Field, IndexRecordOption};
 use crate::space_usage::SegmentSpaceUsage;
 use crate::store::StoreReader;
 use crate::termdict::TermDictionary;
 use crate::DocId;
+use crate::{common::CompositeFile, postings::FieldStats};
 use fail::fail_point;
 use std::collections::HashMap;
 use std::fmt;
@@ -49,6 +49,7 @@ pub struct SegmentReader {
     positions_idx_composite: CompositeFile,
     fast_fields_readers: Arc<FastFieldReaders>,
     fieldnorm_readers: FieldNormReaders,
+    field_stats: FieldStats,
     store_source: ReadOnlySource,
     delete_bitset_opt: Option<DeleteBitSet>,
@@ -112,10 +113,8 @@ impl SegmentReader {
             return None;
         }
         let term_ords_reader = self.fast_fields().u64s(field)?;
-        let termdict = self.termdict_composite
-            .open_read(field)
-            .map(|source| TermDictionary::from_source(&source))
-            .unwrap_or_else(TermDictionary::empty);
+        let termdict_source = self.termdict_composite.open_read(field)?;
+        let termdict = TermDictionary::from_source(&termdict_source);
         let facet_reader = FacetReader::new(term_ords_reader, termdict);
         Some(facet_reader)
     }
@@ -127,17 +126,15 @@ impl SegmentReader {
     ///
     /// They are simply stored as a fast field, serialized in
     /// the `.fieldnorm` file of the segment.
-    pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
-        if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
-            fieldnorm_reader
-        } else {
+    pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> {
+        self.fieldnorm_readers.get_field(field).ok_or_else(|| {
             let field_name = self.schema.get_field_name(field);
             let err_msg = format!(
                 "Field norm not found for field {:?}. Was it marked as indexed during indexing?",
                 field_name
             );
-            panic!(err_msg);
-        }
+            crate::TantivyError::SchemaError(err_msg)
+        })
     }
     /// Accessor to the segment's `StoreReader`.
@@ -183,6 +180,9 @@ impl SegmentReader {
         let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
         let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
+        let field_stats_data = segment.open_read(SegmentComponent::FIELDSTATS)?;
+        let field_stats = FieldStats::from_source(field_stats_data.as_slice())?;
         let delete_bitset_opt = if segment.meta().has_deletes() {
             let delete_data = segment.open_read(SegmentComponent::DELETE)?;
             Some(DeleteBitSet::open(delete_data))
@@ -198,6 +198,7 @@ impl SegmentReader {
             postings_composite,
             fast_fields_readers: fast_field_readers,
             fieldnorm_readers,
+            field_stats,
             segment_id: segment.id(),
             store_source,
             delete_bitset_opt,
@@ -214,6 +215,11 @@ impl SegmentReader {
     /// The field reader is in charge of iterating through the
     /// term dictionary associated to a specific field,
     /// and opening the posting list associated to any term.
+    ///
+    /// If the field is not marked as indexed, a warning is logged and an empty
+    /// `InvertedIndexReader` is returned.
+    /// Similarly, if the field is marked as indexed but no term was indexed for it
+    /// in this segment, an empty `InvertedIndexReader` is returned (but no warning is logged).
     pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
         if let Some(inv_idx_reader) = self
             .inv_idx_reader_cache
@@ -228,21 +234,21 @@ impl SegmentReader {
         let record_option_opt = field_type.get_index_record_option();
         if record_option_opt.is_none() {
-            panic!("Field {:?} does not seem indexed.", field_entry.name());
+            warn!("Field {:?} does not seem indexed.", field_entry.name());
         }
-        let record_option = record_option_opt.unwrap();
         let postings_source_opt = self.postings_composite.open_read(field);
-        if postings_source_opt.is_none() {
+        if postings_source_opt.is_none() || record_option_opt.is_none() {
             // no documents in the segment contained this field.
             // As a result, no data is associated to the inverted index.
             //
             // Returns an empty inverted index.
-            return Arc::new(InvertedIndexReader::empty(field_type));
+            let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
+            return Arc::new(InvertedIndexReader::empty(record_option));
         }
+        let record_option = record_option_opt.unwrap();
         let postings_source = postings_source_opt.unwrap();
         let termdict_source = self.termdict_composite.open_read(field).expect(
@@ -259,11 +265,17 @@ impl SegmentReader {
             .open_read(field)
             .expect("Index corrupted. Failed to open field positions in composite file.");
+        let total_num_tokens = self
+            .field_stats
+            .get(field)
+            .map(|field_stat| field_stat.num_tokens())
+            .unwrap_or(0u64);
         let inv_idx_reader = Arc::new(InvertedIndexReader::new(
             TermDictionary::from_source(&termdict_source),
             postings_source,
             positions_source,
             positions_idx_source,
+            total_num_tokens,
             record_option,
         ));
@@ -341,7 +353,7 @@ mod test {
         let name = schema.get_field("name").unwrap();
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_for_tests().unwrap();
             index_writer.add_document(doc!(name => "tantivy"));
             index_writer.add_document(doc!(name => "horse"));
             index_writer.add_document(doc!(name => "jockey"));

@@ -271,7 +271,11 @@ mod tests {
         let mut vec = Vec::new();
         let footer_proxy = FooterProxy::new(&mut vec);
         assert!(footer_proxy.terminate().is_ok());
-        assert_eq!(vec.len(), 167);
+        if crate::store::COMPRESSION == "lz4" {
+            assert_eq!(vec.len(), 158);
+        } else {
+            assert_eq!(vec.len(), 167);
+        }
         let footer = Footer::deserialize(&mut &vec[..]).unwrap();
         assert!(matches!(
             footer.versioned_footer,

@@ -1,4 +1,4 @@
-use crate::core::MANAGED_FILEPATH;
+use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
 use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
 use crate::directory::footer::{Footer, FooterProxy};
 use crate::directory::DirectoryLock;
@@ -246,13 +246,15 @@ impl ManagedDirectory {
     /// List files for which checksum does not match content
     pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
         let mut hashset = HashSet::new();
-        let managed_paths = self
+        let mut managed_paths = self
             .meta_informations
             .read()
             .expect("Managed directory rlock poisoned in list damaged.")
             .managed_paths
             .clone();
+        managed_paths.remove(*META_FILEPATH);
         for path in managed_paths.into_iter() {
             if !self.validate_checksum(&path)? {
                 hashset.insert(path);

@@ -487,11 +487,13 @@ impl Directory for MmapDirectory {
         }
     }
-    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
+    fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
         debug!("Atomic Write {:?}", path);
+        let mut tempfile = tempfile::Builder::new().tempfile_in(&self.inner.root_path)?;
+        tempfile.write_all(content)?;
+        tempfile.flush()?;
         let full_path = self.resolve_path(path);
-        let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
-        meta_file.write(|f| f.write_all(data))?;
+        tempfile.into_temp_path().persist(full_path)?;
         Ok(())
     }
@@ -652,7 +654,7 @@ mod tests {
     {
         let index = Index::create(mmap_directory.clone(), schema).unwrap();
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         let mut log_merge_policy = LogMergePolicy::default();
         log_merge_policy.set_min_merge_size(3);
         index_writer.set_merge_policy(Box::new(log_merge_policy));

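For context on the hunk above: the pattern that replaces atomicwrites writes the payload to a temporary file created inside the destination directory, flushes it, then renames it over the target. On POSIX the rename is atomic as long as both paths live on the same filesystem, which is exactly why the temp file is created next to the target rather than in /tmp. A standalone sketch using the tempfile crate (the helper itself is hypothetical, not tantivy's API):

use std::io::Write;
use std::path::Path;

// Atomically replace (or create) `dir/name` with `content`.
// `persist` renames the temp file over the destination; tempfile converts
// its `PathPersistError` into `std::io::Error`, so `?` works here.
fn atomic_write(dir: &Path, name: &str, content: &[u8]) -> std::io::Result<()> {
    let mut tmp = tempfile::Builder::new().tempfile_in(dir)?;
    tmp.write_all(content)?;
    tmp.flush()?;
    tmp.into_temp_path().persist(dir.join(name))?;
    Ok(())
}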
@@ -211,19 +211,19 @@ fn test_watch(directory: &mut dyn Directory) {
         .unwrap();
     for i in 0..10 {
-        assert_eq!(i, counter.load(SeqCst));
+        assert!(i <= counter.load(SeqCst));
         assert!(directory
             .atomic_write(Path::new("meta.json"), b"random_test_data_2")
             .is_ok());
         assert_eq!(receiver.recv_timeout(Duration::from_millis(500)), Ok(i));
-        assert_eq!(i + 1, counter.load(SeqCst));
+        assert!(i + 1 <= counter.load(SeqCst)); // notify can trigger more than once.
     }
     mem::drop(watch_handle);
     assert!(directory
         .atomic_write(Path::new("meta.json"), b"random_test_data")
         .is_ok());
     assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok());
-    assert_eq!(10, counter.load(SeqCst));
+    assert!(10 <= counter.load(SeqCst));
 }
 fn test_lock_non_blocking(directory: &mut dyn Directory) {

@@ -15,7 +15,7 @@ mod tests {
         let field = schema_builder.add_bytes_field("bytesfield");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
         index_writer.add_document(doc!(field=>vec![]));
         index_writer.add_document(doc!(field=>vec![255u8]));

@@ -73,52 +73,7 @@ impl FacetReader {
     }
     /// Return the list of facet ordinals associated to a document.
-    pub fn facet_ords(&self, doc: DocId, output: &mut Vec<u64>) {
+    pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec<u64>) {
         self.term_ords.get_vals(doc, output);
     }
 }
-#[cfg(test)]
-mod tests {
-    use crate::{Document, schema::{Facet, SchemaBuilder}};
-    use crate::Index;
-    #[test]
-    fn test_facet_not_populated_for_all_docs() -> crate::Result<()> {
-        let mut schema_builder = SchemaBuilder::default();
-        let facet_field = schema_builder.add_facet_field("facet");
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
-        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
-        index_writer.add_document(Document::default());
-        index_writer.commit()?;
-        let searcher = index.reader()?.searcher();
-        let facet_reader = searcher.segment_reader(0u32).facet_reader(facet_field).unwrap();
-        let mut facet_ords = Vec::new();
-        facet_reader.facet_ords(0u32, &mut facet_ords);
-        assert_eq!(&facet_ords, &[2u64]);
-        facet_reader.facet_ords(1u32, &mut facet_ords);
-        assert!(facet_ords.is_empty());
-        Ok(())
-    }
-    #[test]
-    fn test_facet_not_populated_for_any_docs() -> crate::Result<()> {
-        let mut schema_builder = SchemaBuilder::default();
-        let facet_field = schema_builder.add_facet_field("facet");
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
-        index_writer.add_document(Document::default());
-        index_writer.add_document(Document::default());
-        index_writer.commit()?;
-        let searcher = index.reader()?.searcher();
-        let facet_reader = searcher.segment_reader(0u32).facet_reader(facet_field).unwrap();
-        let mut facet_ords = Vec::new();
-        facet_reader.facet_ords(0u32, &mut facet_ords);
-        assert!(facet_ords.is_empty());
-        facet_reader.facet_ords(1u32, &mut facet_ords);
-        assert!(facet_ords.is_empty());
-        Ok(())
-    }
-}

@@ -474,7 +474,7 @@ mod tests {
         let date_field = schema_builder.add_date_field("date", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.set_merge_policy(Box::new(NoMergePolicy));
         index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
         index_writer.commit().unwrap();
@@ -511,7 +511,7 @@ mod tests {
         );
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.set_merge_policy(Box::new(NoMergePolicy));
         index_writer.add_document(doc!(
             date_field => crate::DateTime::from_u64(1i64.to_u64()),

@@ -25,7 +25,7 @@ mod tests {
         );
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(field=>1u64, field=>3u64));
         index_writer.add_document(doc!());
         index_writer.add_document(doc!(field=>4u64));
@@ -64,7 +64,7 @@ mod tests {
         schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         let first_time_stamp = chrono::Utc::now();
         index_writer.add_document(
             doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
@@ -186,7 +186,7 @@ mod tests {
         );
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(field=> 1i64, field => 3i64));
         index_writer.add_document(doc!());
         index_writer.add_document(doc!(field=> -4i64));
@@ -221,7 +221,7 @@ mod tests {
         let field = schema_builder.add_facet_field("facetfield");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         for i in 0..100_000 {
             index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
         }

@@ -74,7 +74,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index
-            .writer_with_num_threads(1, 30_000_000)
+            .writer_for_tests()
             .expect("Failed to create index writer.");
         index_writer.add_document(doc!(
             facet_field => Facet::from("/category/cat2"),

@@ -126,7 +126,6 @@ impl FastFieldsWriter {
         for field_writer in &self.single_value_writers {
             field_writer.serialize(serializer)?;
         }
-
         for field_writer in &self.multi_values_writers {
             let field = field_writer.field();
             field_writer.serialize(serializer, mapping.get(&field))?;

@@ -800,7 +800,7 @@ mod tests {
         let mut schema_builder = schema::Schema::builder();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());
-        let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let index_writer = index.writer_for_tests().unwrap();
         let operations = vec![
             UserOperation::Add(doc!(text_field=>"a")),
             UserOperation::Add(doc!(text_field=>"b")),
@@ -815,7 +815,7 @@ mod tests {
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(text_field => "hello1"));
         index_writer.add_document(doc!(text_field => "hello2"));
         assert!(index_writer.commit().is_ok());
@@ -864,7 +864,7 @@ mod tests {
             .reload_policy(ReloadPolicy::Manual)
             .try_into()
             .unwrap();
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         let a_term = Term::from_field_text(text_field, "a");
         let b_term = Term::from_field_text(text_field, "b");
         let operations = vec![
@@ -926,8 +926,8 @@ mod tests {
     fn test_lockfile_already_exists_error_msg() {
         let schema_builder = schema::Schema::builder();
         let index = Index::create_in_ram(schema_builder.build());
-        let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        match index.writer_with_num_threads(1, 3_000_000) {
+        let _index_writer = index.writer_for_tests().unwrap();
+        match index.writer_for_tests() {
             Err(err) => {
                 let err_msg = err.to_string();
                 assert!(err_msg.contains("already an `IndexWriter`"));
@@ -1261,7 +1261,7 @@ mod tests {
         let idfield = schema_builder.add_text_field("id", STRING);
         schema_builder.add_text_field("optfield", STRING);
         let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(idfield=>"myid"));
         let commit = index_writer.commit();
         assert!(commit.is_ok());

View File

@@ -25,14 +25,14 @@ use std::cmp;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 { fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_tokens = 0u64; let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256]; let mut count: [usize; 256] = [0; 256];
for reader in readers { for reader in readers {
if reader.has_deletes() { if reader.has_deletes() {
// if there are deletes, then we use an approximation // if there are deletes, then we use an approximation
// using the fieldnorm // using the fieldnorm
let fieldnorms_reader = reader.get_fieldnorms_reader(field); let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc in reader.doc_ids_alive() { for doc in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc); let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1; count[fieldnorm_id as usize] += 1;
@@ -41,7 +41,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens(); total_tokens += reader.inverted_index(field).total_num_tokens();
} }
} }
total_tokens Ok(total_tokens
+ count + count
.iter() .iter()
.cloned() .cloned()
@@ -49,7 +49,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
.map(|(fieldnorm_ord, count)| { .map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8)) count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
}) })
.sum::<u64>() .sum::<u64>())
} }
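
`compute_total_num_tokens` now returns `crate::Result<u64>` so a failed fieldnorm read propagates instead of panicking, and its deleted-docs branch estimates the token count from a 256-bucket fieldnorm histogram. The estimation, reduced to a self-contained sketch (`id_to_fieldnorm` stands in for tantivy's `FieldNormReader::id_to_fieldnorm` decode table):

    // Bucket the alive documents by fieldnorm id, then sum the decoded
    // fieldnorm of each bucket. This is only an approximation because the
    // fieldnorm is a quantized, lossy encoding of the token count.
    fn approximate_total_tokens(
        alive_fieldnorm_ids: impl Iterator<Item = u8>,
        id_to_fieldnorm: impl Fn(u8) -> u32,
    ) -> u64 {
        let mut count = [0u64; 256];
        for fieldnorm_id in alive_fieldnorm_ids {
            count[fieldnorm_id as usize] += 1;
        }
        count
            .iter()
            .enumerate()
            .map(|(id, &c)| c * u64::from(id_to_fieldnorm(id as u8)))
            .sum()
    }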
pub struct IndexMerger { pub struct IndexMerger {
@@ -175,7 +175,7 @@ impl IndexMerger {
for field in fields { for field in fields {
fieldnorms_data.clear(); fieldnorms_data.clear();
for reader in &self.readers { for reader in &self.readers {
let fieldnorms_reader = reader.get_fieldnorms_reader(field); let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc_id in reader.doc_ids_alive() { for doc_id in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id); let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id);
fieldnorms_data.push(fieldnorm_id); fieldnorms_data.push(fieldnorm_id);
@@ -541,7 +541,7 @@ impl IndexMerger {
// The total number of tokens will only be exact when there have been no deletes. // The total number of tokens will only be exact when there have been no deletes.
// //
// Otherwise, we approximate by removing deleted documents proportionally. // Otherwise, we approximate by removing deleted documents proportionally.
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field); let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
// Create the total list of doc ids // Create the total list of doc ids
// by stacking the doc ids from the different segment. // by stacking the doc ids from the different segment.
@@ -751,7 +751,7 @@ mod tests {
}; };
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
{ {
// writing the segment // writing the segment
{ {
@@ -803,7 +803,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed"); block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap(); index_writer.wait_merging_threads().unwrap();
} }
@@ -904,7 +904,7 @@ mod tests {
let score_field = schema_builder.add_u64_field("score", score_fieldtype); let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes"); let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let search_term = |searcher: &Searcher, term: Term| { let search_term = |searcher: &Searcher, term: Term| {
let collector = FastFieldTestCollector::for_field(score_field); let collector = FastFieldTestCollector::for_field(score_field);
@@ -1211,7 +1211,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| { let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default(); let mut doc = Document::default();
for facet in doc_facets { for facet in doc_facets {
@@ -1276,7 +1276,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed"); block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap(); index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap(); reader.reload().unwrap();
@@ -1295,7 +1295,7 @@ mod tests {
// Deleting one term // Deleting one term
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]); let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet); let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term); index_writer.delete_term(facet_term);
@@ -1320,7 +1320,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED); let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64)); index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed"); index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64)); index_writer.add_document(doc!(int_field => 1u64));
@@ -1349,7 +1349,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_u64(int_field, 1); doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone()); index_writer.add_document(doc.clone());
@@ -1388,7 +1388,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| { let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default(); let mut doc = Document::default();
for &val in int_vals { for &val in int_vals {
@@ -1462,7 +1462,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok()); assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
} }
@@ -1516,7 +1516,7 @@ mod tests {
let index = Index::create_in_ram(builder.build()); let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_with_num_threads(1, 3_000_000)?; let mut writer = index.writer_for_tests()?;
// Make sure we'll attempt to merge every created segment // Make sure we'll attempt to merge every created segment
let mut policy = crate::indexer::LogMergePolicy::default(); let mut policy = crate::indexer::LogMergePolicy::default();
@@ -1548,7 +1548,7 @@ mod tests {
let mut builder = schema::SchemaBuilder::new(); let mut builder = schema::SchemaBuilder::new();
let text = builder.add_text_field("text", TEXT); let text = builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(builder.build()); let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_with_num_threads(1, 3_000_000)?; let mut writer = index.writer_for_tests()?;
let happy_term = Term::from_field_text(text, "happy"); let happy_term = Term::from_field_text(text, "happy");
let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs); let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs);
for _ in 0..62 { for _ in 0..62 {

View File

@@ -29,8 +29,9 @@ pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`. /// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy; pub type DefaultMergePolicy = LogMergePolicy;
#[cfg(feature = "mmap")]
#[cfg(test)] #[cfg(test)]
mod tests { mod tests_mmap {
use crate::schema::{self, Schema}; use crate::schema::{self, Schema};
use crate::{Index, Term}; use crate::{Index, Term};
@@ -39,7 +40,7 @@ mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build()).unwrap(); let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// there must be one deleted document in the segment // there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b")); index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b")); index_writer.delete_term(Term::from_field_text(text_field, "b"));
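
These tests go through `Index::create_from_tempdir`, which is only compiled when the `mmap` feature is enabled, hence the rename to `tests_mmap` and the extra gate. Under this pattern, a `cargo test --no-default-features` build (assuming `mmap` is a default feature) compiles the module away instead of failing:

    // Both attributes must hold for the module to be compiled at all.
    #[cfg(feature = "mmap")]
    #[cfg(test)]
    mod tests_mmap {
        // filesystem-backed tests live here
    }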

View File

@@ -555,7 +555,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(MergeWheneverPossible)); index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{ {
@@ -608,7 +608,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
{ {
for _ in 0..100 { for _ in 0..100 {
@@ -679,7 +679,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
{ {
for _ in 0..100 { for _ in 0..100 {

View File

@@ -17,7 +17,6 @@ use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp; use crate::Opstamp;
use crate::{DocId, SegmentComponent}; use crate::{DocId, SegmentComponent};
use std::io; use std::io;
use std::str;
/// Computes the initial size of the hash table. /// Computes the initial size of the hash table.
/// ///
@@ -48,6 +47,7 @@ pub struct SegmentWriter {
fieldnorms_writer: FieldNormsWriter, fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>, doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<TextAnalyzer>>, tokenizers: Vec<Option<TextAnalyzer>>,
term_buffer: Term,
} }
impl SegmentWriter { impl SegmentWriter {
@@ -91,6 +91,7 @@ impl SegmentWriter {
fast_field_writers: FastFieldsWriter::from_schema(schema), fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000), doc_opstamps: Vec::with_capacity(1_000),
tokenizers, tokenizers,
term_buffer: Term::new(),
}) })
} }
@@ -128,30 +129,32 @@ impl SegmentWriter {
if !field_options.is_indexed() { if !field_options.is_indexed() {
continue; continue;
} }
let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings);
match *field_options.field_type() { match *field_options.field_type() {
FieldType::HierarchicalFacet => { FieldType::HierarchicalFacet => {
let facets: Vec<&str> = field_values term_buffer.set_field(field);
.iter() let facets =
.flat_map(|field_value| match *field_value.value() { field_values
Value::Facet(ref facet) => Some(facet.encoded_str()), .iter()
_ => { .flat_map(|field_value| match *field_value.value() {
panic!("Expected hierarchical facet"); Value::Facet(ref facet) => Some(facet.encoded_str()),
} _ => {
}) panic!("Expected hierarchical facet");
.collect(); }
let mut term = Term::for_field(field); // we set the Term });
for fake_str in facets { for fake_str in facets {
let mut unordered_term_id_opt = None; let mut unordered_term_id_opt = None;
FacetTokenizer.token_stream(fake_str).process(&mut |token| { FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text); term_buffer.set_text(&token.text);
let unordered_term_id = let unordered_term_id =
self.multifield_postings.subscribe(doc_id, &term); multifield_postings.subscribe(doc_id, &term_buffer);
unordered_term_id_opt = Some(unordered_term_id); unordered_term_id_opt = Some(unordered_term_id);
}); });
if let Some(unordered_term_id) = unordered_term_id_opt { if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers self.fast_field_writers
.get_multivalue_writer(field) .get_multivalue_writer(field)
.expect("writer for facet missing") .expect("multified writer for facet missing")
.add_val(unordered_term_id); .add_val(unordered_term_id);
} }
} }
@@ -168,7 +171,6 @@ impl SegmentWriter {
if let Some(last_token) = tok_str.tokens.last() { if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to; total_offset += last_token.offset_to;
} }
token_streams token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into()); .push(PreTokenizedStream::from(tok_str.clone()).into());
} }
@@ -178,7 +180,6 @@ impl SegmentWriter {
{ {
offsets.push(total_offset); offsets.push(total_offset);
total_offset += text.len(); total_offset += text.len();
token_streams.push(tokenizer.token_stream(text)); token_streams.push(tokenizer.token_stream(text));
} }
} }
@@ -190,8 +191,12 @@ impl SegmentWriter {
0 0
} else { } else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams); let mut token_stream = TokenStreamChain::new(offsets, token_streams);
self.multifield_postings multifield_postings.index_text(
.index_text(doc_id, field, &mut token_stream) doc_id,
field,
&mut token_stream,
term_buffer,
)
}; };
self.fieldnorms_writer.record(doc_id, field, num_tokens); self.fieldnorms_writer.record(doc_id, field, num_tokens);
@@ -199,44 +204,36 @@ impl SegmentWriter {
FieldType::U64(ref int_option) => { FieldType::U64(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {
let term = Term::from_field_u64( term_buffer.set_field(field_value.field());
field_value.field(), term_buffer.set_u64(field_value.value().u64_value());
field_value.value().u64_value(), multifield_postings.subscribe(doc_id, &term_buffer);
);
self.multifield_postings.subscribe(doc_id, &term);
} }
} }
} }
FieldType::Date(ref int_option) => { FieldType::Date(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {
let term = Term::from_field_i64( term_buffer.set_field(field_value.field());
field_value.field(), term_buffer.set_i64(field_value.value().date_value().timestamp());
field_value.value().date_value().timestamp(), multifield_postings.subscribe(doc_id, &term_buffer);
);
self.multifield_postings.subscribe(doc_id, &term);
} }
} }
} }
FieldType::I64(ref int_option) => { FieldType::I64(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {
let term = Term::from_field_i64( term_buffer.set_field(field_value.field());
field_value.field(), term_buffer.set_i64(field_value.value().i64_value());
field_value.value().i64_value(), multifield_postings.subscribe(doc_id, &term_buffer);
);
self.multifield_postings.subscribe(doc_id, &term);
} }
} }
} }
FieldType::F64(ref int_option) => { FieldType::F64(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {
let term = Term::from_field_f64( term_buffer.set_field(field_value.field());
field_value.field(), term_buffer.set_f64(field_value.value().f64_value());
field_value.value().f64_value(), multifield_postings.subscribe(doc_id, &term_buffer);
);
self.multifield_postings.subscribe(doc_id, &term);
} }
} }
} }
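
The pattern running through this file: every `Term::from_field_u64/i64/f64` constructor allocated a fresh buffer per indexed value, and the rewrite threads one reusable `term_buffer` through the writer instead (#881). A minimal stand-alone sketch of the idea, with a stand-in `Term` layout of a 4-byte big-endian field id followed by the value bytes (which is what the `set_field`/`set_text` call order above implies):

    // Stand-in illustration of the reusable buffer, not tantivy's actual Term.
    struct Term(Vec<u8>);

    impl Term {
        fn new() -> Term {
            Term(Vec::with_capacity(100))
        }
        /// Resets the buffer so it holds only the 4-byte field prefix.
        fn set_field(&mut self, field_id: u32) {
            self.0.clear();
            self.0.extend_from_slice(&field_id.to_be_bytes());
        }
        /// Replaces the value bytes while keeping the field prefix.
        fn set_text(&mut self, text: &str) {
            self.0.truncate(4);
            self.0.extend_from_slice(text.as_bytes());
        }
    }

Reusing one buffer this way turns a per-value heap allocation into a memcpy into an already-warm buffer, which matters on the hot indexing path.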

View File

@@ -245,18 +245,10 @@ pub type DocId = u32;
/// with opstamp `n+1`. /// with opstamp `n+1`.
pub type Opstamp = u64; pub type Opstamp = u64;
/// A Score that represents the relevance of the document to the query
///
/// This is modelled internally as a `f64`, because tantivy was compiled with the `scoref64`
/// feature. The larger the number, the more relevant the document is to the search query.
#[cfg(feature = "scoref64")]
pub type Score = f64;
/// A Score that represents the relevance of the document to the query /// A Score that represents the relevance of the document to the query
/// ///
/// This is modelled internally as a `f32`. The larger the number, the more relevant /// This is modelled internally as a `f32`. The larger the number, the more relevant
/// the document to the search query. /// the document to the search query.
#[cfg(not(feature = "scoref64"))]
pub type Score = f32; pub type Score = f32;
/// A `SegmentLocalId` identifies a segment. /// A `SegmentLocalId` identifies a segment.
@@ -296,7 +288,6 @@ mod tests {
use crate::schema::*; use crate::schema::*;
use crate::DocAddress; use crate::DocAddress;
use crate::Index; use crate::Index;
use crate::IndexWriter;
use crate::Postings; use crate::Postings;
use crate::ReloadPolicy; use crate::ReloadPolicy;
use rand::distributions::Bernoulli; use rand::distributions::Bernoulli;
@@ -361,14 +352,14 @@ mod tests {
#[test] #[test]
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
fn test_indexing() { fn test_indexing() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap(); let index = Index::create_from_tempdir(schema).unwrap();
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
{ {
let doc = doc!(text_field=>"af b"); let doc = doc!(text_field=>"af b");
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -383,29 +374,30 @@ mod tests {
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
Ok(())
} }
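
From this point on, most tests change signature from `fn test_x()` to `fn test_x() -> crate::Result<()>`, trading `.unwrap()` chains for `?`. This leans on Rust's built-in support for fallible `#[test]` functions; a minimal, self-contained example:

    // A #[test] may return Result<(), E> for any E: Debug;
    // a returned Err fails the test exactly like a panic would.
    #[test]
    fn fallible_test() -> Result<(), std::num::ParseIntError> {
        let parsed: u32 = "42".parse()?;
        assert_eq!(parsed, 42);
        Ok(())
    }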
#[test] #[test]
fn test_docfreq1() { fn test_docfreq1() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
{ {
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit().unwrap(); index_writer.commit()?;
} }
{ {
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a")); index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit().unwrap(); index_writer.commit()?;
} }
{ {
index_writer.add_document(doc!(text_field=>"c")); index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit().unwrap(); index_writer.commit()?;
} }
{ {
let reader = index.reader().unwrap(); let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3); assert_eq!(searcher.doc_freq(&term_a), 3);
@@ -416,67 +408,50 @@ mod tests {
let term_d = Term::from_field_text(text_field, "d"); let term_d = Term::from_field_text(text_field, "d");
assert_eq!(searcher.doc_freq(&term_d), 0); assert_eq!(searcher.doc_freq(&term_d), 0);
} }
Ok(())
} }
#[test] #[test]
fn test_fieldnorm_no_docs_with_field() { fn test_fieldnorm_no_docs_with_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let title_field = schema_builder.add_text_field("title", TEXT); let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?;
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let fieldnorm_reader = reader.get_fieldnorms_reader(text_field)?;
{ assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
} }
{ {
let index_reader = index.reader().unwrap(); let fieldnorm_reader = reader.get_fieldnorms_reader(title_field)?;
let searcher = index_reader.searcher(); assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
let reader = searcher.segment_reader(0);
{
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
}
{
let fieldnorm_reader = reader.get_fieldnorms_reader(title_field);
assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
}
} }
Ok(())
} }
#[test] #[test]
fn test_fieldnorm() { fn test_fieldnorm() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ let mut index_writer = index.writer_for_tests()?;
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); index_writer.add_document(doc!(text_field=>"a b c"));
{ index_writer.add_document(doc!());
let doc = doc!(text_field=>"a b c"); index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc); index_writer.commit()?;
} let reader = index.reader()?;
{ let searcher = reader.searcher();
let doc = doc!(); let segment_reader: &SegmentReader = searcher.segment_reader(0);
index_writer.add_document(doc); let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?;
} assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
{ assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
let doc = doc!(text_field=>"a b"); assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
index_writer.add_document(doc); Ok(())
}
index_writer.commit().unwrap();
}
{
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
}
} }
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool { fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
@@ -491,7 +466,7 @@ mod tests {
} }
#[test] #[test]
fn test_delete_postings1() { fn test_delete_postings1() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let term_abcd = Term::from_field_text(text_field, "abcd"); let term_abcd = Term::from_field_text(text_field, "abcd");
@@ -507,7 +482,7 @@ mod tests {
.unwrap(); .unwrap();
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
// 1 // 1
@@ -523,10 +498,10 @@ mod tests {
index_writer.add_document(doc!(text_field=>" b c")); index_writer.add_document(doc!(text_field=>" b c"));
// 5 // 5
index_writer.add_document(doc!(text_field=>" a")); index_writer.add_document(doc!(text_field=>" a"));
index_writer.commit().unwrap(); index_writer.commit()?;
} }
{ {
reader.reload().unwrap(); reader.reload()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(text_field); let inverted_index = segment_reader.inverted_index(text_field);
@@ -554,15 +529,15 @@ mod tests {
} }
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
// 1 // 1
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback().unwrap(); index_writer.rollback()?;
} }
{ {
reader.reload().unwrap(); reader.reload()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0); let seg_reader = searcher.segment_reader(0);
let inverted_index = seg_reader.inverted_index(term_abcd.field()); let inverted_index = seg_reader.inverted_index(term_abcd.field());
@@ -591,15 +566,15 @@ mod tests {
} }
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback().unwrap(); index_writer.rollback()?;
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.commit().unwrap(); index_writer.commit()?;
} }
{ {
reader.reload().unwrap(); reader.reload()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(term_abcd.field()); let inverted_index = segment_reader.inverted_index(term_abcd.field());
@@ -631,19 +606,20 @@ mod tests {
assert!(!advance_undeleted(&mut postings, segment_reader)); assert!(!advance_undeleted(&mut postings, segment_reader));
} }
} }
Ok(())
} }
#[test] #[test]
fn test_indexed_u64() { fn test_indexed_u64() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("value", INDEXED); let field = schema_builder.add_u64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64)); index_writer.add_document(doc!(field=>1u64));
index_writer.commit().unwrap(); index_writer.commit()?;
let reader = index.reader().unwrap(); let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let term = Term::from_field_u64(field, 1u64); let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher let mut postings = searcher
@@ -653,20 +629,21 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(postings.doc(), 0); assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED); assert_eq!(postings.advance(), TERMINATED);
Ok(())
} }
#[test] #[test]
fn test_indexed_i64() { fn test_indexed_i64() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_i64_field("value", INDEXED); let value_field = schema_builder.add_i64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64; let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val)); index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit().unwrap(); index_writer.commit()?;
let reader = index.reader().unwrap(); let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let term = Term::from_field_i64(value_field, negative_val); let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher let mut postings = searcher
@@ -676,20 +653,21 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(postings.doc(), 0); assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED); assert_eq!(postings.advance(), TERMINATED);
Ok(())
} }
#[test] #[test]
fn test_indexed_f64() { fn test_indexed_f64() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_f64_field("value", INDEXED); let value_field = schema_builder.add_f64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI; let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val)); index_writer.add_document(doc!(value_field => val));
index_writer.commit().unwrap(); index_writer.commit()?;
let reader = index.reader().unwrap(); let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let term = Term::from_field_f64(value_field, val); let term = Term::from_field_f64(value_field, val);
let mut postings = searcher let mut postings = searcher
@@ -699,26 +677,29 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(postings.doc(), 0); assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED); assert_eq!(postings.advance(), TERMINATED);
Ok(())
} }
#[test] #[test]
fn test_indexedfield_not_in_documents() { fn test_indexedfield_not_in_documents() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("text", TEXT); let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap(); let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
segment_reader.inverted_index(absent_field); //< should not panic let inverted_index = segment_reader.inverted_index(absent_field); //< should not panic
assert_eq!(inverted_index.terms().num_terms(), 0);
Ok(())
} }
#[test] #[test]
fn test_delete_postings2() { fn test_delete_postings2() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -726,53 +707,40 @@ mod tests {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into() .try_into()?;
.unwrap();
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"));
let add_document = |index_writer: &mut IndexWriter, val: &'static str| { index_writer.add_document(doc!(text_field=>"70"));
let doc = doc!(text_field=>val); index_writer.add_document(doc!(text_field=>"34"));
index_writer.add_document(doc); index_writer.add_document(doc!(text_field=>"1"));
}; index_writer.add_document(doc!(text_field=>"38"));
index_writer.add_document(doc!(text_field=>"33"));
let remove_document = |index_writer: &mut IndexWriter, val: &'static str| { index_writer.add_document(doc!(text_field=>"40"));
let delterm = Term::from_field_text(text_field, val); index_writer.add_document(doc!(text_field=>"17"));
index_writer.delete_term(delterm); index_writer.delete_term(Term::from_field_text(text_field, "38"));
}; index_writer.delete_term(Term::from_field_text(text_field, "34"));
index_writer.commit()?;
add_document(&mut index_writer, "63"); reader.reload()?;
add_document(&mut index_writer, "70"); assert_eq!(reader.searcher().num_docs(), 6);
add_document(&mut index_writer, "34"); Ok(())
add_document(&mut index_writer, "1");
add_document(&mut index_writer, "38");
add_document(&mut index_writer, "33");
add_document(&mut index_writer, "40");
add_document(&mut index_writer, "17");
remove_document(&mut index_writer, "38");
remove_document(&mut index_writer, "34");
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 6);
} }
#[test] #[test]
fn test_termfreq() { fn test_termfreq() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
let doc = doc!(text_field=>"af af af bc bc"); index_writer.add_document(doc!(text_field=>"af af af bc bc"));
index_writer.add_document(doc); index_writer.commit()?;
index_writer.commit().unwrap();
} }
{ {
let index_reader = index.reader().unwrap(); let index_reader = index.reader()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field); let inverted_index = reader.inverted_index(text_field);
@@ -788,63 +756,63 @@ mod tests {
assert_eq!(postings.term_freq(), 3); assert_eq!(postings.term_freq(), 3);
assert_eq!(postings.advance(), TERMINATED); assert_eq!(postings.advance(), TERMINATED);
} }
Ok(())
} }
#[test] #[test]
fn test_searcher_1() { fn test_searcher_1() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index.reader().unwrap(); let reader = index.reader()?;
{ // writing the segment
// writing the segment let mut index_writer = index.writer_for_tests()?;
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"af af af b")); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.add_document(doc!(text_field=>"a b c d")); index_writer.commit()?;
index_writer.commit().unwrap();
} reader.reload()?;
{ let searcher = reader.searcher();
reader.reload().unwrap(); let get_doc_ids = |terms: Vec<Term>| {
let searcher = reader.searcher(); let query = BooleanQuery::new_multiterms_query(terms);
let get_doc_ids = |terms: Vec<Term>| { searcher
let query = BooleanQuery::new_multiterms_query(terms); .search(&query, &TEST_COLLECTOR_WITH_SCORE)
let topdocs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap(); .map(|topdocs| topdocs.docs().to_vec())
topdocs.docs().to_vec() };
}; assert_eq!(
assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "a")])?,
get_doc_ids(vec![Term::from_field_text(text_field, "a")]), vec![DocAddress(0, 1), DocAddress(0, 2)]
vec![DocAddress(0, 1), DocAddress(0, 2)] );
); assert_eq!(
assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "af")])?,
get_doc_ids(vec![Term::from_field_text(text_field, "af")]), vec![DocAddress(0, 0)]
vec![DocAddress(0, 0)] );
); assert_eq!(
assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "b")])?,
get_doc_ids(vec![Term::from_field_text(text_field, "b")]), vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)] );
); assert_eq!(
assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "c")])?,
get_doc_ids(vec![Term::from_field_text(text_field, "c")]), vec![DocAddress(0, 1), DocAddress(0, 2)]
vec![DocAddress(0, 1), DocAddress(0, 2)] );
); assert_eq!(
assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "d")])?,
get_doc_ids(vec![Term::from_field_text(text_field, "d")]), vec![DocAddress(0, 2)]
vec![DocAddress(0, 2)] );
); assert_eq!(
assert_eq!( get_doc_ids(vec![
get_doc_ids(vec![ Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "b"), Term::from_field_text(text_field, "a"),
Term::from_field_text(text_field, "a"), ])?,
]), vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)] );
); Ok(())
}
} }
#[test] #[test]
fn test_searcher_2() { fn test_searcher_2() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -852,19 +820,17 @@ mod tests {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into() .try_into()?;
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0u64); assert_eq!(reader.searcher().num_docs(), 0u64);
{ // writing the segment
// writing the segment let mut index_writer = index.writer_for_tests()?;
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"af b")); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.add_document(doc!(text_field=>"a b c d")); index_writer.commit()?;
index_writer.commit().unwrap(); reader.reload()?;
}
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 3u64); assert_eq!(reader.searcher().num_docs(), 3u64);
Ok(())
} }
#[test] #[test]
@@ -886,7 +852,7 @@ mod tests {
} }
#[test] #[test]
fn test_wrong_fast_field_type() { fn test_wrong_fast_field_type() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST); let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST); let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
@@ -896,14 +862,14 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
{ {
let document = let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64); doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document); index_writer.add_document(document);
index_writer.commit().unwrap(); index_writer.commit()?;
} }
let reader = index.reader().unwrap(); let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0); let segment_reader: &SegmentReader = searcher.segment_reader(0);
{ {
@@ -942,11 +908,12 @@ mod tests {
let fast_field_reader = fast_field_reader_opt.unwrap(); let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4f64) assert_eq!(fast_field_reader.get(0), 4f64)
} }
Ok(())
} }
// motivated by #729 // motivated by #729
#[test] #[test]
fn test_update_via_delete_insert() { fn test_update_via_delete_insert() -> crate::Result<()> {
use crate::collector::Count; use crate::collector::Count;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::AllQuery; use crate::query::AllQuery;
@@ -960,17 +927,17 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let index_reader = index.reader().unwrap(); let index_reader = index.reader()?;
let mut index_writer = index.writer(3_000_000).unwrap(); let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
index_writer.add_document(doc!(id => doc_id)); index_writer.add_document(doc!(id => doc_id));
} }
index_writer.commit().unwrap(); index_writer.commit()?;
index_reader.reload().unwrap(); index_reader.reload()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
assert_eq!( assert_eq!(
@@ -981,12 +948,11 @@ mod tests {
// update the 10 elements by deleting and re-adding // update the 10 elements by deleting and re-adding
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
index_writer.delete_term(Term::from_field_u64(id, doc_id)); index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit().unwrap(); index_writer.commit()?;
index_reader.reload().unwrap(); index_reader.reload()?;
let doc = doc!(id => doc_id); index_writer.add_document(doc!(id => doc_id));
index_writer.add_document(doc); index_writer.commit()?;
index_writer.commit().unwrap(); index_reader.reload()?;
index_reader.reload().unwrap();
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
// The number of document should be stable. // The number of document should be stable.
assert_eq!( assert_eq!(
@@ -995,7 +961,7 @@ mod tests {
); );
} }
index_reader.reload().unwrap(); index_reader.reload()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
let segment_ids: Vec<SegmentId> = searcher let segment_ids: Vec<SegmentId> = searcher
.segment_readers() .segment_readers()
@@ -1004,12 +970,18 @@ mod tests {
.collect(); .collect();
block_on(index_writer.merge(&segment_ids)).unwrap(); block_on(index_writer.merge(&segment_ids)).unwrap();
index_reader.reload().unwrap(); index_reader.reload()?;
let searcher = index_reader.searcher(); let searcher = index_reader.searcher();
assert_eq!(searcher.search(&AllQuery, &Count)?, DOC_COUNT as usize);
Ok(())
}
assert_eq!( #[test]
searcher.search(&AllQuery, &Count).unwrap(), fn test_validate_checksum() -> crate::Result<()> {
DOC_COUNT as usize let index_path = tempfile::tempdir().expect("dir");
); let schema = Schema::builder().build();
let index = Index::create_in_dir(&index_path, schema)?;
assert!(index.validate_checksum()?.is_empty());
Ok(())
} }
} }
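
The new `test_validate_checksum` exercises the API touched by #872, which filters `meta.json` out of the validation (that file is rewritten atomically and carries no checksum footer). A hedged usage sketch, assuming `validate_checksum` returns the collection of files whose stored checksum no longer matches their content:

    // Return type assumed; only `validate_checksum()?.is_empty()` is shown above.
    fn report_corruption(index: &tantivy::Index) -> tantivy::Result<()> {
        let damaged = index.validate_checksum()?;
        if damaged.is_empty() {
            println!("all managed files passed their checksum");
        } else {
            eprintln!("corrupted files detected: {:?}", damaged);
        }
        Ok(())
    }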

View File

@@ -455,7 +455,7 @@ mod tests {
let int_field = schema_builder.add_u64_field("id", INDEXED); let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let mut last_doc = 0u32; let mut last_doc = 0u32;
for &doc in docs { for &doc in docs {
for _ in last_doc..doc { for _ in last_doc..doc {
@@ -496,7 +496,7 @@ mod tests {
let int_field = schema_builder.add_u64_field("id", INDEXED); let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// create two postings lists, one containing even numbers, // create two postings lists, one containing even numbers,
// the other containing odd numbers. // the other containing odd numbers.
for i in 0..6 { for i in 0..6 {

View File

@@ -310,6 +310,7 @@ pub mod tests {
mod bench { mod bench {
use super::*; use super::*;
use crate::TERMINATED;
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::Rng; use rand::Rng;
use rand::SeedableRng; use rand::SeedableRng;
@@ -340,7 +341,7 @@ mod bench {
let mut encoder = BlockEncoder::new(); let mut encoder = BlockEncoder::new();
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1); let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32); let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new(); let mut decoder = BlockDecoder::default();
b.iter(|| { b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32, num_bits); decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
}); });
@@ -375,9 +376,9 @@ mod bench {
let mut encoder = BlockEncoder::new(); let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001); let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32); let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new(); let mut decoder = BlockDecoder::default();
b.iter(|| { b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT, TERMINATED);
}); });
} }
} }
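
Two API changes surface in this bench: `BlockDecoder::new()` is replaced by `BlockDecoder::default()`, and `uncompress_vint_sorted` gains a padding argument so partially filled blocks end in the `TERMINATED` sentinel. Padding the tail lets readers loop on the sentinel instead of carrying the block length around; the idea in isolation, assuming `TERMINATED` is the `u32::MAX` doc id sentinel:

    const TERMINATED: u32 = u32::MAX; // assumed value of the sentinel

    /// Pads the unused tail of a fixed-size doc id block with the sentinel,
    /// so iteration can stop on `doc == TERMINATED` without a length check.
    fn pad_block(block: &mut [u32; 128], len: usize) {
        for slot in &mut block[len..] {
            *slot = TERMINATED;
        }
    }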

View File

@@ -5,6 +5,7 @@ Postings module (also called inverted index)
mod block_search; mod block_search;
mod block_segment_postings; mod block_segment_postings;
pub(crate) mod compression; pub(crate) mod compression;
mod field_stats;
mod postings; mod postings;
mod postings_writer; mod postings_writer;
mod recorder; mod recorder;
@@ -15,6 +16,7 @@ mod stacker;
mod term_info; mod term_info;
pub(crate) use self::block_search::BlockSearcher; pub(crate) use self::block_search::BlockSearcher;
pub(crate) use self::field_stats::{FieldStat, FieldStats};
pub(crate) use self::postings_writer::MultiFieldPostingsWriter; pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer}; pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
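
The new `field_stats` module backs the change that moved the total token count out of the head of each posting list file and into a dedicated per-field record. A speculative sketch of what such a record might carry; only the `FieldStat`/`FieldStats` names are confirmed by the import above, and the fields are guesses:

    // Speculative: one entry per indexed field, serialized once per segment
    // instead of being prepended to every posting list file.
    struct FieldStat {
        total_num_tokens: u64, // feeds BM25's average-fieldnorm computation
    }

    struct FieldStats {
        field_stats: Vec<FieldStat>, // assumed: indexed by field id
    }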
@@ -91,7 +93,7 @@ pub mod tests {
let title = schema_builder.add_text_field("title", TEXT); let title = schema_builder.add_text_field("title", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(title => r#"abc abc abc"#)); index_writer.add_document(doc!(title => r#"abc abc abc"#));
index_writer.add_document(doc!(title => r#"abc be be be be abc"#)); index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
for _ in 0..1_000 { for _ in 0..1_000 {
@@ -176,7 +178,7 @@ pub mod tests {
.tokenizers() .tokenizers()
.register("simple_no_truncation", SimpleTokenizer); .register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
{ {
index_writer.add_document(doc!(text_field=>exceeding_token_text)); index_writer.add_document(doc!(text_field=>exceeding_token_text));
@@ -205,7 +207,7 @@ pub mod tests {
} }
#[test] #[test]
pub fn test_position_and_fieldnorm1() { pub fn test_position_and_fieldnorm1() -> crate::Result<()> {
let mut positions = Vec::new(); let mut positions = Vec::new();
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
@@ -217,42 +219,38 @@ pub mod tests {
let mut segment_writer = let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap(); SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
{ {
let mut doc = Document::default();
// checking that position works if the field has two values // checking that position works if the field has two values
doc.add_text(text_field, "a b a c a d a a.");
doc.add_text(text_field, "d d d d a");
let op = AddOperation { let op = AddOperation {
opstamp: 0u64, opstamp: 0u64,
document: doc, document: doc!(
text_field => "a b a c a d a a.",
text_field => "d d d d a"
),
}; };
segment_writer.add_document(op, &schema).unwrap(); segment_writer.add_document(op, &schema)?;
} }
{ {
let mut doc = Document::default();
doc.add_text(text_field, "b a");
let op = AddOperation { let op = AddOperation {
opstamp: 1u64, opstamp: 1u64,
document: doc, document: doc!(text_field => "b a"),
}; };
segment_writer.add_document(op, &schema).unwrap(); segment_writer.add_document(op, &schema).unwrap();
} }
for i in 2..1000 { for i in 2..1000 {
let mut doc = Document::default(); let mut text: String = iter::repeat("e ").take(i).collect();
let mut text = iter::repeat("e ").take(i).collect::<String>();
text.push_str(" a"); text.push_str(" a");
doc.add_text(text_field, &text);
let op = AddOperation { let op = AddOperation {
opstamp: 2u64, opstamp: 2u64,
document: doc, document: doc!(text_field => text),
}; };
segment_writer.add_document(op, &schema).unwrap(); segment_writer.add_document(op, &schema).unwrap();
} }
segment_writer.finalize().unwrap(); segment_writer.finalize()?;
} }
{ {
let segment_reader = SegmentReader::open(&segment).unwrap(); let segment_reader = SegmentReader::open(&segment)?;
{ {
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field); let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5); assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
assert_eq!(fieldnorm_reader.fieldnorm(1), 2); assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
for i in 2..1000 { for i in 2..1000 {
@@ -312,6 +310,7 @@ pub mod tests {
assert_eq!(postings_e.doc(), TERMINATED); assert_eq!(postings_e.doc(), TERMINATED);
} }
} }
Ok(())
} }
#[test] #[test]
@@ -322,7 +321,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "g b b d c g c")); index_writer.add_document(doc!(text_field => "g b b d c g c"));
index_writer.add_document(doc!(text_field => "g a b b a d c g c")); index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
@@ -354,7 +353,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for i in 0u64..num_docs as u64 { for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64); let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -425,7 +424,7 @@ pub mod tests {
// delete some of the documents // delete some of the documents
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.delete_term(term_0); index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -479,7 +478,7 @@ pub mod tests {
// delete everything else // delete everything else
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.delete_term(term_1); index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -522,7 +521,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000; let posting_list_size = 1_000_000;
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size { for _ in 0..posting_list_size {
let mut doc = Document::default(); let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) { if rng.gen_bool(1f64 / 15f64) {
@@ -730,7 +729,7 @@ mod bench {
let mut s = 0u32; let mut s = 0u32;
while segment_postings.doc() != TERMINATED { while segment_postings.doc() != TERMINATED {
s += (segment_postings.doc() & n) % 1024; s += (segment_postings.doc() & n) % 1024;
segment_postings.advance() segment_postings.advance();
} }
s s
}); });

View File

@@ -105,6 +105,7 @@ impl MultiFieldPostingsWriter {
doc: DocId, doc: DocId,
field: Field, field: Field,
token_stream: &mut dyn TokenStream, token_stream: &mut dyn TokenStream,
term_buffer: &mut Term,
) -> u32 { ) -> u32 {
let postings_writer = let postings_writer =
self.per_field_postings_writers[field.field_id() as usize].deref_mut(); self.per_field_postings_writers[field.field_id() as usize].deref_mut();
@@ -114,6 +115,7 @@ impl MultiFieldPostingsWriter {
field, field,
token_stream, token_stream,
&mut self.heap, &mut self.heap,
term_buffer,
) )
} }
@@ -220,13 +222,20 @@ pub trait PostingsWriter {
field: Field, field: Field,
token_stream: &mut dyn TokenStream, token_stream: &mut dyn TokenStream,
heap: &mut MemoryArena, heap: &mut MemoryArena,
term_buffer: &mut Term,
) -> u32 { ) -> u32 {
let mut term = Term::for_field(field); term_buffer.set_field(field);
let mut sink = |token: &Token| { let mut sink = |token: &Token| {
// We skip all tokens with a len greater than u16. // We skip all tokens with a len greater than u16.
if token.text.len() <= MAX_TOKEN_LEN { if token.text.len() <= MAX_TOKEN_LEN {
term.set_text(token.text.as_str()); term_buffer.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap); self.subscribe(
term_index,
doc_id,
token.position as u32,
&term_buffer,
heap,
);
} else { } else {
info!( info!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \ "A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \

View File

@@ -114,7 +114,7 @@ impl SegmentPostings {
.iter() .iter()
.map(|&fieldnorm| fieldnorm as u64) .map(|&fieldnorm| fieldnorm as u64)
.sum::<u64>(); .sum::<u64>();
total_num_tokens as Score / fieldnorms.len() as f32 total_num_tokens as Score / fieldnorms.len() as Score
}) })
.unwrap_or(0.0); .unwrap_or(0.0);
let mut postings_serializer = PostingsSerializer::new( let mut postings_serializer = PostingsSerializer::new(
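The hunk above only changes a cast (`f32` becomes the `Score` alias), but the expression it touches is the average field length used by BM25. A minimal sketch of that computation, with made-up fieldnorm values:

let fieldnorms: Vec<u8> = vec![3, 5, 8];
let total_num_tokens: u64 = fieldnorms.iter().map(|&fieldnorm| u64::from(fieldnorm)).sum();
let average_fieldnorm = total_num_tokens as f32 / fieldnorms.len() as f32;
assert!((average_fieldnorm - 16.0 / 3.0).abs() < 1e-6);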

View File

@@ -1,5 +1,4 @@
use super::TermInfo; use super::{FieldStat, FieldStats, TermInfo};
use crate::common::{BinarySerializable, VInt};
use crate::common::{CompositeWrite, CountingWriter}; use crate::common::{CompositeWrite, CountingWriter};
use crate::core::Segment; use crate::core::Segment;
use crate::directory::WritePtr; use crate::directory::WritePtr;
@@ -11,6 +10,10 @@ use crate::query::BM25Weight;
use crate::schema::Schema; use crate::schema::Schema;
use crate::schema::{Field, FieldEntry, FieldType}; use crate::schema::{Field, FieldEntry, FieldType};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal}; use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{
common::{BinarySerializable, VInt},
directory::TerminatingWrite,
};
use crate::{DocId, Score}; use crate::{DocId, Score};
use std::cmp::Ordering; use std::cmp::Ordering;
use std::io::{self, Write}; use std::io::{self, Write};
@@ -51,6 +54,8 @@ pub struct InvertedIndexSerializer {
postings_write: CompositeWrite<WritePtr>, postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>, positions_write: CompositeWrite<WritePtr>,
positionsidx_write: CompositeWrite<WritePtr>, positionsidx_write: CompositeWrite<WritePtr>,
field_stats: FieldStats,
field_stats_write: WritePtr,
schema: Schema, schema: Schema,
} }
@@ -61,6 +66,7 @@ impl InvertedIndexSerializer {
postings_write: CompositeWrite<WritePtr>, postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>, positions_write: CompositeWrite<WritePtr>,
positionsidx_write: CompositeWrite<WritePtr>, positionsidx_write: CompositeWrite<WritePtr>,
field_stats_write: WritePtr,
schema: Schema, schema: Schema,
) -> crate::Result<InvertedIndexSerializer> { ) -> crate::Result<InvertedIndexSerializer> {
Ok(InvertedIndexSerializer { Ok(InvertedIndexSerializer {
@@ -68,18 +74,21 @@ impl InvertedIndexSerializer {
postings_write, postings_write,
positions_write, positions_write,
positionsidx_write, positionsidx_write,
field_stats: FieldStats::default(),
field_stats_write,
schema, schema,
}) })
} }
/// Open a new `PostingsSerializer` for the given segment /// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> { pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
use crate::SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS}; use crate::SegmentComponent::{FIELDSTATS, POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
InvertedIndexSerializer::create( InvertedIndexSerializer::create(
CompositeWrite::wrap(segment.open_write(TERMS)?), CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?), CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?), CompositeWrite::wrap(segment.open_write(POSITIONS)?),
CompositeWrite::wrap(segment.open_write(POSITIONSSKIP)?), CompositeWrite::wrap(segment.open_write(POSITIONSSKIP)?),
segment.open_write(FIELDSTATS)?,
segment.schema(), segment.schema(),
) )
} }
@@ -94,6 +103,8 @@ impl InvertedIndexSerializer {
total_num_tokens: u64, total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>, fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'_>> { ) -> io::Result<FieldSerializer<'_>> {
self.field_stats
.insert(field, FieldStat::new(total_num_tokens));
let field_entry: &FieldEntry = self.schema.get_field_entry(field); let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field); let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field); let postings_write = self.postings_write.for_field(field);
@@ -112,7 +123,10 @@ impl InvertedIndexSerializer {
} }
/// Closes the serializer. /// Closes the serializer.
pub fn close(self) -> io::Result<()> { pub fn close(mut self) -> io::Result<()> {
self.field_stats
.serialize(self.field_stats_write.get_mut())?;
self.field_stats_write.terminate()?;
self.terms_write.close()?; self.terms_write.close()?;
self.postings_write.close()?; self.postings_write.close()?;
self.positions_write.close()?; self.positions_write.close()?;
@@ -142,7 +156,6 @@ impl<'a> FieldSerializer<'a> {
positionsidx_write: &'a mut CountingWriter<WritePtr>, positionsidx_write: &'a mut CountingWriter<WritePtr>,
fieldnorm_reader: Option<FieldNormReader>, fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'a>> { ) -> io::Result<FieldSerializer<'a>> {
total_num_tokens.serialize(postings_write)?;
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type { let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => { FieldType::Str(ref text_options) => {
if let Some(text_indexing_options) = text_options.get_indexing_options() { if let Some(text_indexing_options) = text_options.get_indexing_options() {
@@ -190,7 +203,8 @@ impl<'a> FieldSerializer<'a> {
.unwrap_or(0u64); .unwrap_or(0u64);
TermInfo { TermInfo {
doc_freq: 0, doc_freq: 0,
postings_offset: self.postings_serializer.addr(), postings_start_offset: self.postings_serializer.addr(),
postings_end_offset: 0u64,
positions_idx, positions_idx,
} }
} }
@@ -244,10 +258,12 @@ impl<'a> FieldSerializer<'a> {
/// using `VInt` encoding. /// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> { pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open { if self.term_open {
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.postings_serializer self.postings_serializer
.close_term(self.current_term_info.doc_freq)?; .close_term(self.current_term_info.doc_freq)?;
let end_offset = self.postings_serializer.addr();
self.current_term_info.postings_end_offset = end_offset;
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.term_open = false; self.term_open = false;
} }
Ok(()) Ok(())
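`FieldStat` and `FieldStats` come from the commit that moves the total token count out of the posting list files; their definition is not part of this excerpt. A hypothetical minimal shape consistent with how they are used above (the map key and field names are assumptions, not the actual code):

use std::collections::HashMap;

/// Hypothetical sketch: one entry per field, recording the total number of tokens.
#[derive(Default)]
struct FieldStats(HashMap<u32, FieldStat>); // assumed to be keyed by field id

struct FieldStat {
    total_num_tokens: u64,
}

impl FieldStat {
    fn new(total_num_tokens: u64) -> FieldStat {
        FieldStat { total_num_tokens }
    }
}

impl FieldStats {
    fn insert(&mut self, field_id: u32, stat: FieldStat) {
        self.0.insert(field_id, stat);
    }
}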

View File

@@ -7,35 +7,49 @@ use std::io;
pub struct TermInfo { pub struct TermInfo {
/// Number of documents in the segment containing the term /// Number of documents in the segment containing the term
pub doc_freq: u32, pub doc_freq: u32,
/// Start offset within the postings (`.idx`) file. /// Start offset of the posting list within the postings (`.idx`) file.
pub postings_offset: u64, pub postings_start_offset: u64,
/// End offset of the posting list within the postings (`.idx`) file.
pub postings_end_offset: u64,
/// Start offset of the first block within the position (`.pos`) file. /// Start offset of the first block within the position (`.pos`) file.
pub positions_idx: u64, pub positions_idx: u64,
} }
impl TermInfo {
pub(crate) fn posting_num_bytes(&self) -> u32 {
let num_bytes = self.postings_end_offset - self.postings_start_offset;
assert!(num_bytes <= std::u32::MAX as u64);
num_bytes as u32
}
}
impl FixedSize for TermInfo { impl FixedSize for TermInfo {
/// Size required for the binary serialization of a `TermInfo` object. /// Size required for the binary serialization of a `TermInfo` object.
/// This is large, but in practice, `TermInfo`s are encoded in blocks and /// This is large, but in practice, `TermInfo`s are encoded in blocks and
/// only the first `TermInfo` of a block is serialized uncompressed. /// only the first `TermInfo` of a block is serialized uncompressed.
/// The subsequent `TermInfo`s are delta-encoded and bitpacked. /// The subsequent `TermInfo`s are delta-encoded and bitpacked.
const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES; const SIZE_IN_BYTES: usize = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
} }
impl BinarySerializable for TermInfo { impl BinarySerializable for TermInfo {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
self.doc_freq.serialize(writer)?; self.doc_freq.serialize(writer)?;
self.postings_offset.serialize(writer)?; self.postings_start_offset.serialize(writer)?;
self.posting_num_bytes().serialize(writer)?;
self.positions_idx.serialize(writer)?; self.positions_idx.serialize(writer)?;
Ok(()) Ok(())
} }
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = u32::deserialize(reader)?; let doc_freq = u32::deserialize(reader)?;
let postings_offset = u64::deserialize(reader)?; let postings_start_offset = u64::deserialize(reader)?;
let postings_num_bytes = u32::deserialize(reader)?;
let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes);
let positions_idx = u64::deserialize(reader)?; let positions_idx = u64::deserialize(reader)?;
Ok(TermInfo { Ok(TermInfo {
doc_freq, doc_freq,
postings_offset, postings_start_offset,
postings_end_offset,
positions_idx, positions_idx,
}) })
} }

View File

@@ -83,7 +83,7 @@ mod tests {
let field = schema_builder.add_text_field("text", TEXT); let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>"aaa")); index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb")); index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit().unwrap(); index_writer.commit().unwrap();

View File

@@ -5,7 +5,6 @@ use crate::query::{BitSetDocSet, Explanation};
use crate::query::{Scorer, Weight}; use crate::query::{Scorer, Weight};
use crate::schema::{Field, IndexRecordOption}; use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer}; use crate::termdict::{TermDictionary, TermStreamer};
use crate::Result;
use crate::TantivyError; use crate::TantivyError;
use crate::{DocId, Score}; use crate::{DocId, Score};
use std::sync::Arc; use std::sync::Arc;
@@ -40,7 +39,7 @@ impl<A> Weight for AutomatonWeight<A>
where where
A: Automaton + Send + Sync + 'static, A: Automaton + Send + Sync + 'static,
{ {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc(); let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc); let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field); let inverted_index = reader.inverted_index(self.field);
@@ -66,7 +65,7 @@ where
Ok(Box::new(const_scorer)) Ok(Box::new(const_scorer))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?; let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) == doc { if scorer.seek(doc) == doc {
Ok(Explanation::new("AutomatonScorer", 1.0)) Ok(Explanation::new("AutomatonScorer", 1.0))
@@ -91,7 +90,7 @@ mod tests {
let mut schema = Schema::builder(); let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING); let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build()); let index = Index::create_in_ram(schema.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title=>"abc")); index_writer.add_document(doc!(title=>"abc"));
index_writer.add_document(doc!(title=>"bcd")); index_writer.add_document(doc!(title=>"bcd"));
index_writer.add_document(doc!(title=>"abcd")); index_writer.add_document(doc!(title=>"abcd"));

View File

@@ -4,19 +4,6 @@ use crate::{DocId, DocSet, Score, TERMINATED};
use std::ops::Deref; use std::ops::Deref;
use std::ops::DerefMut; use std::ops::DerefMut;
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
/// Takes term_scorers sorted by their current doc() and a threshold. /// Takes term_scorers sorted by their current doc() and a threshold.
/// Returns (pivot_len, pivot_ord) defined as follows: /// Returns (pivot_len, pivot_ord) defined as follows:
/// - `pivot_doc`: the lowest document that has a chance of exceeding (>) the threshold score. /// - `pivot_doc`: the lowest document that has a chance of exceeding (>) the threshold score.
@@ -55,37 +42,12 @@ fn find_pivot_doc(
Some((before_pivot_len, pivot_len, pivot_doc)) Some((before_pivot_len, pivot_len, pivot_doc))
} }
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
// Before and after calling this method, scorers need to be sorted by their `.doc()`. // Before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer( fn block_max_was_too_low_advance_one_scorer(
scorers: &mut Vec<TermScorerWithMaxScore>, scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_len: usize, pivot_len: usize,
) { ) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
let mut scorer_to_seek = pivot_len - 1; let mut scorer_to_seek = pivot_len - 1;
let mut doc_to_seek_after = scorers[scorer_to_seek].doc(); let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
for scorer_ord in (0..pivot_len - 1).rev() { for scorer_ord in (0..pivot_len - 1).rev() {
@@ -102,6 +64,7 @@ fn block_max_was_too_low_advance_one_scorer(
} }
scorers[scorer_to_seek].seek(doc_to_seek_after + 1); scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
restore_ordering(scorers, scorer_to_seek); restore_ordering(scorers, scorer_to_seek);
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
} }
// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted // Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted
@@ -177,64 +140,99 @@ pub fn block_wand(
.map(TermScorerWithMaxScore::from) .map(TermScorerWithMaxScore::from)
.collect(); .collect();
scorers.sort_by_key(|scorer| scorer.doc()); scorers.sort_by_key(|scorer| scorer.doc());
loop { // At this point we need to ensure that the scorers are sorted!
// At this point we need to ensure that the scorers are sorted! debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
find_pivot_doc(&scorers[..], threshold)
{
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
if let Some((before_pivot_len, pivot_len, pivot_doc)) = debug_assert_ne!(pivot_doc, TERMINATED);
find_pivot_doc(&scorers[..], threshold) debug_assert!(before_pivot_len < pivot_len);
{
debug_assert_ne!(pivot_doc, TERMINATED);
debug_assert!(before_pivot_len < pivot_len);
let block_max_score_upperbound: Score = scorers[..pivot_len] let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut() .iter_mut()
.map(|scorer| { .map(|scorer| {
scorer.shallow_seek(pivot_doc); scorer.shallow_seek(pivot_doc);
scorer.block_max_score() scorer.block_max_score()
}) })
.sum(); .sum();
// Beware: after a shallow advance, the skip readers can be ahead of // Beware: after a shallow advance, the skip readers can be ahead of
// the segment posting lists. // the segment posting lists.
// //
// `block_segment_postings.load_block()` needs to be called separately. // `block_segment_postings.load_block()` needs to be called separately.
if block_max_score_upperbound <= threshold { if block_max_score_upperbound <= threshold {
// Block max condition was not reached // Block max condition was not reached
// We could get away with simply advancing the scorers to DocId + 1, but it would // We could get away with simply advancing the scorers to DocId + 1, but it would
// be inefficient. The optimization requires proper explanation and was // be inefficient. The optimization requires proper explanation and was
// isolated in a different function. // isolated in a different function.
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len); block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
continue; continue;
}
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
} else {
return;
} }
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
} }
} }
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
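For intuition about `find_pivot_doc`, here is a simplified standalone sketch of the pivot selection rule. It is an assumed simplification: each scorer is reduced to its current doc and its global max score, and the grouping of scorers positioned on the same doc is omitted.

/// Simplified pivot selection: scorers are sorted by current doc; the pivot is
/// the first position at which the accumulated max-score upper bound exceeds
/// the threshold. Documents before it cannot beat the threshold on their own.
fn find_pivot(scorers: &[(u32, f32)], threshold: f32) -> Option<(usize, u32)> {
    let mut upper_bound = 0.0f32;
    for (ord, &(doc, max_score)) in scorers.iter().enumerate() {
        upper_bound += max_score;
        if upper_bound > threshold {
            return Some((ord, doc));
        }
    }
    None
}

// With docs/max-scores [(3, 0.5), (7, 1.2), (9, 2.0)] and threshold 1.5:
// 0.5 is not enough, 0.5 + 1.2 = 1.7 > 1.5, so the pivot is doc 7.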
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::query::score_combiner::SumCombiner; use crate::query::score_combiner::SumCombiner;
@@ -248,17 +246,21 @@ mod tests {
use std::iter; use std::iter;
struct Float(Score); struct Float(Score);
impl Eq for Float {} impl Eq for Float {}
impl PartialEq for Float { impl PartialEq for Float {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.cmp(&other) == Ordering::Equal self.cmp(&other) == Ordering::Equal
} }
} }
impl PartialOrd for Float { impl PartialOrd for Float {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl Ord for Float { impl Ord for Float {
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal) other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)

View File

@@ -32,7 +32,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
{ {
index_writer.add_document(doc!(text_field => "a b c")); index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c")); index_writer.add_document(doc!(text_field => "a c"));
@@ -224,7 +224,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c")); index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c")); index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c")); index_writer.add_document(doc!(text_field => "b c"));

View File

@@ -144,7 +144,7 @@ mod tests {
fn test_boost_query_explain() { fn test_boost_query_explain() {
let schema = Schema::builder().build(); let schema = Schema::builder().build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::new()); index_writer.add_document(Document::new());
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();

View File

@@ -177,7 +177,7 @@ mod test {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(
country_field => "japan", country_field => "japan",
)); ));

View File

@@ -24,7 +24,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for &text in texts { for &text in texts {
let doc = doc!(text_field=>text); let doc = doc!(text_field=>text);
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -135,7 +135,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"a b c"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -186,7 +186,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"b")); index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"b a")); index_writer.add_document(doc!(text_field=>"b a"));
@@ -217,7 +217,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b c d e f g h")); index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }

View File

@@ -9,8 +9,8 @@ use crate::query::Weight;
use crate::query::{EmptyScorer, Explanation}; use crate::query::{EmptyScorer, Explanation};
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::schema::Term; use crate::schema::Term;
use crate::Score;
use crate::{DocId, DocSet}; use crate::{DocId, DocSet};
use crate::{Result, Score};
pub struct PhraseWeight { pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>, phrase_terms: Vec<(usize, Term)>,
@@ -32,7 +32,7 @@ impl PhraseWeight {
} }
} }
fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader { fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
let field = self.phrase_terms[0].1.field(); let field = self.phrase_terms[0].1.field();
reader.get_fieldnorms_reader(field) reader.get_fieldnorms_reader(field)
} }
@@ -41,9 +41,9 @@ impl PhraseWeight {
&self, &self,
reader: &SegmentReader, reader: &SegmentReader,
boost: Score, boost: Score,
) -> Result<Option<PhraseScorer<SegmentPostings>>> { ) -> crate::Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight = self.similarity_weight.boost_by(boost); let similarity_weight = self.similarity_weight.boost_by(boost);
let fieldnorm_reader = self.fieldnorm_reader(reader); let fieldnorm_reader = self.fieldnorm_reader(reader)?;
if reader.has_deletes() { if reader.has_deletes() {
let mut term_postings_list = Vec::new(); let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms { for &(offset, ref term) in &self.phrase_terms {
@@ -85,7 +85,7 @@ impl PhraseWeight {
} }
impl Weight for PhraseWeight { impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? { if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer)) Ok(Box::new(scorer))
} else { } else {
@@ -93,7 +93,7 @@ impl Weight for PhraseWeight {
} }
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?; let scorer_opt = self.phrase_scorer(reader, 1.0)?;
if scorer_opt.is_none() { if scorer_opt.is_none() {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
@@ -102,7 +102,7 @@ impl Weight for PhraseWeight {
if scorer.seek(doc) != doc { if scorer.seek(doc) != doc {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
} }
let fieldnorm_reader = self.fieldnorm_reader(reader); let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count(); let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Scorer", scorer.score()); let mut explanation = Explanation::new("Phrase Scorer", scorer.score());

View File

@@ -9,7 +9,6 @@ use crate::query::{Query, Scorer, Weight};
use crate::schema::Type; use crate::schema::Type;
use crate::schema::{Field, IndexRecordOption, Term}; use crate::schema::{Field, IndexRecordOption, Term};
use crate::termdict::{TermDictionary, TermStreamer}; use crate::termdict::{TermDictionary, TermStreamer};
use crate::Result;
use crate::{DocId, Score}; use crate::{DocId, Score};
use std::collections::Bound; use std::collections::Bound;
use std::ops::Range; use std::ops::Range;
@@ -48,7 +47,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// ///
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?; /// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// for year in 1950u64..2017u64 { /// for year in 1950u64..2017u64 {
/// let num_docs_within_year = 10 + (year - 1950) * (year - 1950); /// let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
/// for _ in 0..num_docs_within_year { /// for _ in 0..num_docs_within_year {
@@ -246,7 +245,11 @@ impl RangeQuery {
} }
impl Query for RangeQuery { impl Query for RangeQuery {
fn weight(&self, searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<dyn Weight>> { fn weight(
&self,
searcher: &Searcher,
_scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> {
let schema = searcher.schema(); let schema = searcher.schema();
let value_type = schema.get_field_entry(self.field).field_type().value_type(); let value_type = schema.get_field_entry(self.field).field_type().value_type();
if value_type != self.value_type { if value_type != self.value_type {
@@ -289,7 +292,7 @@ impl RangeWeight {
} }
impl Weight for RangeWeight { impl Weight for RangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc(); let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc); let mut doc_bitset = BitSet::with_max_value(max_doc);
@@ -315,7 +318,7 @@ impl Weight for RangeWeight {
Ok(Box::new(ConstScorer::new(doc_bitset, boost))) Ok(Box::new(ConstScorer::new(doc_bitset, boost)))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?; let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc { if scorer.seek(doc) != doc {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
@@ -342,7 +345,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for year in 1950u64..2017u64 { for year in 1950u64..2017u64 {
let num_docs_within_year = 10 + (year - 1950) * (year - 1950); let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
for _ in 0..num_docs_within_year { for _ in 0..num_docs_within_year {
@@ -485,7 +488,7 @@ mod tests {
schema_builder.add_i64_field("year", INDEXED); schema_builder.add_i64_field("year", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?; let mut index_writer = index.writer_for_tests()?;
let title = schema.get_field("title").unwrap(); let title = schema.get_field("title").unwrap();
let year = schema.get_field("year").unwrap(); let year = schema.get_field("year").unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(

View File

@@ -103,7 +103,7 @@ mod test {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(
country_field => "japan", country_field => "japan",
)); ));

View File

@@ -25,7 +25,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let doc = doc!(text_field => "a"); let doc = doc!(text_field => "a");
index_writer.add_document(doc); index_writer.add_document(doc);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
@@ -50,7 +50,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; let mut index_writer = index.writer_for_tests()?;
for _ in 0..COMPRESSION_BLOCK_SIZE { for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a"); let doc = doc!(text_field => "a");
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -86,7 +86,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!( index_writer.add_document(doc!(
left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde", left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde",
right_field => "right1 right2", right_field => "right1 right2",
@@ -136,7 +136,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 5_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b")); index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"a c")); index_writer.add_document(doc!(text_field=>"a c"));
index_writer.delete_term(Term::from_field_text(text_field, "b")); index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -153,7 +153,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
index_writer.commit()?; index_writer.commit()?;

View File

@@ -4,11 +4,10 @@ use crate::docset::DocSet;
use crate::postings::SegmentPostings; use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight; use crate::query::bm25::BM25Weight;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer}; use crate::query::weight::for_each_scorer;
use crate::query::Weight; use crate::query::Weight;
use crate::query::{Explanation, Scorer}; use crate::query::{Explanation, Scorer};
use crate::schema::IndexRecordOption; use crate::schema::IndexRecordOption;
use crate::Result;
use crate::Term; use crate::Term;
use crate::{DocId, Score}; use crate::{DocId, Score};
@@ -19,12 +18,12 @@ pub struct TermWeight {
} }
impl Weight for TermWeight { impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let term_scorer = self.specialized_scorer(reader, boost)?; let term_scorer = self.specialized_scorer(reader, boost)?;
Ok(Box::new(term_scorer)) Ok(Box::new(term_scorer))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.specialized_scorer(reader, 1.0)?; let mut scorer = self.specialized_scorer(reader, 1.0)?;
if scorer.seek(doc) != doc { if scorer.seek(doc) != doc {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
@@ -32,7 +31,7 @@ impl Weight for TermWeight {
Ok(scorer.explain()) Ok(scorer.explain())
} }
fn count(&self, reader: &SegmentReader) -> Result<u32> { fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
if let Some(delete_bitset) = reader.delete_bitset() { if let Some(delete_bitset) = reader.delete_bitset() {
Ok(self.scorer(reader, 1.0)?.count(delete_bitset)) Ok(self.scorer(reader, 1.0)?.count(delete_bitset))
} else { } else {
@@ -73,8 +72,8 @@ impl Weight for TermWeight {
reader: &SegmentReader, reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score, callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> { ) -> crate::Result<()> {
let mut scorer = self.scorer(reader, 1.0)?; let scorer = self.specialized_scorer(reader, 1.0)?;
for_each_pruning_scorer(&mut scorer, threshold, callback); crate::query::boolean_query::block_wand(vec![scorer], threshold, callback);
Ok(()) Ok(())
} }
} }
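The generic pruning loop being replaced here visits every matching document and only raises the threshold through the callback; `block_wand` can additionally skip whole blocks whose max score cannot beat it. A simplified sketch of the threshold-raising contract (the callback returns the new threshold, as in the signature above):

fn for_each_pruning_sketch(
    docs_and_scores: &[(u32, f32)],
    mut threshold: f32,
    callback: &mut dyn FnMut(u32, f32) -> f32,
) {
    for &(doc, score) in docs_and_scores {
        if score > threshold {
            threshold = callback(doc, score);
        }
    }
}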
@@ -96,10 +95,10 @@ impl TermWeight {
&self, &self,
reader: &SegmentReader, reader: &SegmentReader,
boost: Score, boost: Score,
) -> Result<TermScorer> { ) -> crate::Result<TermScorer> {
let field = self.term.field(); let field = self.term.field();
let inverted_index = reader.inverted_index(field); let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field); let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
let similarity_weight = self.similarity_weight.boost_by(boost); let similarity_weight = self.similarity_weight.boost_by(boost);
let postings_opt: Option<SegmentPostings> = let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option); inverted_index.read_postings(&self.term, self.index_record_option);

View File

@@ -398,9 +398,9 @@ mod bench {
use crate::query::score_combiner::DoNothingCombiner; use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ConstScorer, Union, VecDocSet}; use crate::query::{ConstScorer, Union, VecDocSet};
use crate::tests;
use crate::DocId; use crate::DocId;
use crate::DocSet; use crate::DocSet;
use crate::{tests, TERMINATED};
use test::Bencher; use test::Bencher;
#[bench] #[bench]
@@ -414,10 +414,12 @@ mod bench {
union_docset union_docset
.iter() .iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone())) .map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new) .map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
); );
while v.advance() {} while v.doc() != TERMINATED {
v.advance();
}
}); });
} }
#[bench] #[bench]
@@ -432,10 +434,12 @@ mod bench {
union_docset union_docset
.iter() .iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone())) .map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new) .map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
); );
while v.advance() {} while v.doc() != TERMINATED {
v.advance();
}
}); });
} }
} }

View File

@@ -138,9 +138,11 @@ impl InnerIndexReader {
.collect::<crate::Result<_>>()? .collect::<crate::Result<_>>()?
}; };
let schema = self.index.schema(); let schema = self.index.schema();
let searchers = (0..self.num_searchers) let searchers = std::iter::repeat_with(|| {
.map(|_| Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())) Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
.collect(); })
.take(self.num_searchers)
.collect();
self.searcher_pool.publish_new_generation(searchers); self.searcher_pool.publish_new_generation(searchers);
Ok(()) Ok(())
} }
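The `repeat_with` form avoids the dummy `0..n` range and the unused closure argument. The pattern in isolation:

// `iter::repeat_with` + `take` builds N values from a closure, with no index.
let searchers: Vec<String> = std::iter::repeat_with(|| String::from("searcher"))
    .take(3)
    .collect();
assert_eq!(searchers.len(), 3);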

View File

@@ -74,9 +74,8 @@ impl Document {
} }
/// Add a text field. /// Add a text field.
pub fn add_text(&mut self, field: Field, text: &str) { pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
let value = Value::Str(String::from(text)); self.add(FieldValue::new(field, Value::Str(text.to_string())));
self.add(FieldValue::new(field, value));
} }
/// Add a pre-tokenized text field. /// Add a pre-tokenized text field.
@@ -110,8 +109,8 @@ impl Document {
} }
/// Add a bytes field /// Add a bytes field
pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) { pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add(FieldValue::new(field, Value::Bytes(value))) self.add(FieldValue::new(field, Value::Bytes(value.into())))
} }
/// Add a field value /// Add a field value
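With the relaxed bounds, `add_text` accepts anything implementing `ToString` and `add_bytes` anything convertible into `Vec<u8>`. A usage sketch, assuming `title` and `payload` are `Field`s registered in the schema:

let mut doc = Document::default();
doc.add_text(title, "hello");                // &str
doc.add_text(title, String::from("hello"));  // owned String
doc.add_bytes(payload, vec![1u8, 2, 3]);     // Vec<u8>
doc.add_bytes(payload, &b"raw"[..]);         // &[u8] via Into<Vec<u8>>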

View File

@@ -1,5 +1,5 @@
use crate::schema::IntOptions;
use crate::schema::TextOptions; use crate::schema::TextOptions;
use crate::schema::{is_valid_field_name, IntOptions};
use crate::schema::FieldType; use crate::schema::FieldType;
use serde::de::{self, MapAccess, Visitor}; use serde::de::{self, MapAccess, Visitor};
@@ -24,6 +24,7 @@ impl FieldEntry {
/// Creates a new u64 field entry in the schema, given /// Creates a new u64 field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry { pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::Str(text_options), field_type: FieldType::Str(text_options),
@@ -33,6 +34,7 @@ impl FieldEntry {
/// Creates a new u64 field entry in the schema, given /// Creates a new u64 field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry { pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::U64(field_type), field_type: FieldType::U64(field_type),
@@ -42,6 +44,7 @@ impl FieldEntry {
/// Creates a new i64 field entry in the schema, given /// Creates a new i64 field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry { pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::I64(field_type), field_type: FieldType::I64(field_type),
@@ -51,6 +54,7 @@ impl FieldEntry {
/// Creates a new f64 field entry in the schema, given /// Creates a new f64 field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry { pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::F64(field_type), field_type: FieldType::F64(field_type),
@@ -60,6 +64,7 @@ impl FieldEntry {
/// Creates a new date field entry in the schema, given /// Creates a new date field entry in the schema, given
/// a name, and some options. /// a name, and some options.
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry { pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::Date(field_type), field_type: FieldType::Date(field_type),
@@ -68,6 +73,7 @@ impl FieldEntry {
/// Creates a field entry for a facet. /// Creates a field entry for a facet.
pub fn new_facet(field_name: String) -> FieldEntry { pub fn new_facet(field_name: String) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::HierarchicalFacet, field_type: FieldType::HierarchicalFacet,
@@ -76,6 +82,7 @@ impl FieldEntry {
/// Creates a field entry for a bytes field /// Creates a field entry for a bytes field
pub fn new_bytes(field_name: String) -> FieldEntry { pub fn new_bytes(field_name: String) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry { FieldEntry {
name: field_name, name: field_name,
field_type: FieldType::Bytes, field_type: FieldType::Bytes,
@@ -268,6 +275,12 @@ mod tests {
use crate::schema::TEXT; use crate::schema::TEXT;
use serde_json; use serde_json;
#[test]
#[should_panic]
fn test_invalid_field_name_should_panic() {
FieldEntry::new_text("-hello".to_string(), TEXT);
}
#[test] #[test]
fn test_json_serialization() { fn test_json_serialization() {
let field_value = FieldEntry::new_text(String::from("title"), TEXT); let field_value = FieldEntry::new_text(String::from("title"), TEXT);

View File

@@ -149,14 +149,16 @@ pub use self::int_options::IntOptions;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use regex::Regex; use regex::Regex;
/// Regular expression representing the restriction on valid field names.
pub const FIELD_NAME_PATTERN: &str = r#"^[_a-zA-Z][_\-a-zA-Z0-9]*$"#;
/// Validator for a potential `field_name`. /// Validator for a potential `field_name`.
/// Returns true iff the name can be used as a field name. /// Returns true iff the name can be used as a field name.
/// ///
/// A field name must start with a letter `[a-zA-Z]` or an underscore. /// A field name must start with a letter `[a-zA-Z]` or an underscore.
/// The other characters can be any alphanumeric character `[a-zA-Z0-9]`, an underscore, or a dash. /// The other characters can be any alphanumeric character `[a-zA-Z0-9]`, an underscore, or a dash.
pub fn is_valid_field_name(field_name: &str) -> bool { pub fn is_valid_field_name(field_name: &str) -> bool {
static FIELD_NAME_PTN: Lazy<Regex> = static FIELD_NAME_PTN: Lazy<Regex> = Lazy::new(|| Regex::new(FIELD_NAME_PATTERN).unwrap());
Lazy::new(|| Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap());
FIELD_NAME_PTN.is_match(field_name) FIELD_NAME_PTN.is_match(field_name)
} }
@@ -170,6 +172,11 @@ mod tests {
assert!(is_valid_field_name("text")); assert!(is_valid_field_name("text"));
assert!(is_valid_field_name("text0")); assert!(is_valid_field_name("text0"));
assert!(!is_valid_field_name("0text")); assert!(!is_valid_field_name("0text"));
assert!(is_valid_field_name("field-name"));
assert!(is_valid_field_name("field_name"));
assert!(!is_valid_field_name("field!name"));
assert!(!is_valid_field_name("-fieldname"));
assert!(is_valid_field_name("_fieldname"));
assert!(!is_valid_field_name("")); assert!(!is_valid_field_name(""));
assert!(!is_valid_field_name("シャボン玉")); assert!(!is_valid_field_name("シャボン玉"));
assert!(is_valid_field_name("my_text_field")); assert!(is_valid_field_name("my_text_field"));

View File

@@ -4,7 +4,6 @@ use super::Field;
use crate::common; use crate::common;
use crate::schema::Facet; use crate::schema::Facet;
use crate::DateTime; use crate::DateTime;
use byteorder::{BigEndian, ByteOrder};
use std::str; use std::str;
/// Size (in bytes) of the buffer of an int field.
@@ -19,6 +18,10 @@ where
B: AsRef<[u8]>; B: AsRef<[u8]>;
impl Term { impl Term {
pub(crate) fn new() -> Term {
Term(Vec::with_capacity(100))
}
/// Builds a term given a field, and a i64-value /// Builds a term given a field, and a i64-value
/// ///
/// Assuming the term has a field id of 1, and a i64 value of 3234, /// Assuming the term has a field id of 1, and a i64 value of 3234,
@@ -93,6 +96,12 @@ impl Term {
term term
} }
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
let mut term = Term::for_field(field);
term.set_bytes(bytes);
term
}
/// Creates a new Term for a given field. /// Creates a new Term for a given field.
pub(crate) fn for_field(field: Field) -> Term { pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100)); let mut term = Term(Vec::with_capacity(100));
@@ -100,12 +109,10 @@ impl Term {
term term
} }
/// Returns the field. pub(crate) fn set_field(&mut self, field: Field) {
pub fn set_field(&mut self, field: Field) { self.0.clear();
if self.0.len() < 4 { self.0
self.0.resize(4, 0u8); .extend_from_slice(&field.field_id().to_be_bytes()[..]);
}
BigEndian::write_u32(&mut self.0[0..4], field.field_id());
} }
/// Sets a u64 value in the term. /// Sets a u64 value in the term.
@@ -116,7 +123,7 @@ impl Term {
/// the natural order of the values. /// the natural order of the values.
pub fn set_u64(&mut self, val: u64) { pub fn set_u64(&mut self, val: u64) {
self.0.resize(INT_TERM_LEN, 0u8); self.0.resize(INT_TERM_LEN, 0u8);
BigEndian::write_u64(&mut self.0[4..], val); self.0[4..12].copy_from_slice(val.to_be_bytes().as_ref());
} }
/// Sets a `i64` value in the term. /// Sets a `i64` value in the term.
@@ -134,12 +141,6 @@ impl Term {
self.0.extend(bytes); self.0.extend(bytes);
} }
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
let mut term = Term::for_field(field);
term.set_bytes(bytes);
term
}
/// Sets the text only, keeping the field untouched. /// Sets the text only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) { pub fn set_text(&mut self, text: &str) {
self.set_bytes(text.as_bytes()); self.set_bytes(text.as_bytes());
@@ -157,7 +158,9 @@ where
/// Returns the field. /// Returns the field.
pub fn field(&self) -> Field { pub fn field(&self) -> Field {
Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4])) let mut field_id_bytes = [0u8; 4];
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
} }
/// Returns the `u64` value stored in a term. /// Returns the `u64` value stored in a term.
@@ -166,7 +169,9 @@ where
/// ... or returns an invalid value /// ... or returns an invalid value
/// if the term is not a `u64` field. /// if the term is not a `u64` field.
pub fn get_u64(&self) -> u64 { pub fn get_u64(&self) -> u64 {
BigEndian::read_u64(&self.0.as_ref()[4..]) let mut field_id_bytes = [0u8; 8];
field_id_bytes.copy_from_slice(self.value_bytes());
u64::from_be_bytes(field_id_bytes)
} }
/// Returns the `i64` value stored in a term. /// Returns the `i64` value stored in a term.
@@ -175,7 +180,7 @@ where
/// ... or returns an invalid value /// ... or returns an invalid value
/// if the term is not a `i64` field. /// if the term is not a `i64` field.
pub fn get_i64(&self) -> i64 { pub fn get_i64(&self) -> i64 {
common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..])) common::u64_to_i64(self.get_u64())
} }
/// Returns the `f64` value stored in a term. /// Returns the `f64` value stored in a term.
@@ -184,7 +189,7 @@ where
/// ... or returns an invalid value /// ... or returns an invalid value
/// if the term is not a `f64` field. /// if the term is not a `f64` field.
pub fn get_f64(&self) -> f64 { pub fn get_f64(&self) -> f64 {
common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..])) common::u64_to_f64(self.get_u64())
} }
/// Returns the text associated with the term. /// Returns the text associated with the term.
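All of these accessors rely on the same layout: a 4-byte big-endian field id followed by the value bytes. A standalone sketch of that layout and the `to_be_bytes`/`from_be_bytes` round trip used above:

// Layout: [field id, u32 big-endian | value bytes].
let field_id: u32 = 1;
let value: u64 = 3234;
let mut buf = Vec::with_capacity(100);
buf.extend_from_slice(&field_id.to_be_bytes());
buf.extend_from_slice(&value.to_be_bytes());
assert_eq!(buf.len(), 12); // 4 + 8 bytes for an integer term

// Reading the field back, as `field()` does:
let mut field_id_bytes = [0u8; 4];
field_id_bytes.copy_from_slice(&buf[..4]);
assert_eq!(u32::from_be_bytes(field_id_bytes), 1);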

View File

@@ -221,6 +221,12 @@ impl<'a> From<&'a str> for Value {
} }
} }
impl<'a> From<&'a [u8]> for Value {
fn from(bytes: &'a [u8]) -> Value {
Value::Bytes(bytes.to_vec())
}
}
impl<'a> From<Facet> for Value { impl<'a> From<Facet> for Value {
fn from(facet: Facet) -> Value { fn from(facet: Facet) -> Value {
Value::Facet(facet) Value::Facet(facet)
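A one-line usage sketch of the new conversion:

let v: Value = (&b"raw bytes"[..]).into();
assert!(matches!(v, Value::Bytes(_)));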

View File

@@ -221,7 +221,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// # let text_field = schema_builder.add_text_field("text", TEXT); /// # let text_field = schema_builder.add_text_field("text", TEXT);
/// # let schema = schema_builder.build(); /// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema); /// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?; /// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles, /// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
/// # Je ne me sentis plus guidé par les haleurs : /// # Je ne me sentis plus guidé par les haleurs :
/// # Des Peaux-Rouges criards les avaient pris pour cibles, /// # Des Peaux-Rouges criards les avaient pris pour cibles,
@@ -506,7 +506,7 @@ Survey in 2016, 2017, and 2018."#;
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a")); index_writer.add_document(doc!(text_field => "a"));
index_writer.add_document(doc!(text_field => "a")); index_writer.add_document(doc!(text_field => "a"));
index_writer.add_document(doc!(text_field => "a b")); index_writer.add_document(doc!(text_field => "a b"));
@@ -562,7 +562,7 @@ Survey in 2016, 2017, and 2018."#;
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
{ {
let doc = doc ! (text_field => TEST_TEXT); let doc = doc ! (text_field => TEST_TEXT);
index_writer.add_document(doc); index_writer.add_document(doc);

View File

@@ -25,6 +25,8 @@ pub enum ComponentSpaceUsage {
Store(StoreSpaceUsage), Store(StoreSpaceUsage),
/// Some sort of raw byte count /// Some sort of raw byte count
Basic(ByteCount), Basic(ByteCount),
/// Space usage is not tracked for this component (yet).
Unimplemented,
} }
/// Represents combined space usage of an entire searcher and its component segments. /// Represents combined space usage of an entire searcher and its component segments.
@@ -119,7 +121,7 @@ impl SegmentSpaceUsage {
/// Clones the underlying data. /// Clones the underlying data.
/// Use the components directly if this is somehow in performance critical code. /// Use the components directly if this is somehow in performance critical code.
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage { pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
use self::ComponentSpaceUsage::*; use self::ComponentSpaceUsage::{Basic, PerField, Store, Unimplemented};
use crate::SegmentComponent::*; use crate::SegmentComponent::*;
match component { match component {
POSTINGS => PerField(self.postings().clone()), POSTINGS => PerField(self.postings().clone()),
@@ -130,6 +132,7 @@ impl SegmentSpaceUsage {
TERMS => PerField(self.termdict().clone()), TERMS => PerField(self.termdict().clone()),
STORE => Store(self.store().clone()), STORE => Store(self.store().clone()),
DELETE => Basic(self.deletes()), DELETE => Basic(self.deletes()),
FIELDSTATS => Unimplemented,
} }
} }
@@ -336,7 +339,7 @@ mod test {
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => 1u64)); index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64)); index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 10u64)); index_writer.add_document(doc!(name => 10u64));
@@ -374,7 +377,7 @@ mod test {
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => "hi")); index_writer.add_document(doc!(name => "hi"));
index_writer.add_document(doc!(name => "this is a test")); index_writer.add_document(doc!(name => "this is a test"));
index_writer.add_document( index_writer.add_document(
@@ -414,7 +417,7 @@ mod test {
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => "hi")); index_writer.add_document(doc!(name => "hi"));
index_writer.add_document(doc!(name => "this is a test")); index_writer.add_document(doc!(name => "this is a test"));
index_writer.add_document( index_writer.add_document(
@@ -453,7 +456,7 @@ mod test {
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => 1u64)); index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64)); index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 3u64)); index_writer.add_document(doc!(name => 3u64));
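Since the new `FIELDSTATS` component has no space-usage accounting yet, callers that match on `ComponentSpaceUsage` must now handle the `Unimplemented` arm. A hedged sketch, assuming the `total()` accessors these space-usage types expose and that `ByteCount` is a byte-sized integer:

```rust
fn component_bytes(usage: &ComponentSpaceUsage) -> Option<usize> {
    match usage {
        ComponentSpaceUsage::PerField(per_field) => Some(per_field.total()),
        ComponentSpaceUsage::Store(store) => Some(store.total()),
        ComponentSpaceUsage::Basic(byte_count) => Some(*byte_count),
        // FIELDSTATS space usage is not tracked yet.
        ComponentSpaceUsage::Unimplemented => None,
    }
}
```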

View File

@@ -44,11 +44,13 @@ mod tests {
const BLOCK_SIZE: usize = 1_500; const BLOCK_SIZE: usize = 1_500;
fn make_term_info(val: u64) -> TermInfo { fn make_term_info(term_ord: u64) -> TermInfo {
let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord;
TermInfo { TermInfo {
doc_freq: val as u32, doc_freq: term_ord as u32,
positions_idx: val * 2u64, postings_start_offset: offset(term_ord),
postings_offset: val * 3u64, postings_end_offset: offset(term_ord + 1),
positions_idx: offset(term_ord) * 2u64,
} }
} }
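The reworked helper encodes the invariant the new `TermInfo` layout relies on: consecutive terms tile the postings file, so one term's end offset is exactly the next term's start offset, which is what lets a posting list be sliced tightly out of the `ReadOnlySource`. A small check using `make_term_info` above:

```rust
let a = make_term_info(10);
let b = make_term_info(11);
assert!(a.postings_start_offset <= a.postings_end_offset);
// Terms tile the postings file with no gap:
assert_eq!(a.postings_end_offset, b.postings_start_offset);
```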
@@ -138,7 +140,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
{ {
{ {
let mut doc = Document::default(); let mut doc = Document::default();
@@ -208,20 +210,14 @@ mod tests {
} }
#[test] #[test]
fn test_stream_high_range_prefix_suffix() { fn test_stream_high_range_prefix_suffix() -> std::io::Result<()> {
let buffer: Vec<u8> = { let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
// term requires more than 16 bits // term requires more than 16 bits
term_dictionary_builder term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?;
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)) term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?;
.unwrap(); term_dictionary_builder.insert("abr", &make_term_info(3))?;
term_dictionary_builder term_dictionary_builder.finish()?
.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
.unwrap();
term_dictionary_builder
.insert("abr", &make_term_info(2))
.unwrap();
term_dictionary_builder.finish().unwrap()
}; };
let source = ReadOnlySource::from(buffer); let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::from_source(&source); let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
@@ -229,12 +225,15 @@ mod tests {
assert!(kv_stream.advance()); assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(1)); assert_eq!(kv_stream.value(), &make_term_info(1));
assert!(kv_stream.advance()); assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(2)); assert_eq!(kv_stream.value(), &make_term_info(2));
assert!(kv_stream.advance()); assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abr".as_bytes()); assert_eq!(kv_stream.key(), "abr".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(3));
assert!(!kv_stream.advance()); assert!(!kv_stream.advance());
Ok(())
} }
#[test] #[test]

View File

@@ -57,21 +57,28 @@ impl TermInfoBlockMeta {
self.doc_freq_nbits + self.postings_offset_nbits + self.positions_idx_nbits self.doc_freq_nbits + self.postings_offset_nbits + self.positions_idx_nbits
} }
// Here, inner_offset is the offset within the block, WITHOUT the first term_info.
// In other words, term_infos #1, #2, #3 get inner_offset 0, 1, 2..., while term_info #0
// is encoded without bitpacking.
fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo { fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo {
assert!(inner_offset < BLOCK_LEN - 1);
let num_bits = self.num_bits() as usize; let num_bits = self.num_bits() as usize;
let mut cursor = num_bits * inner_offset; let mut cursor = num_bits * inner_offset;
let postings_start_offset = extract_bits(data, cursor, self.postings_offset_nbits);
let postings_end_offset = self.ref_term_info.postings_start_offset
+ extract_bits(data, cursor + num_bits, self.postings_offset_nbits);
cursor += self.postings_offset_nbits as usize;
let doc_freq = extract_bits(data, cursor, self.doc_freq_nbits) as u32; let doc_freq = extract_bits(data, cursor, self.doc_freq_nbits) as u32;
cursor += self.doc_freq_nbits as usize; cursor += self.doc_freq_nbits as usize;
let postings_offset = extract_bits(data, cursor, self.postings_offset_nbits);
cursor += self.postings_offset_nbits as usize;
let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits); let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits);
TermInfo { TermInfo {
doc_freq, doc_freq,
postings_offset: postings_offset + self.ref_term_info.postings_offset, postings_start_offset: postings_start_offset + self.ref_term_info.postings_start_offset,
postings_end_offset,
positions_idx: positions_idx + self.ref_term_info.positions_idx, positions_idx: positions_idx + self.ref_term_info.positions_idx,
} }
} }
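Within a block, only deltas against the reference entry are bitpacked, and an entry's `postings_end_offset` is recovered by reading the *next* entry's start offset, `num_bits` bits further along the stream. A hedged sketch of the `extract_bits` semantics this relies on (not tantivy's exact implementation, which works on whole words):

```rust
// Read `num_bits` bits starting at absolute bit position `bit_offset`,
// least-significant bit first within each byte.
fn extract_bits(data: &[u8], bit_offset: usize, num_bits: u8) -> u64 {
    let mut value = 0u64;
    for i in 0..num_bits as usize {
        let bit = bit_offset + i;
        if (data[bit / 8] >> (bit % 8)) & 1 == 1 {
            value |= 1u64 << i;
        }
    }
    value
}
```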
@@ -126,14 +133,13 @@ impl TermInfoStore {
.expect("Failed to deserialize terminfoblockmeta"); .expect("Failed to deserialize terminfoblockmeta");
let inner_offset = (term_ord as usize) % BLOCK_LEN; let inner_offset = (term_ord as usize) % BLOCK_LEN;
if inner_offset == 0 { if inner_offset == 0 {
term_info_block_data.ref_term_info return term_info_block_data.ref_term_info;
} else {
let term_info_data = self.term_info_source.as_slice();
term_info_block_data.deserialize_term_info(
&term_info_data[term_info_block_data.offset as usize..],
inner_offset - 1,
)
} }
let term_info_data = self.term_info_source.as_slice();
term_info_block_data.deserialize_term_info(
&term_info_data[term_info_block_data.offset as usize..],
inner_offset - 1,
)
} }
pub fn num_terms(&self) -> usize { pub fn num_terms(&self) -> usize {
@@ -154,16 +160,17 @@ fn bitpack_serialize<W: Write>(
term_info_block_meta: &TermInfoBlockMeta, term_info_block_meta: &TermInfoBlockMeta,
term_info: &TermInfo, term_info: &TermInfo,
) -> io::Result<()> { ) -> io::Result<()> {
bit_packer.write(
term_info.postings_start_offset,
term_info_block_meta.postings_offset_nbits,
write,
)?;
bit_packer.write( bit_packer.write(
u64::from(term_info.doc_freq), u64::from(term_info.doc_freq),
term_info_block_meta.doc_freq_nbits, term_info_block_meta.doc_freq_nbits,
write, write,
)?; )?;
bit_packer.write(
term_info.postings_offset,
term_info_block_meta.postings_offset_nbits,
write,
)?;
bit_packer.write( bit_packer.write(
term_info.positions_idx, term_info.positions_idx,
term_info_block_meta.positions_idx_nbits, term_info_block_meta.positions_idx_nbits,
@@ -183,23 +190,27 @@ impl TermInfoStoreWriter {
} }
fn flush_block(&mut self) -> io::Result<()> { fn flush_block(&mut self) -> io::Result<()> {
if self.term_infos.is_empty() {
return Ok(());
}
let mut bit_packer = BitPacker::new(); let mut bit_packer = BitPacker::new();
let ref_term_info = self.term_infos[0].clone(); let ref_term_info = self.term_infos[0].clone();
let last_term_info = self
    .term_infos
    .last()
    .cloned()
    .expect("term_infos is non-empty: checked above");
let postings_end_offset =
last_term_info.postings_end_offset - ref_term_info.postings_start_offset;
for term_info in &mut self.term_infos[1..] { for term_info in &mut self.term_infos[1..] {
term_info.postings_offset -= ref_term_info.postings_offset; term_info.postings_start_offset -= ref_term_info.postings_start_offset;
term_info.positions_idx -= ref_term_info.positions_idx; term_info.positions_idx -= ref_term_info.positions_idx;
} }
let mut max_doc_freq: u32 = 0u32; let mut max_doc_freq: u32 = 0u32;
let mut max_postings_offset: u64 = 0u64; let max_postings_offset: u64 = postings_end_offset;
let mut max_positions_idx: u64 = 0u64; let max_positions_idx: u64 = last_term_info.positions_idx;
for term_info in &self.term_infos[1..] { for term_info in &self.term_infos[1..] {
max_doc_freq = cmp::max(max_doc_freq, term_info.doc_freq); max_doc_freq = cmp::max(max_doc_freq, term_info.doc_freq);
max_postings_offset = cmp::max(max_postings_offset, term_info.postings_offset);
max_positions_idx = cmp::max(max_positions_idx, term_info.positions_idx);
} }
let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq)); let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq));
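Each block header records just enough bits to hold the largest delta in the block. A hedged sketch of what `compute_num_bits` computes, assuming it returns the smallest width that can represent a value (0 bits for 0):

```rust
fn compute_num_bits(n: u64) -> u8 {
    (64 - n.leading_zeros()) as u8
}

fn main() {
    assert_eq!(compute_num_bits(0), 0);
    assert_eq!(compute_num_bits(255), 8); // fits in 8 bits
    assert_eq!(compute_num_bits(256), 9);
}
```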
@@ -224,6 +235,12 @@ impl TermInfoStoreWriter {
)?; )?;
} }
bit_packer.write(
postings_end_offset,
term_info_block_meta.postings_offset_nbits,
&mut self.buffer_term_infos,
)?;
// Blocks need to end at a byte boundary. // Blocks need to end at a byte boundary.
bit_packer.flush(&mut self.buffer_term_infos)?; bit_packer.flush(&mut self.buffer_term_infos)?;
self.term_infos.clear(); self.term_infos.clear();
@@ -232,6 +249,7 @@ impl TermInfoStoreWriter {
} }
pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> { pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> {
assert!(term_info.postings_end_offset >= term_info.postings_start_offset);
self.num_terms += 1u64; self.num_terms += 1u64;
self.term_infos.push(term_info.clone()); self.term_infos.push(term_info.clone());
if self.term_infos.len() >= BLOCK_LEN { if self.term_infos.len() >= BLOCK_LEN {
@@ -291,10 +309,11 @@ mod tests {
#[test] #[test]
fn test_term_info_block_meta_serialization() { fn test_term_info_block_meta_serialization() {
let term_info_block_meta = TermInfoBlockMeta { let term_info_block_meta = TermInfoBlockMeta {
offset: 2009, offset: 2009u64,
ref_term_info: TermInfo { ref_term_info: TermInfo {
doc_freq: 512, doc_freq: 512,
postings_offset: 51, postings_start_offset: 51,
postings_end_offset: 57u64,
positions_idx: 3584, positions_idx: 3584,
}, },
doc_freq_nbits: 10, doc_freq_nbits: 10,
@@ -312,10 +331,12 @@ mod tests {
fn test_pack() { fn test_pack() {
let mut store_writer = TermInfoStoreWriter::new(); let mut store_writer = TermInfoStoreWriter::new();
let mut term_infos = vec![]; let mut term_infos = vec![];
let offset = |i| (i * 13 + i * i) as u64;
for i in 0..1000 { for i in 0..1000 {
let term_info = TermInfo { let term_info = TermInfo {
doc_freq: i as u32, doc_freq: i as u32,
postings_offset: (i / 10) as u64, postings_start_offset: offset(i),
postings_end_offset: offset(i + 1),
positions_idx: (i * 7) as u64, positions_idx: (i * 7) as u64,
}; };
store_writer.write_term_info(&term_info).unwrap(); store_writer.write_term_info(&term_info).unwrap();
@@ -325,7 +346,12 @@ mod tests {
store_writer.serialize(&mut buffer).unwrap(); store_writer.serialize(&mut buffer).unwrap();
let term_info_store = TermInfoStore::open(&ReadOnlySource::from(buffer)); let term_info_store = TermInfoStore::open(&ReadOnlySource::from(buffer));
for i in 0..1000 { for i in 0..1000 {
assert_eq!(term_info_store.get(i as u64), term_infos[i]); assert_eq!(
term_info_store.get(i as u64),
term_infos[i],
"term info {}",
i
);
} }
} }
} }