Compare commits

...

24 Commits

Author SHA1 Message Date
Paul Masurel
9eb87e91cc TermInfo contains the end_offset of the postings.
We slice the ReadOnlySource tightly.
2020-09-21 00:42:15 +09:00
Paul Masurel
36f43da4d8 Added Field stats to remove total num tokens from the beginning of posting list files 2020-09-19 23:23:03 +09:00
Paul Masurel
19a02b2c30 Merge tag '0.13.1'
0.13.1 was published as a hotfix to accommodate tantivy-py.
2020-09-19 21:20:27 +09:00
Paul Masurel
70bae7ce4c Removing Term Vec allocation (#881) 2020-09-08 23:11:00 +09:00
Paul Masurel
ac2a7273e6 Re-added comment to Score. 2020-09-08 21:41:34 +09:00
Paul Masurel
4ce9517a82 fix unit test for bench. remove scoref64 feature. fixed test for lz4 feature. 2020-09-08 07:35:00 +09:00
Paul Masurel
73024a8af3 Fixing compilation of bench and doctests. 2020-09-08 07:18:43 +09:00
Paul Masurel
e70e605fc3 fix unit test (at least on linux) 2020-09-07 23:35:04 +09:00
Paul Masurel
439d6956a9 Returning Result in some of the API (#880)
* Returning Result in some of the API

* Introducing `.writer_for_test(..)`
2020-09-07 15:52:34 +09:00
Paul Masurel
6530bf0eae Make field types less strict when populating documents. 2020-09-06 10:24:03 +09:00
Paul Masurel
151498cbe7 Creating the tempfile for atomicwrites in the same directory as the MmapDirectory. (#878) 2020-09-05 23:06:29 +09:00
Paul Masurel
3a72b1cb98 Accept dash within field names. (#874)
Accept dash in field names and enforce field names constraint at the
creation of the schema.

Closes #796
2020-09-01 13:38:52 +09:00
Paul Masurel
2737822620 Fixing unit tests. (#868)
There was a unit test failing when notify was sending more
than one event on atomicwrites.

It was observed on MacOS CI.
2020-08-27 16:43:39 +09:00
b8591340
06c12ae221 Filter meta.json from validate_checksum (#872) 2020-08-27 07:54:37 +09:00
Paul Masurel
4e4400af7f Added cargo timing report to .gitignore 2020-08-23 16:15:28 +09:00
Paul Masurel
3f1ecf53ab Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-22 21:30:47 +09:00
Paul Masurel
0b583b8130 Plastic changes 2020-08-22 21:29:12 +09:00
Paul Masurel
31d18dca1c Removing dependency to atomicwrites (#866) 2020-08-21 21:37:05 +09:00
stephenlagree
5e06e7de5a Update basic_search.rs (#865)
Remove duplicated document entry.
2020-08-21 11:23:09 +09:00
Paul Masurel
8af53cbd36 Merge branch 'master' of github.com:tantivy-search/tantivy 2020-08-21 08:57:42 +09:00
Paul Masurel
4914076e8f Fixing release build 2020-08-21 08:57:27 +09:00
Paul Masurel
e04f47e922 Using block wand for term queries too. 2020-08-20 15:51:21 +09:00
Paul Masurel
f355695581 Code clean up 2020-08-20 15:42:50 +09:00
Paul Masurel
cbacdf0de8 Edited README. 2020-08-20 14:28:24 +09:00
62 changed files with 820 additions and 644 deletions

1
.gitignore vendored
View File

@@ -12,3 +12,4 @@ cpp/simdcomp/bitpackingbenchmark
*.bk
.idea
trace.dat
cargo-timing*

View File

@@ -1,9 +1,12 @@
Tantivy 0.14.0
=========================
- Removed dependency on atomicwrites #833. (Implemented by @pmasurel upon a suggestion and research from @asafigan.)
Tantivy 0.13.1
======================
===================
Made `Query` and `Collector` `Send + Sync`.
Updated misc dependency versions.
Tantivy 0.13.0
======================
Tantivy 0.13 introduces a change in the index format that will require

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.13.1"
version = "0.14.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -22,8 +22,7 @@ tantivy-fst = "0.3"
memmap = {version = "0.7", optional=true}
lz4 = {version="1", optional=true}
snap = "1"
atomicwrites = {version="0.2", optional=true}
tempfile = "3"
tempfile = {version="3", optional=true}
log = "0.4"
serde = {version="1", features=["derive"]}
serde_json = "1"
@@ -35,10 +34,10 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.7"
futures = {version = "0.3", features=["thread-pool"] }
owning_ref = "0.4"
tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
stable_deref_trait = "1"
rust-stemmers = "1"
downcast-rs = "1"
tantivy-query-grammar = { version="0.13", path="./query-grammar" }
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
census = "0.4"
fnv = "1"
@@ -75,12 +74,11 @@ overflow-checks = true
[features]
default = ["mmap"]
mmap = ["atomicwrites", "fs2", "memmap", "notify"]
mmap = ["fs2", "tempfile", "memmap", "notify"]
lz4-compression = ["lz4"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
scoref64 = [] # scores are f64 instead of f32. was introduced to debug blockwand.
[workspace]
members = ["query-grammar"]

View File

@@ -34,11 +34,6 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
performance for different types of queries / collections.
In general, Tantivy tends to be
- slower than Lucene on union with a Top-K due to Block-WAND optimization.
- faster than Lucene on intersection and phrase queries.
Your mileage WILL vary depending on the nature of queries and their load.
# Features

50
doc/src/index-format.md Normal file
View File

@@ -0,0 +1,50 @@
# Managed files
+----------+-----------+-------------------+
| content | footer | footer_len: u32 |
+----------+-----------+-------------------+
# Term Dictionary (Composite File)
+---------+---------------------------+------------------------+
| fst | term_info_store | footer_len: u64 |
+---------+---------------------------+------------------------+
During a merge, the term info store needs to fit in memory.
It has a cost of n bytes per term.
# term_info_store
+-------------------+---------------------------+------------------------+
| len_block_meta | block_meta | term_infos |
+-------------------+---------------------------+------------------------+
# inverted_index
+------------------------+---------------------------+------------------------+
| total_num_tokens: u64 | posting_lists.. | term_infos |
+------------------------+---------------------------+------------------------+
# postings lists
+------------------------+---------------------------+------------------------+
|
+
# composite file
+----------------+-----+----------------+----------------------+----------------+
| field file 1 | ... | field file n |composite file footer | footer len: u32|
+----------------+-----+----------------+----------------------+----------------+
# composite file footer
+-----------------+---------------------------------------+
|num fields: vint | (file_addr, offset_delta: vint) []... |
+-----------------+---------------------------------------+
# FileAddr
+--------------+--------------+
| field: u32 | idx: VInt |
+--------------+--------------+
# Posting lists
+-----------------------------------------+
| skip_reader
+-----------------------------------------+
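The layouts above share one pattern: a body followed by a footer whose size is stored in a fixed-width integer at the very end of the file, so a reader works backwards from the tail. A minimal Rust sketch of that pattern (illustration only, not tantivy's code; `split_footer` is a hypothetical helper and the little-endian u32 length is an assumption matching the managed-file and composite-file diagrams, while the real on-disk encoding is whatever tantivy's BinarySerializable produces):

use std::convert::TryInto;

/// Split `data` into (content, footer), given a trailing `footer_len: u32`.
fn split_footer(data: &[u8]) -> Option<(&[u8], &[u8])> {
    // The last four bytes hold the footer length.
    let (rest, len_bytes) = data.split_at(data.len().checked_sub(4)?);
    let footer_len = u32::from_le_bytes(len_bytes.try_into().ok()?) as usize;
    // The footer sits immediately before the length field.
    let (content, footer) = rest.split_at(rest.len().checked_sub(footer_len)?);
    Some((content, footer))
}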

View File

@@ -112,18 +112,6 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.13.0"
version = "0.14.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -52,7 +52,7 @@ mod test {
use crate::Occur;
#[test]
fn test_Occur_compose() {
fn test_occur_compose() {
assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should);
assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
assert_eq!(

View File

@@ -9,8 +9,10 @@ use combine::{
fn field<'a>() -> impl Parser<&'a str, Output = String> {
(
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
(letter().or(char('_'))),
many(satisfy(|c: char| {
c.is_alphanumeric() || c == '_' || c == '-'
})),
)
.skip(char(':'))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
@@ -279,6 +281,8 @@ pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
#[cfg(test)]
mod test {
type TestParseResult = Result<(), StringStreamError>;
use super::*;
use combine::parser::Parser;
@@ -296,9 +300,10 @@ mod test {
}
#[test]
fn test_occur_symbol() {
assert_eq!(super::occur_symbol().parse("-"), Ok((Occur::MustNot, "")));
assert_eq!(super::occur_symbol().parse("+"), Ok((Occur::Must, "")));
fn test_occur_symbol() -> TestParseResult {
assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, ""));
assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, ""));
Ok(())
}
#[test]
@@ -410,6 +415,25 @@ mod test {
assert_eq!(format!("{:?}", ast), "\"abc\"");
}
#[test]
fn test_field_name() -> TestParseResult {
assert_eq!(
super::field().parse("my-field-name:a")?,
("my-field-name".to_string(), "a")
);
assert_eq!(
super::field().parse("my_field_name:a")?,
("my_field_name".to_string(), "a")
);
assert!(super::field().parse(":a").is_err());
assert!(super::field().parse("-my_field:a").is_err());
assert_eq!(
super::field().parse("_my_field:a")?,
("_my_field".to_string(), "a")
);
Ok(())
}
#[test]
fn test_range_parser() {
// testing the range() parser separately

View File

@@ -472,7 +472,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -531,7 +531,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facets");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/subjects/A/a"),
facet_field => Facet::from_text(&"/subjects/B/a"),
@@ -550,12 +550,12 @@ mod tests {
}
#[test]
fn test_doc_search_by_facet() {
fn test_doc_search_by_facet() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/A"),
));
@@ -568,8 +568,8 @@ mod tests {
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/D/C/A"),
));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 4);
@@ -586,17 +586,17 @@ mod tests {
assert_eq!(count_facet("/A/C"), 1);
assert_eq!(count_facet("/A/C/A"), 1);
assert_eq!(count_facet("/C/A"), 0);
let query_parser = QueryParser::for_index(&index, vec![]);
{
let query_parser = QueryParser::for_index(&index, vec![]);
{
let query = query_parser.parse_query("facet:/A/B").unwrap();
assert_eq!(1, searcher.search(&query, &Count).unwrap());
}
{
let query = query_parser.parse_query("facet:/A").unwrap();
assert_eq!(3, searcher.search(&query, &Count).unwrap());
}
let query = query_parser.parse_query("facet:/A/B")?;
assert_eq!(1, searcher.search(&query, &Count).unwrap());
}
{
let query = query_parser.parse_query("facet:/A")?;
assert_eq!(3, searcher.search(&query, &Count)?);
}
Ok(())
}
#[test]
@@ -631,7 +631,7 @@ mod tests {
.collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc);
}
@@ -684,7 +684,7 @@ mod bench {
// 40425 docs
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc);
}

View File

@@ -89,7 +89,7 @@ mod tests {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
for i in 0u64..10u64 {
index_writer.add_document(doc!(

View File

@@ -259,7 +259,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"));

View File

@@ -38,7 +38,7 @@ use std::fmt;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -123,7 +123,7 @@ impl TopDocs {
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
@@ -163,7 +163,7 @@ impl TopDocs {
/// # let schema = schema_builder.build();
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
@@ -264,7 +264,7 @@ impl TopDocs {
/// fn create_index() -> tantivy::Result<Index> {
/// let schema = create_schema();
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
@@ -371,7 +371,7 @@ impl TopDocs {
/// # fn main() -> tantivy::Result<()> {
/// # let schema = create_schema();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # let product_name = index.schema().get_field("product_name").unwrap();
/// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
@@ -561,7 +561,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.add_document(doc!(text_field=>"I like Droopy"));
@@ -821,7 +821,7 @@ mod tests {
) -> (Index, Box<dyn Query>) {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
doc_adder(&mut index_writer);
index_writer.commit().unwrap();
let query_parser = QueryParser::for_index(&index, vec![query_field]);

View File

@@ -300,6 +300,15 @@ impl Index {
)
}
/// Helper to create an index writer for tests.
///
/// That index writer has a single thread and a heap of 10 MB.
/// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)]
pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
self.writer_with_num_threads(1, 10_000_000)
}
/// Creates a multithreaded writer
///
/// Tantivy will automatically define the number of threads to use.
@@ -502,7 +511,7 @@ mod tests {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let mut index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut writer = index.writer_for_tests().unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
@@ -539,23 +548,33 @@ mod tests {
test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
}
}
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
let mut reader_index = reader.index();
let (sender, receiver) = crossbeam::channel::unbounded();
let _watch_handle = reader_index.directory_mut().watch(Box::new(move || {
let _ = sender.send(());
}));
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut writer = index.writer_for_tests().unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
assert!(receiver.recv().is_ok());
assert_eq!(reader.searcher().num_docs(), 1);
// We need a loop here because it is possible for notify to send more than
// one modify event. It was observed on CI on MacOS.
loop {
assert!(receiver.recv().is_ok());
if reader.searcher().num_docs() == 1 {
break;
}
}
writer.add_document(doc!(field=>2u64));
writer.commit().unwrap();
assert!(receiver.recv().is_ok());
assert_eq!(reader.searcher().num_docs(), 2);
// ... Same as above
loop {
assert!(receiver.recv().is_ok());
if reader.searcher().num_docs() == 2 {
break;
}
}
}
// This test will not pass on windows, because windows

View File

@@ -116,6 +116,7 @@ impl SegmentMeta {
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
SegmentComponent::FIELDSTATS => ".fieldstats".to_string(),
});
PathBuf::from(path)
}

View File

@@ -1,9 +1,7 @@
use crate::common::BinarySerializable;
use crate::directory::ReadOnlySource;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::FieldType;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::termdict::TermDictionary;
@@ -37,14 +35,12 @@ impl InvertedIndexReader {
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
positions_idx_source: ReadOnlySource,
total_num_tokens: u64,
record_option: IndexRecordOption,
) -> InvertedIndexReader {
let total_num_tokens_data = postings_source.slice(0, 8);
let mut total_num_tokens_cursor = total_num_tokens_data.as_slice();
let total_num_tokens = u64::deserialize(&mut total_num_tokens_cursor).unwrap_or(0u64);
InvertedIndexReader {
termdict,
postings_source: postings_source.slice_from(8),
postings_source,
positions_source,
positions_idx_source,
record_option,
@@ -54,10 +50,7 @@ impl InvertedIndexReader {
/// Creates an empty `InvertedIndexReader` object, which
/// contains no terms at all.
pub fn empty(field_type: &FieldType) -> InvertedIndexReader {
let record_option = field_type
.get_index_record_option()
.unwrap_or(IndexRecordOption::Basic);
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionary::empty(),
postings_source: ReadOnlySource::empty(),
@@ -93,7 +86,7 @@ impl InvertedIndexReader {
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings,
) {
let offset = term_info.postings_offset as usize;
let offset = term_info.postings_start_offset as usize;
let end_source = self.postings_source.len();
let postings_slice = self.postings_source.slice(offset, end_source);
block_postings.reset(term_info.doc_freq, postings_slice);
@@ -121,8 +114,10 @@ impl InvertedIndexReader {
term_info: &TermInfo,
requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = self.postings_source.slice_from(offset);
let postings_data = self.postings_source.slice(
term_info.postings_start_offset as usize,
term_info.postings_end_offset as usize,
);
BlockSegmentPostings::from_data(
term_info.doc_freq,
postings_data,

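This slicing change is what the top commit message describes: previously `TermInfo` stored only a start offset, so a term's postings slice ran from that offset to the end of the postings data; with both a start and an end offset, the reader takes exactly the bytes of one posting list. A rough illustration with plain byte slices (hypothetical helper names, not tantivy's `ReadOnlySource` API):

// Before: only the start offset is known, so the slice extends to the end of the data.
fn loose_slice(postings: &[u8], start: usize) -> &[u8] {
    &postings[start..]
}

// After: start and end offsets bound the slice to a single posting list.
fn tight_slice(postings: &[u8], start: usize, end: usize) -> &[u8] {
    &postings[start..end]
}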
View File

@@ -24,14 +24,17 @@ pub enum SegmentComponent {
/// Accessing a document from the store is relatively slow, as it
/// requires decompressing the entire block it belongs to.
STORE,
/// Bitset describing which documents of the segment are deleted.
DELETE,
FIELDSTATS,
}
impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::POSITIONSSKIP,
@@ -40,6 +43,7 @@ impl SegmentComponent {
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE,
SegmentComponent::FIELDSTATS,
];
SEGMENT_COMPONENTS.iter()
}

View File

@@ -1,4 +1,3 @@
use crate::common::CompositeFile;
use crate::common::HasLen;
use crate::core::InvertedIndexReader;
use crate::core::Segment;
@@ -9,13 +8,14 @@ use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
use crate::schema::{Field, IndexRecordOption};
use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::DocId;
use crate::{common::CompositeFile, postings::FieldStats};
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;
@@ -49,6 +49,7 @@ pub struct SegmentReader {
positions_idx_composite: CompositeFile,
fast_fields_readers: Arc<FastFieldReaders>,
fieldnorm_readers: FieldNormReaders,
field_stats: FieldStats,
store_source: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>,
@@ -125,17 +126,15 @@ impl SegmentReader {
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
fieldnorm_reader
} else {
pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> {
self.fieldnorm_readers.get_field(field).ok_or_else(|| {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
"Field norm not found for field {:?}. Was it marked as indexed during indexing?",
field_name
);
panic!(err_msg);
}
crate::TantivyError::SchemaError(err_msg)
})
}
/// Accessor to the segment's `StoreReader`.
@@ -181,6 +180,9 @@ impl SegmentReader {
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let field_stats_data = segment.open_read(SegmentComponent::FIELDSTATS)?;
let field_stats = FieldStats::from_source(field_stats_data.as_slice())?;
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
Some(DeleteBitSet::open(delete_data))
@@ -196,6 +198,7 @@ impl SegmentReader {
postings_composite,
fast_fields_readers: fast_field_readers,
fieldnorm_readers,
field_stats,
segment_id: segment.id(),
store_source,
delete_bitset_opt,
@@ -212,6 +215,11 @@ impl SegmentReader {
/// The field reader is in charge of iterating through the
/// term dictionary associated to a specific field,
/// and opening the posting list associated to any term.
///
/// If the field is not marked as indexed, a warning is logged and an empty `InvertedIndexReader`
/// is returned.
/// Similarly, if the field is marked as indexed but no term has been indexed for the given
/// field in this segment, an empty `InvertedIndexReader` is returned (but no warning is logged).
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
if let Some(inv_idx_reader) = self
.inv_idx_reader_cache
@@ -226,21 +234,21 @@ impl SegmentReader {
let record_option_opt = field_type.get_index_record_option();
if record_option_opt.is_none() {
panic!("Field {:?} does not seem indexed.", field_entry.name());
warn!("Field {:?} does not seem indexed.", field_entry.name());
}
let record_option = record_option_opt.unwrap();
let postings_source_opt = self.postings_composite.open_read(field);
if postings_source_opt.is_none() {
if postings_source_opt.is_none() || record_option_opt.is_none() {
// no documents in the segment contained this field.
// As a result, no data is associated to the inverted index.
//
// Returns an empty inverted index.
return Arc::new(InvertedIndexReader::empty(field_type));
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
return Arc::new(InvertedIndexReader::empty(record_option));
}
let record_option = record_option_opt.unwrap();
let postings_source = postings_source_opt.unwrap();
let termdict_source = self.termdict_composite.open_read(field).expect(
@@ -257,11 +265,17 @@ impl SegmentReader {
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
let total_num_tokens = self
.field_stats
.get(field)
.map(|field_stat| field_stat.num_tokens())
.unwrap_or(0u64);
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
TermDictionary::from_source(&termdict_source),
postings_source,
positions_source,
positions_idx_source,
total_num_tokens,
record_option,
));
@@ -339,7 +353,7 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));

View File

@@ -271,7 +271,11 @@ mod tests {
let mut vec = Vec::new();
let footer_proxy = FooterProxy::new(&mut vec);
assert!(footer_proxy.terminate().is_ok());
assert_eq!(vec.len(), 167);
if crate::store::COMPRESSION == "lz4" {
assert_eq!(vec.len(), 158);
} else {
assert_eq!(vec.len(), 167);
}
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
assert!(matches!(
footer.versioned_footer,

View File

@@ -1,4 +1,4 @@
use crate::core::MANAGED_FILEPATH;
use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::DirectoryLock;
@@ -246,13 +246,15 @@ impl ManagedDirectory {
/// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut hashset = HashSet::new();
let managed_paths = self
let mut managed_paths = self
.meta_informations
.read()
.expect("Managed directory rlock poisoned in list damaged.")
.managed_paths
.clone();
managed_paths.remove(*META_FILEPATH);
for path in managed_paths.into_iter() {
if !self.validate_checksum(&path)? {
hashset.insert(path);

View File

@@ -487,11 +487,13 @@ impl Directory for MmapDirectory {
}
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
fn atomic_write(&mut self, path: &Path, content: &[u8]) -> io::Result<()> {
debug!("Atomic Write {:?}", path);
let mut tempfile = tempfile::Builder::new().tempfile_in(&self.inner.root_path)?;
tempfile.write_all(content)?;
tempfile.flush()?;
let full_path = self.resolve_path(path);
let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
meta_file.write(|f| f.write_all(data))?;
tempfile.into_temp_path().persist(full_path)?;
Ok(())
}
@@ -652,7 +654,7 @@ mod tests {
{
let index = Index::create(mmap_directory.clone(), schema).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_merge_size(3);
index_writer.set_merge_policy(Box::new(log_merge_policy));

View File

@@ -211,19 +211,19 @@ fn test_watch(directory: &mut dyn Directory) {
.unwrap();
for i in 0..10 {
assert_eq!(i, counter.load(SeqCst));
assert!(i <= counter.load(SeqCst));
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data_2")
.is_ok());
assert_eq!(receiver.recv_timeout(Duration::from_millis(500)), Ok(i));
assert_eq!(i + 1, counter.load(SeqCst));
assert!(i + 1 <= counter.load(SeqCst)); // notify can trigger more than once.
}
mem::drop(watch_handle);
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok());
assert!(receiver.recv_timeout(Duration::from_millis(500)).is_ok());
assert_eq!(10, counter.load(SeqCst));
assert!(10 <= counter.load(SeqCst));
}
fn test_lock_non_blocking(directory: &mut dyn Directory) {

View File

@@ -15,7 +15,7 @@ mod tests {
let field = schema_builder.add_bytes_field("bytesfield");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
index_writer.add_document(doc!(field=>vec![]));
index_writer.add_document(doc!(field=>vec![255u8]));

View File

@@ -474,7 +474,7 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
index_writer.commit().unwrap();
@@ -511,7 +511,7 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(1i64.to_u64()),

View File

@@ -25,7 +25,7 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>1u64, field=>3u64));
index_writer.add_document(doc!());
index_writer.add_document(doc!(field=>4u64));
@@ -64,7 +64,7 @@ mod tests {
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let first_time_stamp = chrono::Utc::now();
index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
@@ -186,7 +186,7 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=> 1i64, field => 3i64));
index_writer.add_document(doc!());
index_writer.add_document(doc!(field=> -4i64));
@@ -221,7 +221,7 @@ mod tests {
let field = schema_builder.add_facet_field("facetfield");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..100_000 {
index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
}

View File

@@ -74,7 +74,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_with_num_threads(1, 30_000_000)
.writer_for_tests()
.expect("Failed to create index writer.");
index_writer.add_document(doc!(
facet_field => Facet::from("/category/cat2"),

View File

@@ -800,7 +800,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let index_writer = index.writer_for_tests().unwrap();
let operations = vec![
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
@@ -815,7 +815,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "hello1"));
index_writer.add_document(doc!(text_field => "hello2"));
assert!(index_writer.commit().is_ok());
@@ -864,7 +864,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
@@ -926,8 +926,8 @@ mod tests {
fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
match index.writer_with_num_threads(1, 3_000_000) {
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`"));
@@ -1261,7 +1261,7 @@ mod tests {
let idfield = schema_builder.add_text_field("id", STRING);
schema_builder.add_text_field("optfield", STRING);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(idfield=>"myid"));
let commit = index_writer.commit();
assert!(commit.is_ok());

View File

@@ -25,14 +25,14 @@ use std::cmp;
use std::collections::HashMap;
use std::sync::Arc;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256];
for reader in readers {
if reader.has_deletes() {
// if there are deletes, then we use an approximation
// using the fieldnorm
let fieldnorms_reader = reader.get_fieldnorms_reader(field);
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1;
@@ -41,7 +41,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
total_tokens
Ok(total_tokens
+ count
.iter()
.cloned()
@@ -49,7 +49,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
})
.sum::<u64>()
.sum::<u64>())
}
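In words, the helper above estimates a segment's token count from its fieldnorms when deletes are present: each live document contributes id_to_fieldnorm(b) tokens, where b is its quantized field-length bucket, so total_num_tokens ≈ (exact counts from delete-free segments) + Σ_b count[b] × id_to_fieldnorm(b). Because the fieldnorm is a lossy encoding of the document length, the merge comment further down calls the result an approximation.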
pub struct IndexMerger {
@@ -175,7 +175,7 @@ impl IndexMerger {
for field in fields {
fieldnorms_data.clear();
for reader in &self.readers {
let fieldnorms_reader = reader.get_fieldnorms_reader(field);
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc_id in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id);
fieldnorms_data.push(fieldnorm_id);
@@ -541,7 +541,7 @@ impl IndexMerger {
// The total number of tokens will only be exact when there have been no deletes.
//
// Otherwise, we approximate by removing deleted documents proportionally.
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field);
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
// Create the total list of doc ids
// by stacking the doc ids from the different segment.
@@ -751,7 +751,7 @@ mod tests {
};
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
// writing the segment
{
@@ -803,7 +803,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
}
@@ -904,7 +904,7 @@ mod tests {
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let reader = index.reader().unwrap();
let search_term = |searcher: &Searcher, term: Term| {
let collector = FastFieldTestCollector::for_field(score_field);
@@ -1211,7 +1211,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
@@ -1276,7 +1276,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
@@ -1295,7 +1295,7 @@ mod tests {
// Deleting one term
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
@@ -1320,7 +1320,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
@@ -1349,7 +1349,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut doc = Document::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone());
@@ -1388,7 +1388,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default();
for &val in int_vals {
@@ -1462,7 +1462,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -1516,7 +1516,7 @@ mod tests {
let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
let mut writer = index.writer_for_tests()?;
// Make sure we'll attempt to merge every created segment
let mut policy = crate::indexer::LogMergePolicy::default();
@@ -1548,7 +1548,7 @@ mod tests {
let mut builder = schema::SchemaBuilder::new();
let text = builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(builder.build());
let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
let mut writer = index.writer_for_tests()?;
let happy_term = Term::from_field_text(text, "happy");
let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs);
for _ in 0..62 {

View File

@@ -29,8 +29,9 @@ pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;
#[cfg(feature = "mmap")]
#[cfg(test)]
mod tests {
mod tests_mmap {
use crate::schema::{self, Schema};
use crate::{Index, Term};
@@ -39,7 +40,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b"));

View File

@@ -555,7 +555,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{
@@ -608,7 +608,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
for _ in 0..100 {
@@ -679,7 +679,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
for _ in 0..100 {

View File

@@ -17,7 +17,6 @@ use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp;
use crate::{DocId, SegmentComponent};
use std::io;
use std::str;
/// Computes the initial size of the hash table.
///
@@ -48,6 +47,7 @@ pub struct SegmentWriter {
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<TextAnalyzer>>,
term_buffer: Term,
}
impl SegmentWriter {
@@ -91,6 +91,7 @@ impl SegmentWriter {
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
tokenizers,
term_buffer: Term::new(),
})
}
@@ -128,24 +129,26 @@ impl SegmentWriter {
if !field_options.is_indexed() {
continue;
}
let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings);
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
let mut term = Term::for_field(field); // we set the Term
term_buffer.set_field(field);
let facets =
field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
});
for fake_str in facets {
let mut unordered_term_id_opt = None;
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
term_buffer.set_text(&token.text);
let unordered_term_id =
self.multifield_postings.subscribe(doc_id, &term);
multifield_postings.subscribe(doc_id, &term_buffer);
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
@@ -168,7 +171,6 @@ impl SegmentWriter {
if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to;
}
token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into());
}
@@ -178,7 +180,6 @@ impl SegmentWriter {
{
offsets.push(total_offset);
total_offset += text.len();
token_streams.push(tokenizer.token_stream(text));
}
}
@@ -190,8 +191,12 @@ impl SegmentWriter {
0
} else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
multifield_postings.index_text(
doc_id,
field,
&mut token_stream,
term_buffer,
)
};
self.fieldnorms_writer.record(doc_id, field, num_tokens);
@@ -199,44 +204,36 @@ impl SegmentWriter {
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u64(
field_value.field(),
field_value.value().u64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_u64(field_value.value().u64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().date_value().timestamp(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_i64(field_value.value().date_value().timestamp());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().i64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_i64(field_value.value().i64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::F64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_f64(
field_value.field(),
field_value.value().f64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_f64(field_value.value().f64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}

View File

@@ -245,18 +245,10 @@ pub type DocId = u32;
/// with opstamp `n+1`.
pub type Opstamp = u64;
/// A Score that represents the relevance of the document to the query
///
/// This is modelled internally as a `f64`, because tantivy was compiled with the `scoref64`
/// feature. The larger the number, the more relevant the document is to the search query.
#[cfg(feature = "scoref64")]
pub type Score = f64;
/// A Score that represents the relevance of the document to the query
///
/// This is modelled internally as a `f32`. The larger the number, the more relevant
/// the document to the search query.
#[cfg(not(feature = "scoref64"))]
pub type Score = f32;
/// A `SegmentLocalId` identifies a segment.
@@ -296,7 +288,6 @@ mod tests {
use crate::schema::*;
use crate::DocAddress;
use crate::Index;
use crate::IndexWriter;
use crate::Postings;
use crate::ReloadPolicy;
use rand::distributions::Bernoulli;
@@ -361,14 +352,14 @@ mod tests {
#[test]
#[cfg(feature = "mmap")]
fn test_indexing() {
fn test_indexing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
@@ -383,29 +374,30 @@ mod tests {
}
assert!(index_writer.commit().is_ok());
}
Ok(())
}
#[test]
fn test_docfreq1() {
fn test_docfreq1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
{
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3);
@@ -416,67 +408,50 @@ mod tests {
let term_d = Term::from_field_text(text_field, "d");
assert_eq!(searcher.doc_freq(&term_d), 0);
}
Ok(())
}
#[test]
fn test_fieldnorm_no_docs_with_field() {
fn test_fieldnorm_no_docs_with_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?;
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
}
{
let index_reader = index.reader().unwrap();
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
{
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
}
{
let fieldnorm_reader = reader.get_fieldnorms_reader(title_field);
assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
}
let fieldnorm_reader = reader.get_fieldnorms_reader(title_field)?;
assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
}
Ok(())
}
#[test]
fn test_fieldnorm() {
fn test_fieldnorm() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
}
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!());
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
Ok(())
}
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
@@ -491,7 +466,7 @@ mod tests {
}
#[test]
fn test_delete_postings1() {
fn test_delete_postings1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let term_abcd = Term::from_field_text(text_field, "abcd");
@@ -507,7 +482,7 @@ mod tests {
.unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
@@ -523,10 +498,10 @@ mod tests {
index_writer.add_document(doc!(text_field=>" b c"));
// 5
index_writer.add_document(doc!(text_field=>" a"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
reader.reload().unwrap();
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(text_field);
@@ -554,15 +529,15 @@ mod tests {
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback().unwrap();
index_writer.rollback()?;
}
{
reader.reload().unwrap();
reader.reload()?;
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inverted_index = seg_reader.inverted_index(term_abcd.field());
@@ -591,15 +566,15 @@ mod tests {
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback().unwrap();
index_writer.rollback()?;
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
reader.reload().unwrap();
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(term_abcd.field());
@@ -631,19 +606,20 @@ mod tests {
assert!(!advance_undeleted(&mut postings, segment_reader));
}
}
Ok(())
}
#[test]
fn test_indexed_u64() {
fn test_indexed_u64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
@@ -653,20 +629,21 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
Ok(())
}
#[test]
fn test_indexed_i64() {
fn test_indexed_i64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_i64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
@@ -676,20 +653,21 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
Ok(())
}
#[test]
fn test_indexed_f64() {
fn test_indexed_f64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_f64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let term = Term::from_field_f64(value_field, val);
let mut postings = searcher
@@ -699,26 +677,29 @@ mod tests {
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
Ok(())
}
#[test]
fn test_indexedfield_not_in_documents() {
fn test_indexedfield_not_in_documents() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
segment_reader.inverted_index(absent_field); //< should not panic
let inverted_index = segment_reader.inverted_index(absent_field); //< should not panic
assert_eq!(inverted_index.terms().num_terms(), 0);
Ok(())
}
#[test]
fn test_delete_postings2() {
fn test_delete_postings2() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -726,53 +707,40 @@ mod tests {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;
// writing the segment
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
let doc = doc!(text_field=>val);
index_writer.add_document(doc);
};
let remove_document = |index_writer: &mut IndexWriter, val: &'static str| {
let delterm = Term::from_field_text(text_field, val);
index_writer.delete_term(delterm);
};
add_document(&mut index_writer, "63");
add_document(&mut index_writer, "70");
add_document(&mut index_writer, "34");
add_document(&mut index_writer, "1");
add_document(&mut index_writer, "38");
add_document(&mut index_writer, "33");
add_document(&mut index_writer, "40");
add_document(&mut index_writer, "17");
remove_document(&mut index_writer, "38");
remove_document(&mut index_writer, "34");
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 6);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"));
index_writer.add_document(doc!(text_field=>"70"));
index_writer.add_document(doc!(text_field=>"34"));
index_writer.add_document(doc!(text_field=>"1"));
index_writer.add_document(doc!(text_field=>"38"));
index_writer.add_document(doc!(text_field=>"33"));
index_writer.add_document(doc!(text_field=>"40"));
index_writer.add_document(doc!(text_field=>"17"));
index_writer.delete_term(Term::from_field_text(text_field, "38"));
index_writer.delete_term(Term::from_field_text(text_field, "34"));
index_writer.commit()?;
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 6);
Ok(())
}
#[test]
fn test_termfreq() {
fn test_termfreq() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let doc = doc!(text_field=>"af af af bc bc");
index_writer.add_document(doc);
index_writer.commit().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"));
index_writer.commit()?;
}
{
let index_reader = index.reader().unwrap();
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
@@ -788,63 +756,63 @@ mod tests {
assert_eq!(postings.term_freq(), 3);
assert_eq!(postings.advance(), TERMINATED);
}
Ok(())
}
#[test]
fn test_searcher_1() {
fn test_searcher_1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index.reader().unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
let topdocs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap();
topdocs.docs().to_vec()
};
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![DocAddress(0, 0)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
]),
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
}
let reader = index.reader()?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?;
reader.reload()?;
let searcher = reader.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
searcher
.search(&query, &TEST_COLLECTOR_WITH_SCORE)
.map(|topdocs| topdocs.docs().to_vec())
};
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")])?,
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")])?,
vec![DocAddress(0, 0)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")])?,
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "c")])?,
vec![DocAddress(0, 1), DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "d")])?,
vec![DocAddress(0, 2)]
);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
])?,
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
);
Ok(())
}
#[test]
fn test_searcher_2() {
fn test_searcher_2() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -852,19 +820,17 @@ mod tests {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0u64);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit().unwrap();
}
reader.reload().unwrap();
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?;
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 3u64);
Ok(())
}
#[test]
@@ -886,7 +852,7 @@ mod tests {
}
#[test]
fn test_wrong_fast_field_type() {
fn test_wrong_fast_field_type() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
@@ -896,14 +862,14 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
{
let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document);
index_writer.commit().unwrap();
index_writer.commit()?;
}
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
@@ -942,11 +908,12 @@ mod tests {
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4f64)
}
Ok(())
}
// motivated by #729
#[test]
fn test_update_via_delete_insert() {
fn test_update_via_delete_insert() -> crate::Result<()> {
use crate::collector::Count;
use crate::indexer::NoMergePolicy;
use crate::query::AllQuery;
@@ -960,17 +927,17 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index_reader = index.reader().unwrap();
let index_reader = index.reader()?;
let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT {
index_writer.add_document(doc!(id => doc_id));
}
index_writer.commit().unwrap();
index_writer.commit()?;
index_reader.reload().unwrap();
index_reader.reload()?;
let searcher = index_reader.searcher();
assert_eq!(
@@ -981,12 +948,11 @@ mod tests {
// update the 10 elements by deleting and re-adding
for doc_id in 0u64..DOC_COUNT {
index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit().unwrap();
index_reader.reload().unwrap();
let doc = doc!(id => doc_id);
index_writer.add_document(doc);
index_writer.commit().unwrap();
index_reader.reload().unwrap();
index_writer.commit()?;
index_reader.reload()?;
index_writer.add_document(doc!(id => doc_id));
index_writer.commit()?;
index_reader.reload()?;
let searcher = index_reader.searcher();
// The number of documents should be stable.
assert_eq!(
@@ -995,7 +961,7 @@ mod tests {
);
}
index_reader.reload().unwrap();
index_reader.reload()?;
let searcher = index_reader.searcher();
let segment_ids: Vec<SegmentId> = searcher
.segment_readers()
@@ -1004,12 +970,18 @@ mod tests {
.collect();
block_on(index_writer.merge(&segment_ids)).unwrap();
index_reader.reload().unwrap();
index_reader.reload()?;
let searcher = index_reader.searcher();
assert_eq!(searcher.search(&AllQuery, &Count)?, DOC_COUNT as usize);
Ok(())
}
assert_eq!(
searcher.search(&AllQuery, &Count).unwrap(),
DOC_COUNT as usize
);
#[test]
fn test_validate_checksum() -> crate::Result<()> {
let index_path = tempfile::tempdir().expect("dir");
let schema = Schema::builder().build();
let index = Index::create_in_dir(&index_path, schema)?;
assert!(index.validate_checksum()?.is_empty());
Ok(())
}
}

View File

@@ -455,7 +455,7 @@ mod tests {
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for _ in last_doc..doc {
@@ -496,7 +496,7 @@ mod tests {
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// create two posting lists, one containing even numbers,
// the other containing odd numbers.
for i in 0..6 {

View File

@@ -310,6 +310,7 @@ pub mod tests {
mod bench {
use super::*;
use crate::TERMINATED;
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
@@ -340,7 +341,7 @@ mod bench {
let mut encoder = BlockEncoder::new();
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
let mut decoder = BlockDecoder::default();
b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
});
@@ -375,9 +376,9 @@ mod bench {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
let mut decoder = BlockDecoder::default();
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT, TERMINATED);
});
}
}

View File

@@ -5,6 +5,7 @@ Postings module (also called inverted index)
mod block_search;
mod block_segment_postings;
pub(crate) mod compression;
mod field_stats;
mod postings;
mod postings_writer;
mod recorder;
@@ -15,6 +16,7 @@ mod stacker;
mod term_info;
pub(crate) use self::block_search::BlockSearcher;
pub(crate) use self::field_stats::{FieldStat, FieldStats};
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
@@ -91,7 +93,7 @@ pub mod tests {
let title = schema_builder.add_text_field("title", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(title => r#"abc abc abc"#));
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
for _ in 0..1_000 {
@@ -176,7 +178,7 @@ pub mod tests {
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
{
index_writer.add_document(doc!(text_field=>exceeding_token_text));
@@ -205,7 +207,7 @@ pub mod tests {
}
#[test]
pub fn test_position_and_fieldnorm1() {
pub fn test_position_and_fieldnorm1() -> crate::Result<()> {
let mut positions = Vec::new();
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
@@ -217,42 +219,38 @@ pub mod tests {
let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// check that positions work when the field has two values
doc.add_text(text_field, "a b a c a d a a.");
doc.add_text(text_field, "d d d d a");
let op = AddOperation {
opstamp: 0u64,
document: doc,
document: doc!(
text_field => "a b a c a d a a.",
text_field => "d d d d a"
),
};
segment_writer.add_document(op, &schema).unwrap();
segment_writer.add_document(op, &schema)?;
}
{
let mut doc = Document::default();
doc.add_text(text_field, "b a");
let op = AddOperation {
opstamp: 1u64,
document: doc,
document: doc!(text_field => "b a"),
};
segment_writer.add_document(op, &schema).unwrap();
}
for i in 2..1000 {
let mut doc = Document::default();
let mut text = iter::repeat("e ").take(i).collect::<String>();
let mut text: String = iter::repeat("e ").take(i).collect();
text.push_str(" a");
doc.add_text(text_field, &text);
let op = AddOperation {
opstamp: 2u64,
document: doc,
document: doc!(text_field => text),
};
segment_writer.add_document(op, &schema).unwrap();
}
segment_writer.finalize().unwrap();
segment_writer.finalize()?;
}
{
let segment_reader = SegmentReader::open(&segment).unwrap();
let segment_reader = SegmentReader::open(&segment)?;
{
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field);
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
for i in 2..1000 {
@@ -312,6 +310,7 @@ pub mod tests {
assert_eq!(postings_e.doc(), TERMINATED);
}
}
Ok(())
}
#[test]
@@ -322,7 +321,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "g b b d c g c"));
index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
assert!(index_writer.commit().is_ok());
@@ -354,7 +353,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc);
@@ -425,7 +424,7 @@ pub mod tests {
// delete some of the documents
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
@@ -479,7 +478,7 @@ pub mod tests {
// delete everything else
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
@@ -522,7 +521,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size {
let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) {
@@ -730,7 +729,7 @@ mod bench {
let mut s = 0u32;
while segment_postings.doc() != TERMINATED {
s += (segment_postings.doc() & n) % 1024;
segment_postings.advance()
segment_postings.advance();
}
s
});

View File

@@ -105,6 +105,7 @@ impl MultiFieldPostingsWriter {
doc: DocId,
field: Field,
token_stream: &mut dyn TokenStream,
term_buffer: &mut Term,
) -> u32 {
let postings_writer =
self.per_field_postings_writers[field.field_id() as usize].deref_mut();
@@ -114,6 +115,7 @@ impl MultiFieldPostingsWriter {
field,
token_stream,
&mut self.heap,
term_buffer,
)
}
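The `term_buffer` threaded through these calls lets the indexer reuse a single `Term` allocation across documents instead of building a fresh one per document. A minimal self-contained model of the idea (hypothetical `TermBuffer` type, not tantivy's real `Term`):

// Model of the buffer-reuse pattern: the 4-byte field id prefix is rewritten once,
// then the value bytes are overwritten in place for every token.
struct TermBuffer(Vec<u8>);

impl TermBuffer {
    fn set_field(&mut self, field_id: u32) {
        self.0.clear();
        self.0.extend_from_slice(&field_id.to_be_bytes());
    }
    fn set_text(&mut self, text: &str) {
        self.0.truncate(4); // keep the field id prefix
        self.0.extend_from_slice(text.as_bytes());
    }
}

fn main() {
    let mut term_buffer = TermBuffer(Vec::with_capacity(100));
    term_buffer.set_field(1);
    for token in ["hello", "world"] {
        term_buffer.set_text(token); // no new allocation per token
        assert_eq!(term_buffer.0[..4], 1u32.to_be_bytes()[..]);
        assert_eq!(&term_buffer.0[4..], token.as_bytes());
    }
}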
@@ -220,13 +222,20 @@ pub trait PostingsWriter {
field: Field,
token_stream: &mut dyn TokenStream,
heap: &mut MemoryArena,
term_buffer: &mut Term,
) -> u32 {
let mut term = Term::for_field(field);
term_buffer.set_field(field);
let mut sink = |token: &Token| {
// We skip all tokens with a len greater than u16.
if token.text.len() <= MAX_TOKEN_LEN {
term.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
term_buffer.set_text(token.text.as_str());
self.subscribe(
term_index,
doc_id,
token.position as u32,
&term_buffer,
heap,
);
} else {
info!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \

View File

@@ -114,7 +114,7 @@ impl SegmentPostings {
.iter()
.map(|&fieldnorm| fieldnorm as u64)
.sum::<u64>();
total_num_tokens as Score / fieldnorms.len() as f32
total_num_tokens as Score / fieldnorms.len() as Score
})
.unwrap_or(0.0);
let mut postings_serializer = PostingsSerializer::new(

View File

@@ -1,5 +1,4 @@
use super::TermInfo;
use crate::common::{BinarySerializable, VInt};
use super::{FieldStat, FieldStats, TermInfo};
use crate::common::{CompositeWrite, CountingWriter};
use crate::core::Segment;
use crate::directory::WritePtr;
@@ -11,6 +10,10 @@ use crate::query::BM25Weight;
use crate::schema::Schema;
use crate::schema::{Field, FieldEntry, FieldType};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{
common::{BinarySerializable, VInt},
directory::TerminatingWrite,
};
use crate::{DocId, Score};
use std::cmp::Ordering;
use std::io::{self, Write};
@@ -51,6 +54,8 @@ pub struct InvertedIndexSerializer {
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
positionsidx_write: CompositeWrite<WritePtr>,
field_stats: FieldStats,
field_stats_write: WritePtr,
schema: Schema,
}
@@ -61,6 +66,7 @@ impl InvertedIndexSerializer {
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
positionsidx_write: CompositeWrite<WritePtr>,
field_stats_write: WritePtr,
schema: Schema,
) -> crate::Result<InvertedIndexSerializer> {
Ok(InvertedIndexSerializer {
@@ -68,18 +74,21 @@ impl InvertedIndexSerializer {
postings_write,
positions_write,
positionsidx_write,
field_stats: FieldStats::default(),
field_stats_write,
schema,
})
}
/// Opens a new `InvertedIndexSerializer` for the given segment
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
use crate::SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
use crate::SegmentComponent::{FIELDSTATS, POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
InvertedIndexSerializer::create(
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
CompositeWrite::wrap(segment.open_write(POSITIONSSKIP)?),
segment.open_write(FIELDSTATS)?,
segment.schema(),
)
}
@@ -94,6 +103,8 @@ impl InvertedIndexSerializer {
total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'_>> {
self.field_stats
.insert(field, FieldStat::new(total_num_tokens));
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -112,7 +123,10 @@ impl InvertedIndexSerializer {
}
/// Closes the serializer.
pub fn close(self) -> io::Result<()> {
pub fn close(mut self) -> io::Result<()> {
self.field_stats
.serialize(self.field_stats_write.get_mut())?;
self.field_stats_write.terminate()?;
self.terms_write.close()?;
self.postings_write.close()?;
self.positions_write.close()?;
@@ -142,7 +156,6 @@ impl<'a> FieldSerializer<'a> {
positionsidx_write: &'a mut CountingWriter<WritePtr>,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'a>> {
total_num_tokens.serialize(postings_write)?;
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => {
if let Some(text_indexing_options) = text_options.get_indexing_options() {
@@ -190,7 +203,8 @@ impl<'a> FieldSerializer<'a> {
.unwrap_or(0u64);
TermInfo {
doc_freq: 0,
postings_offset: self.postings_serializer.addr(),
postings_start_offset: self.postings_serializer.addr(),
postings_end_offset: 0u64,
positions_idx,
}
}
@@ -244,10 +258,12 @@ impl<'a> FieldSerializer<'a> {
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open {
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.postings_serializer
.close_term(self.current_term_info.doc_freq)?;
let end_offset = self.postings_serializer.addr();
self.current_term_info.postings_end_offset = end_offset;
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.term_open = false;
}
Ok(())

View File

@@ -7,35 +7,49 @@ use std::io;
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
/// Start offset within the postings (`.idx`) file.
pub postings_offset: u64,
/// Start offset of the posting list within the postings (`.idx`) file.
pub postings_start_offset: u64,
/// End offset of the posting list within the postings (`.idx`) file.
pub postings_end_offset: u64,
/// Start offset of the first block within the position (`.pos`) file.
pub positions_idx: u64,
}
impl TermInfo {
pub(crate) fn posting_num_bytes(&self) -> u32 {
let num_bytes = self.postings_end_offset - self.postings_start_offset;
assert!(num_bytes <= std::u32::MAX as u64);
num_bytes as u32
}
}
impl FixedSize for TermInfo {
/// Size required for the binary serialization of a `TermInfo` object.
/// This is large, but in practice, `TermInfo`s are encoded in blocks and
/// only the first `TermInfo` of a block is serialized uncompressed.
/// The subsequent `TermInfo`s are delta-encoded and bitpacked.
const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
const SIZE_IN_BYTES: usize = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
}
impl BinarySerializable for TermInfo {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
self.doc_freq.serialize(writer)?;
self.postings_offset.serialize(writer)?;
self.postings_start_offset.serialize(writer)?;
self.posting_num_bytes().serialize(writer)?;
self.positions_idx.serialize(writer)?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = u32::deserialize(reader)?;
let postings_offset = u64::deserialize(reader)?;
let postings_start_offset = u64::deserialize(reader)?;
let postings_num_bytes = u32::deserialize(reader)?;
let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes);
let positions_idx = u64::deserialize(reader)?;
Ok(TermInfo {
doc_freq,
postings_offset,
postings_start_offset,
postings_end_offset,
positions_idx,
})
}
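A quick standalone sanity check of the new layout (a sketch, not part of the patch): only the posting-list length is written, as a `u32`, so the end offset is reconstructed at read time and the record stays at 24 bytes.

fn main() {
    // Values picked arbitrarily for illustration.
    let (doc_freq, start, end, positions_idx) = (7u32, 1024u64, 1324u64, 42u64);
    let num_bytes = (end - start) as u32; // asserted to fit in a u32 by posting_num_bytes()
    // doc_freq (u32) + start offset (u64) + num_bytes (u32) + positions_idx (u64)
    let serialized_len = 4 + 8 + 4 + 8;
    assert_eq!(serialized_len, 24); // matches the new SIZE_IN_BYTES
    // The deserializer recovers the end offset without storing it explicitly.
    assert_eq!(start + u64::from(num_bytes), end);
    let _ = (doc_freq, positions_idx);
}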

View File

@@ -83,7 +83,7 @@ mod tests {
let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit().unwrap();

View File

@@ -5,7 +5,6 @@ use crate::query::{BitSetDocSet, Explanation};
use crate::query::{Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::Result;
use crate::TantivyError;
use crate::{DocId, Score};
use std::sync::Arc;
@@ -40,7 +39,7 @@ impl<A> Weight for AutomatonWeight<A>
where
A: Automaton + Send + Sync + 'static,
{
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field);
@@ -66,7 +65,7 @@ where
Ok(Box::new(const_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) == doc {
Ok(Explanation::new("AutomatonScorer", 1.0))
@@ -91,7 +90,7 @@ mod tests {
let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title=>"abc"));
index_writer.add_document(doc!(title=>"bcd"));
index_writer.add_document(doc!(title=>"abcd"));

View File

@@ -4,19 +4,6 @@ use crate::{DocId, DocSet, Score, TERMINATED};
use std::ops::Deref;
use std::ops::DerefMut;
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
/// Takes term scorers sorted by their current `doc()` and a threshold, and
/// returns `(before_pivot_len, pivot_len, pivot_doc)` defined as follows:
/// - `pivot_doc`: the lowest document that has a chance of exceeding (>) the threshold score.
@@ -55,37 +42,12 @@ fn find_pivot_doc(
Some((before_pivot_len, pivot_len, pivot_doc))
}
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
// Before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer(
scorers: &mut Vec<TermScorerWithMaxScore>,
pivot_len: usize,
) {
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
let mut scorer_to_seek = pivot_len - 1;
let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
for scorer_ord in (0..pivot_len - 1).rev() {
@@ -102,6 +64,7 @@ fn block_max_was_too_low_advance_one_scorer(
}
scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
restore_ordering(scorers, scorer_to_seek);
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
}
// Given a list of term_scorers and an `ord`, and assuming that `term_scorers[ord]` is sorted
@@ -177,64 +140,99 @@ pub fn block_wand(
.map(TermScorerWithMaxScore::from)
.collect();
scorers.sort_by_key(|scorer| scorer.doc());
loop {
// At this point we need to ensure that the scorers are sorted!
// At this point we need to ensure that the scorers are sorted!
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
while let Some((before_pivot_len, pivot_len, pivot_doc)) =
find_pivot_doc(&scorers[..], threshold)
{
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
if let Some((before_pivot_len, pivot_len, pivot_doc)) =
find_pivot_doc(&scorers[..], threshold)
{
debug_assert_ne!(pivot_doc, TERMINATED);
debug_assert!(before_pivot_len < pivot_len);
debug_assert_ne!(pivot_doc, TERMINATED);
debug_assert!(before_pivot_len < pivot_len);
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.shallow_seek(pivot_doc);
scorer.block_max_score()
})
.sum();
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.shallow_seek(pivot_doc);
scorer.block_max_score()
})
.sum();
// Beware: after a shallow advance, the skip readers can be ahead of
// the segment posting lists.
//
// `block_segment_postings.load_block()` needs to be called separately.
if block_max_score_upperbound <= threshold {
// The block max condition was not reached.
// We could get away with simply advancing the scorers to `DocId + 1`, but it would
// be inefficient. The optimization requires a proper explanation and was
// isolated in a different function.
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
continue;
}
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
} else {
return;
// Beware: after a shallow advance, the skip readers can be ahead of
// the segment posting lists.
//
// `block_segment_postings.load_block()` needs to be called separately.
if block_max_score_upperbound <= threshold {
// The block max condition was not reached.
// We could get away with simply advancing the scorers to `DocId + 1`, but it would
// be inefficient. The optimization requires a proper explanation and was
// isolated in a different function.
block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len);
continue;
}
// Block max condition is observed.
//
// Let's try and advance all scorers before the pivot to the pivot.
if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) {
// At least one of the scorers does not contain the pivot.
//
// Let's stop scoring this pivot and go through the pivot selection again.
// Note that the current pivot is not necessarily a bad candidate and it
// may be picked again.
continue;
}
// At this point, all scorers are positioned on the doc.
let score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| scorer.score())
.sum();
if score > threshold {
threshold = callback(pivot_doc, score);
}
// let's advance all of the scorers that are currently positioned on the pivot.
advance_all_scorers_on_pivot(&mut scorers, pivot_len);
}
}
struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
}
impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
fn from(scorer: &'a mut TermScorer) -> Self {
let max_score = scorer.max_score();
TermScorerWithMaxScore { scorer, max_score }
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
self.scorer
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
}
fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
if let Some(first) = it.next() {
let mut prev = first;
for doc in it {
if doc < prev {
return false;
}
prev = doc;
}
}
true
}
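To make the pivot selection above more concrete, here is a minimal, self-contained illustration; it does not use tantivy's `TermScorer`, and the doc/max-score pairs are invented for the example:

/// Returns the index of the pivot scorer, i.e. the first scorer (in doc order)
/// at which the running sum of max scores exceeds the threshold. Documents
/// before that scorer's doc cannot beat the threshold.
fn find_pivot(scorers: &[(u32, f32)], threshold: f32) -> Option<usize> {
    let mut upper_bound = 0.0f32;
    for (ord, &(_doc, max_score)) in scorers.iter().enumerate() {
        upper_bound += max_score;
        if upper_bound > threshold {
            return Some(ord);
        }
    }
    None
}

fn main() {
    // (current doc, max score) pairs, sorted by doc.
    let scorers = [(3u32, 1.2f32), (7, 0.8), (9, 2.5)];
    // 1.2 + 0.8 = 2.0 > 1.5, so the pivot is the scorer on doc 7:
    // no document before doc 7 can exceed the threshold.
    assert_eq!(find_pivot(&scorers, 1.5), Some(1));
    // With a threshold of 4.0, only the full sum (4.5) exceeds it,
    // so the pivot moves to the last scorer (doc 9).
    assert_eq!(find_pivot(&scorers, 4.0), Some(2));
}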
#[cfg(test)]
mod tests {
use crate::query::score_combiner::SumCombiner;
@@ -248,17 +246,21 @@ mod tests {
use std::iter;
struct Float(Score);
impl Eq for Float {}
impl PartialEq for Float {
fn eq(&self, other: &Self) -> bool {
self.cmp(&other) == Ordering::Equal
}
}
impl PartialOrd for Float {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Float {
fn cmp(&self, other: &Self) -> Ordering {
other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)

View File

@@ -32,7 +32,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"));
@@ -224,7 +224,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a b c"));
index_writer.add_document(doc!(text_field => "a c"));
index_writer.add_document(doc!(text_field => "b c"));

View File

@@ -144,7 +144,7 @@ mod tests {
fn test_boost_query_explain() {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::new());
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();

View File

@@ -177,7 +177,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
country_field => "japan",
));

View File

@@ -24,7 +24,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc);
@@ -135,7 +135,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b c"));
assert!(index_writer.commit().is_ok());
}
@@ -186,7 +186,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"b a"));
@@ -217,7 +217,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
assert!(index_writer.commit().is_ok());
}

View File

@@ -9,8 +9,8 @@ use crate::query::Weight;
use crate::query::{EmptyScorer, Explanation};
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Score;
use crate::{DocId, DocSet};
use crate::{Result, Score};
pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
@@ -32,7 +32,7 @@ impl PhraseWeight {
}
}
fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader {
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
let field = self.phrase_terms[0].1.field();
reader.get_fieldnorms_reader(field)
}
@@ -41,9 +41,9 @@ impl PhraseWeight {
&self,
reader: &SegmentReader,
boost: Score,
) -> Result<Option<PhraseScorer<SegmentPostings>>> {
) -> crate::Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight = self.similarity_weight.boost_by(boost);
let fieldnorm_reader = self.fieldnorm_reader(reader);
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
if reader.has_deletes() {
let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
@@ -85,7 +85,7 @@ impl PhraseWeight {
}
impl Weight for PhraseWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer))
} else {
@@ -93,7 +93,7 @@ impl Weight for PhraseWeight {
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));
@@ -102,7 +102,7 @@ impl Weight for PhraseWeight {
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
let fieldnorm_reader = self.fieldnorm_reader(reader);
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());

View File

@@ -9,7 +9,6 @@ use crate::query::{Query, Scorer, Weight};
use crate::schema::Type;
use crate::schema::{Field, IndexRecordOption, Term};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::Result;
use crate::{DocId, Score};
use std::collections::Bound;
use std::ops::Range;
@@ -48,7 +47,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// let schema = schema_builder.build();
///
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// for year in 1950u64..2017u64 {
/// let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
/// for _ in 0..num_docs_within_year {
@@ -246,7 +245,11 @@ impl RangeQuery {
}
impl Query for RangeQuery {
fn weight(&self, searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<dyn Weight>> {
fn weight(
&self,
searcher: &Searcher,
_scoring_enabled: bool,
) -> crate::Result<Box<dyn Weight>> {
let schema = searcher.schema();
let value_type = schema.get_field_entry(self.field).field_type().value_type();
if value_type != self.value_type {
@@ -289,7 +292,7 @@ impl RangeWeight {
}
impl Weight for RangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
@@ -315,7 +318,7 @@ impl Weight for RangeWeight {
Ok(Box::new(ConstScorer::new(doc_bitset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
@@ -342,7 +345,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for year in 1950u64..2017u64 {
let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
for _ in 0..num_docs_within_year {
@@ -485,7 +488,7 @@ mod tests {
schema_builder.add_i64_field("year", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
let mut index_writer = index.writer_for_tests()?;
let title = schema.get_field("title").unwrap();
let year = schema.get_field("year").unwrap();
index_writer.add_document(doc!(

View File

@@ -103,7 +103,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
country_field => "japan",
));

View File

@@ -25,7 +25,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
assert!(index_writer.commit().is_ok());
@@ -50,7 +50,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
let mut index_writer = index.writer_for_tests()?;
for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
@@ -86,7 +86,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde",
right_field => "right1 right2",
@@ -136,7 +136,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 5_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"a c"));
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -153,7 +153,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a"));
index_writer.commit()?;

View File

@@ -4,11 +4,10 @@ use crate::docset::DocSet;
use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight;
use crate::query::explanation::does_not_match;
use crate::query::weight::{for_each_pruning_scorer, for_each_scorer};
use crate::query::weight::for_each_scorer;
use crate::query::Weight;
use crate::query::{Explanation, Scorer};
use crate::schema::IndexRecordOption;
use crate::Result;
use crate::Term;
use crate::{DocId, Score};
@@ -19,12 +18,12 @@ pub struct TermWeight {
}
impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> Result<Box<dyn Scorer>> {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let term_scorer = self.specialized_scorer(reader, boost)?;
Ok(Box::new(term_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.specialized_scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
@@ -32,7 +31,7 @@ impl Weight for TermWeight {
Ok(scorer.explain())
}
fn count(&self, reader: &SegmentReader) -> Result<u32> {
fn count(&self, reader: &SegmentReader) -> crate::Result<u32> {
if let Some(delete_bitset) = reader.delete_bitset() {
Ok(self.scorer(reader, 1.0)?.count(delete_bitset))
} else {
@@ -73,8 +72,8 @@ impl Weight for TermWeight {
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let mut scorer = self.scorer(reader, 1.0)?;
for_each_pruning_scorer(&mut scorer, threshold, callback);
let scorer = self.specialized_scorer(reader, 1.0)?;
crate::query::boolean_query::block_wand(vec![scorer], threshold, callback);
Ok(())
}
}
@@ -96,10 +95,10 @@ impl TermWeight {
&self,
reader: &SegmentReader,
boost: Score,
) -> Result<TermScorer> {
) -> crate::Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
let similarity_weight = self.similarity_weight.boost_by(boost);
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option);

View File

@@ -398,9 +398,9 @@ mod bench {
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ConstScorer, Union, VecDocSet};
use crate::tests;
use crate::DocId;
use crate::DocSet;
use crate::{tests, TERMINATED};
use test::Bencher;
#[bench]
@@ -414,10 +414,12 @@ mod bench {
union_docset
.iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new)
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
);
while v.advance() {}
while v.doc() != TERMINATED {
v.advance();
}
});
}
#[bench]
@@ -432,10 +434,12 @@ mod bench {
union_docset
.iter()
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
.map(ConstScorer::new)
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),
);
while v.advance() {}
while v.doc() != TERMINATED {
v.advance();
}
});
}
}

View File

@@ -138,9 +138,11 @@ impl InnerIndexReader {
.collect::<crate::Result<_>>()?
};
let schema = self.index.schema();
let searchers = (0..self.num_searchers)
.map(|_| Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone()))
.collect();
let searchers = std::iter::repeat_with(|| {
Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
})
.take(self.num_searchers)
.collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())
}

View File

@@ -74,9 +74,8 @@ impl Document {
}
/// Add a text field.
pub fn add_text(&mut self, field: Field, text: &str) {
let value = Value::Str(String::from(text));
self.add(FieldValue::new(field, value));
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
self.add(FieldValue::new(field, Value::Str(text.to_string())));
}
/// Add a pre-tokenized text field.
@@ -110,8 +109,8 @@ impl Document {
}
/// Add a bytes field
pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
self.add(FieldValue::new(field, Value::Bytes(value)))
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add(FieldValue::new(field, Value::Bytes(value.into())))
}
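A short usage sketch of the relaxed signatures (the fields are invented for the example, and `add_bytes_field` on the schema builder is assumed to be available): both borrowed and owned values are accepted now that `add_text` takes any `S: ToString` and `add_bytes` any `T: Into<Vec<u8>>`.

use tantivy::schema::{Schema, TEXT};
use tantivy::Document;

fn main() {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let bytes_field = schema_builder.add_bytes_field("payload");
    let _schema = schema_builder.build();

    let mut doc = Document::default();
    doc.add_text(text_field, "a borrowed &str");
    doc.add_text(text_field, String::from("an owned String"));
    doc.add_bytes(bytes_field, vec![0u8, 1, 2]);
    doc.add_bytes(bytes_field, &b"a byte slice"[..]);
}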
/// Add a field value

View File

@@ -1,5 +1,5 @@
use crate::schema::IntOptions;
use crate::schema::TextOptions;
use crate::schema::{is_valid_field_name, IntOptions};
use crate::schema::FieldType;
use serde::de::{self, MapAccess, Visitor};
@@ -24,6 +24,7 @@ impl FieldEntry {
/// Creates a new text field entry in the schema, given
/// a name, and some options.
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::Str(text_options),
@@ -33,6 +34,7 @@ impl FieldEntry {
/// Creates a new u64 field entry in the schema, given
/// a name, and some options.
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::U64(field_type),
@@ -42,6 +44,7 @@ impl FieldEntry {
/// Creates a new i64 field entry in the schema, given
/// a name, and some options.
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::I64(field_type),
@@ -51,6 +54,7 @@ impl FieldEntry {
/// Creates a new f64 field entry in the schema, given
/// a name, and some options.
pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::F64(field_type),
@@ -60,6 +64,7 @@ impl FieldEntry {
/// Creates a new date field entry in the schema, given
/// a name, and some options.
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::Date(field_type),
@@ -68,6 +73,7 @@ impl FieldEntry {
/// Creates a field entry for a facet.
pub fn new_facet(field_name: String) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::HierarchicalFacet,
@@ -76,6 +82,7 @@ impl FieldEntry {
/// Creates a field entry for a bytes field
pub fn new_bytes(field_name: String) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type: FieldType::Bytes,
@@ -268,6 +275,12 @@ mod tests {
use crate::schema::TEXT;
use serde_json;
#[test]
#[should_panic]
fn test_invalid_field_name_should_panic() {
FieldEntry::new_text("-hello".to_string(), TEXT);
}
#[test]
fn test_json_serialization() {
let field_value = FieldEntry::new_text(String::from("title"), TEXT);

View File

@@ -149,14 +149,16 @@ pub use self::int_options::IntOptions;
use once_cell::sync::Lazy;
use regex::Regex;
/// Regular expression representing the restriction on valid field names.
pub const FIELD_NAME_PATTERN: &str = r#"^[_a-zA-Z][_\-a-zA-Z0-9]*$"#;
/// Validator for a potential `field_name`.
/// Returns true iff the name can be used as a field name.
///
/// A field name must start with a letter `[a-zA-Z]` or an underscore `_`.
/// The other characters can be any alphanumeric character `[a-zA-Z0-9]`, an underscore `_`, or a dash `-`.
pub fn is_valid_field_name(field_name: &str) -> bool {
static FIELD_NAME_PTN: Lazy<Regex> =
Lazy::new(|| Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap());
static FIELD_NAME_PTN: Lazy<Regex> = Lazy::new(|| Regex::new(FIELD_NAME_PATTERN).unwrap());
FIELD_NAME_PTN.is_match(field_name)
}
@@ -170,6 +172,11 @@ mod tests {
assert!(is_valid_field_name("text"));
assert!(is_valid_field_name("text0"));
assert!(!is_valid_field_name("0text"));
assert!(is_valid_field_name("field-name"));
assert!(is_valid_field_name("field_name"));
assert!(!is_valid_field_name("field!name"));
assert!(!is_valid_field_name("-fieldname"));
assert!(is_valid_field_name("_fieldname"));
assert!(!is_valid_field_name(""));
assert!(!is_valid_field_name("シャボン玉"));
assert!(is_valid_field_name("my_text_field"));

View File

@@ -4,7 +4,6 @@ use super::Field;
use crate::common;
use crate::schema::Facet;
use crate::DateTime;
use byteorder::{BigEndian, ByteOrder};
use std::str;
/// Size (in bytes) of the buffer of an int field.
@@ -19,6 +18,10 @@ where
B: AsRef<[u8]>;
impl Term {
pub(crate) fn new() -> Term {
Term(Vec::with_capacity(100))
}
/// Builds a term given a field, and an i64 value
///
/// Assuming the term has a field id of 1, and an i64 value of 3234,
@@ -93,6 +96,12 @@ impl Term {
term
}
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
let mut term = Term::for_field(field);
term.set_bytes(bytes);
term
}
/// Creates a new Term for a given field.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
@@ -100,12 +109,10 @@ impl Term {
term
}
/// Returns the field.
pub fn set_field(&mut self, field: Field) {
if self.0.len() < 4 {
self.0.resize(4, 0u8);
}
BigEndian::write_u32(&mut self.0[0..4], field.field_id());
pub(crate) fn set_field(&mut self, field: Field) {
self.0.clear();
self.0
.extend_from_slice(&field.field_id().to_be_bytes()[..]);
}
/// Sets a u64 value in the term.
@@ -116,7 +123,7 @@ impl Term {
/// the natural order of the values.
pub fn set_u64(&mut self, val: u64) {
self.0.resize(INT_TERM_LEN, 0u8);
BigEndian::write_u64(&mut self.0[4..], val);
self.0[4..12].copy_from_slice(val.to_be_bytes().as_ref());
}
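A tiny standalone check (not part of the patch) of why the big-endian representation is kept here: comparing the raw bytes yields the same order as comparing the integers, which a little-endian encoding would not preserve.

fn main() {
    let (a, b) = (3234u64, 70_000u64);
    assert!(a < b);
    // Big-endian byte strings compare in the same order as the integers...
    assert!(a.to_be_bytes() < b.to_be_bytes());
    // ...while little-endian byte strings do not.
    assert!(a.to_le_bytes() > b.to_le_bytes());
}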
/// Sets an `i64` value in the term.
@@ -134,12 +141,6 @@ impl Term {
self.0.extend(bytes);
}
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
let mut term = Term::for_field(field);
term.set_bytes(bytes);
term
}
/// Set the texts only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) {
self.set_bytes(text.as_bytes());
@@ -157,7 +158,9 @@ where
/// Returns the field.
pub fn field(&self) -> Field {
Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4]))
let mut field_id_bytes = [0u8; 4];
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
}
/// Returns the `u64` value stored in a term.
@@ -166,7 +169,9 @@ where
/// ... or returns an invalid value
/// if the term is not a `u64` field.
pub fn get_u64(&self) -> u64 {
BigEndian::read_u64(&self.0.as_ref()[4..])
let mut field_id_bytes = [0u8; 8];
field_id_bytes.copy_from_slice(self.value_bytes());
u64::from_be_bytes(field_id_bytes)
}
/// Returns the `i64` value stored in a term.
@@ -175,7 +180,7 @@ where
/// ... or returns an invalid value
/// if the term is not a `i64` field.
pub fn get_i64(&self) -> i64 {
common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
common::u64_to_i64(self.get_u64())
}
/// Returns the `f64` value stored in a term.
@@ -184,7 +189,7 @@ where
/// ... or returns an invalid value
/// if the term is not a `f64` field.
pub fn get_f64(&self) -> f64 {
common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..]))
common::u64_to_f64(self.get_u64())
}
/// Returns the text associated with the term.

View File

@@ -221,6 +221,12 @@ impl<'a> From<&'a str> for Value {
}
}
impl<'a> From<&'a [u8]> for Value {
fn from(bytes: &'a [u8]) -> Value {
Value::Bytes(bytes.to_vec())
}
}
impl<'a> From<Facet> for Value {
fn from(facet: Facet) -> Value {
Value::Facet(facet)
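A short usage note for the new impl, assuming the usual tantivy::schema::Value path: a byte slice now converts straight into Value::Bytes.

use tantivy::schema::Value;

let raw: &[u8] = b"\x00\x01\x02";
let value: Value = raw.into(); // goes through the new From<&'a [u8]> impl
assert!(matches!(value, Value::Bytes(_)));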


@@ -221,7 +221,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// # let text_field = schema_builder.add_text_field("text", TEXT);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
/// # Je ne me sentis plus guidé par les haleurs :
/// # Des Peaux-Rouges criards les avaient pris pour cibles,
@@ -506,7 +506,7 @@ Survey in 2016, 2017, and 2018."#;
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "a"));
index_writer.add_document(doc!(text_field => "a"));
index_writer.add_document(doc!(text_field => "a b"));
@@ -562,7 +562,7 @@ Survey in 2016, 2017, and 2018."#;
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
let doc = doc ! (text_field => TEST_TEXT);
index_writer.add_document(doc);


@@ -25,6 +25,8 @@ pub enum ComponentSpaceUsage {
Store(StoreSpaceUsage),
/// Some sort of raw byte count
Basic(ByteCount),
/// Space usage that is not yet tracked for this component.
Unimplemented,
}
/// Represents combined space usage of an entire searcher and its component segments.
@@ -119,7 +121,7 @@ impl SegmentSpaceUsage {
/// Clones the underlying data.
/// Use the components directly if this is somehow in performance critical code.
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
use self::ComponentSpaceUsage::*;
use self::ComponentSpaceUsage::{Basic, PerField, Store, Unimplemented};
use crate::SegmentComponent::*;
match component {
POSTINGS => PerField(self.postings().clone()),
@@ -130,6 +132,7 @@ impl SegmentSpaceUsage {
TERMS => PerField(self.termdict().clone()),
STORE => Store(self.store().clone()),
DELETE => Basic(self.deletes()),
FIELDSTATS => Unimplemented,
}
}
@@ -336,7 +339,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 10u64));
@@ -374,7 +377,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => "hi"));
index_writer.add_document(doc!(name => "this is a test"));
index_writer.add_document(
@@ -414,7 +417,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => "hi"));
index_writer.add_document(doc!(name => "this is a test"));
index_writer.add_document(
@@ -453,7 +456,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 3u64));


@@ -44,11 +44,13 @@ mod tests {
const BLOCK_SIZE: usize = 1_500;
fn make_term_info(val: u64) -> TermInfo {
fn make_term_info(term_ord: u64) -> TermInfo {
let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord;
TermInfo {
doc_freq: val as u32,
positions_idx: val * 2u64,
postings_offset: val * 3u64,
doc_freq: term_ord as u32,
postings_start_offset: offset(term_ord),
postings_end_offset: offset(term_ord + 1),
positions_idx: offset(term_ord) * 2u64,
}
}
@@ -138,7 +140,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
{
{
let mut doc = Document::default();
@@ -208,20 +210,14 @@ mod tests {
}
#[test]
fn test_stream_high_range_prefix_suffix() {
fn test_stream_high_range_prefix_suffix() -> std::io::Result<()> {
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
// term requires more than 16 bits
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
.unwrap();
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
.unwrap();
term_dictionary_builder
.insert("abr", &make_term_info(2))
.unwrap();
term_dictionary_builder.finish().unwrap()
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?;
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?;
term_dictionary_builder.insert("abr", &make_term_info(3))?;
term_dictionary_builder.finish()?
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
@@ -229,12 +225,15 @@ mod tests {
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(1));
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(2));
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abr".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(3));
assert!(!kv_stream.advance());
Ok(())
}
#[test]


@@ -57,21 +57,28 @@ impl TermInfoBlockMeta {
self.doc_freq_nbits + self.postings_offset_nbits + self.positions_idx_nbits
}
// Here inner_offset is the offset within the block, WITHOUT the first term_info.
// In other words, term_info #1, #2, #3 get inner_offset 0, 1, 2..., while term_info #0
// is encoded without bit-packing.
fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo {
assert!(inner_offset < BLOCK_LEN - 1);
let num_bits = self.num_bits() as usize;
let mut cursor = num_bits * inner_offset;
let postings_start_offset = extract_bits(data, cursor, self.postings_offset_nbits);
let postings_end_offset = self.ref_term_info.postings_start_offset
+ extract_bits(data, cursor + num_bits, self.postings_offset_nbits);
cursor += self.postings_offset_nbits as usize;
let doc_freq = extract_bits(data, cursor, self.doc_freq_nbits) as u32;
cursor += self.doc_freq_nbits as usize;
let postings_offset = extract_bits(data, cursor, self.postings_offset_nbits);
cursor += self.postings_offset_nbits as usize;
let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits);
TermInfo {
doc_freq,
postings_offset: postings_offset + self.ref_term_info.postings_offset,
postings_start_offset: postings_start_offset + self.ref_term_info.postings_start_offset,
postings_end_offset,
positions_idx: positions_idx + self.ref_term_info.positions_idx,
}
}
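The pattern here is that the block's first TermInfo is kept verbatim as a reference, later entries are bit-packed as deltas against it, and an entry's postings_end_offset is recovered from the next entry's start. The offset arithmetic can be illustrated without the bit-packing machinery; decode_start_offsets below is an invented name, not part of TermInfoBlockMeta:

// Editorial sketch: deltas[i] holds the start offset of entry i + 1 relative
// to the reference; the final delta doubles as the block-wide end offset.
// Each (start, end) pair is rebuilt by pairing consecutive absolute offsets.
fn decode_start_offsets(ref_start: u64, deltas: &[u64]) -> Vec<(u64, u64)> {
    let absolute: Vec<u64> = std::iter::once(ref_start)
        .chain(deltas.iter().map(|d| ref_start + d))
        .collect();
    absolute.windows(2).map(|w| (w[0], w[1])).collect()
}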
@@ -126,14 +133,13 @@ impl TermInfoStore {
.expect("Failed to deserialize terminfoblockmeta");
let inner_offset = (term_ord as usize) % BLOCK_LEN;
if inner_offset == 0 {
term_info_block_data.ref_term_info
} else {
let term_info_data = self.term_info_source.as_slice();
term_info_block_data.deserialize_term_info(
&term_info_data[term_info_block_data.offset as usize..],
inner_offset - 1,
)
return term_info_block_data.ref_term_info;
}
let term_info_data = self.term_info_source.as_slice();
term_info_block_data.deserialize_term_info(
&term_info_data[term_info_block_data.offset as usize..],
inner_offset - 1,
)
}
pub fn num_terms(&self) -> usize {
@@ -154,16 +160,17 @@ fn bitpack_serialize<W: Write>(
term_info_block_meta: &TermInfoBlockMeta,
term_info: &TermInfo,
) -> io::Result<()> {
bit_packer.write(
term_info.postings_start_offset,
term_info_block_meta.postings_offset_nbits,
write,
)?;
bit_packer.write(
u64::from(term_info.doc_freq),
term_info_block_meta.doc_freq_nbits,
write,
)?;
bit_packer.write(
term_info.postings_offset,
term_info_block_meta.postings_offset_nbits,
write,
)?;
bit_packer.write(
term_info.positions_idx,
term_info_block_meta.positions_idx_nbits,
@@ -183,23 +190,27 @@ impl TermInfoStoreWriter {
}
fn flush_block(&mut self) -> io::Result<()> {
if self.term_infos.is_empty() {
return Ok(());
}
let mut bit_packer = BitPacker::new();
let ref_term_info = self.term_infos[0].clone();
let last_term_info = if let Some(last_term_info) = self.term_infos.last().cloned() {
last_term_info
} else {
return Ok(());
};
let postings_end_offset =
last_term_info.postings_end_offset - ref_term_info.postings_start_offset;
for term_info in &mut self.term_infos[1..] {
term_info.postings_offset -= ref_term_info.postings_offset;
term_info.postings_start_offset -= ref_term_info.postings_start_offset;
term_info.positions_idx -= ref_term_info.positions_idx;
}
let mut max_doc_freq: u32 = 0u32;
let mut max_postings_offset: u64 = 0u64;
let mut max_positions_idx: u64 = 0u64;
let max_postings_offset: u64 = postings_end_offset;
let max_positions_idx: u64 = last_term_info.positions_idx;
for term_info in &self.term_infos[1..] {
max_doc_freq = cmp::max(max_doc_freq, term_info.doc_freq);
max_postings_offset = cmp::max(max_postings_offset, term_info.postings_offset);
max_positions_idx = cmp::max(max_positions_idx, term_info.positions_idx);
}
let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq));
@@ -224,6 +235,12 @@ impl TermInfoStoreWriter {
)?;
}
bit_packer.write(
postings_end_offset,
term_info_block_meta.postings_offset_nbits,
&mut self.buffer_term_infos,
)?;
// The block needs to end on a byte boundary.
bit_packer.flush(&mut self.buffer_term_infos)?;
self.term_infos.clear();
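On the write side the same convention means flush_block only needs one packed start offset per entry plus a single block-level end offset. A companion sketch of that normalization against the reference entry (again with invented names, no bit-packing):

// Editorial sketch, inverse of decode_start_offsets above: keep the first
// start offset as the reference, store every later start as a delta, and
// append the end offset of the last entry as the trailing delta.
fn encode_start_offsets(offsets: &[(u64, u64)]) -> (u64, Vec<u64>) {
    assert!(!offsets.is_empty());
    let reference = offsets[0].0;
    let mut deltas: Vec<u64> = offsets[1..]
        .iter()
        .map(|(start, _)| start - reference)
        .collect();
    deltas.push(offsets.last().unwrap().1 - reference);
    (reference, deltas)
}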
@@ -232,6 +249,7 @@ impl TermInfoStoreWriter {
}
pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> {
assert!(term_info.postings_end_offset >= term_info.postings_start_offset);
self.num_terms += 1u64;
self.term_infos.push(term_info.clone());
if self.term_infos.len() >= BLOCK_LEN {
@@ -291,10 +309,11 @@ mod tests {
#[test]
fn test_term_info_block_meta_serialization() {
let term_info_block_meta = TermInfoBlockMeta {
offset: 2009,
offset: 2009u64,
ref_term_info: TermInfo {
doc_freq: 512,
postings_offset: 51,
postings_start_offset: 51,
postings_end_offset: 57u64,
positions_idx: 3584,
},
doc_freq_nbits: 10,
@@ -312,10 +331,12 @@ mod tests {
fn test_pack() {
let mut store_writer = TermInfoStoreWriter::new();
let mut term_infos = vec![];
let offset = |i| (i * 13 + i * i) as u64;
for i in 0..1000 {
let term_info = TermInfo {
doc_freq: i as u32,
postings_offset: (i / 10) as u64,
postings_start_offset: offset(i),
postings_end_offset: offset(i + 1),
positions_idx: (i * 7) as u64,
};
store_writer.write_term_info(&term_info).unwrap();
@@ -325,7 +346,12 @@ mod tests {
store_writer.serialize(&mut buffer).unwrap();
let term_info_store = TermInfoStore::open(&ReadOnlySource::from(buffer));
for i in 0..1000 {
assert_eq!(term_info_store.get(i as u64), term_infos[i]);
assert_eq!(
term_info_store.get(i as u64),
term_infos[i],
"term info {}",
i
);
}
}
}