Compare commits

...

13 Commits

Author SHA1 Message Date
Paul Masurel
063ed30f66 Added field norm readers 2020-07-20 11:59:43 +09:00
Paul Masurel
6db8bb49d6 Assert nearly equals macro (#853)
* Assert nearly equals macro

* Renamed specialized_scorer in TermScorer
2020-07-17 16:40:41 +09:00
lyj
410aed0176 Update segment_updater.rs (#848) 2020-07-16 12:33:11 +09:00
aptend
00a239a712 fix typo in index_meta.rs (#851) 2020-07-16 12:32:45 +09:00
Paul Masurel
68fe406924 Removed asserts (#850) 2020-07-16 12:24:55 +09:00
Paul Masurel
f71b04acb0 Bugfix. (#849)
go_to_first_doc was typically calling seek with a target smaller than
doc.

Since SegmentPostings typically does a linear search on the full block,
regardless of the current position, this could cause the segment postings to
go backward.
2020-07-16 10:57:51 +09:00
lyj
1ab7f660a4 Update index.rs (#846) 2020-07-02 15:11:38 +09:00
Sean Stangl
0ebbc4cb5a Fix incorrect SimpleTokenizer link in documentation (#844) 2020-07-01 10:26:36 +09:00
lyj
5300cb5da0 Update mod.rs (#845) 2020-07-01 10:25:26 +09:00
Ype Kingma
7d773abc92 Boolean query: do not combine excluded scores. (#840)
* Do nothing when combining score values of excluded scores.

* Add test case for two excluded.

* Test score for two excluded terms.

* Use TopDocs in test_boolean_query_two_excluded
2020-06-08 20:01:19 +09:00
Paul Masurel
c34541ccce Alive doc iterator. (#837) 2020-06-05 19:42:51 +09:00
Paul Masurel
1cc5bd706c Fixes build for no-default-features (#839) 2020-06-05 19:41:55 +09:00
Paul Masurel
4026d183bc Small readability change 2020-06-03 09:04:57 +09:00
38 changed files with 442 additions and 249 deletions

View File

@@ -117,11 +117,16 @@ fn main() -> tantivy::Result<()> {
if let Some(mut block_segment_postings) =
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
{
while block_segment_postings.advance() {
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
// Once again these docs MAY contain deleted documents as well.
let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].`
println!("Docs {:?}", docs);
block_segment_postings.advance();
}
}
}
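
The example above now loops until `docs()` returns an empty block instead of testing the return value of `advance()`. Below is a minimal standalone sketch of that contract, using a hypothetical `ToyBlockCursor` rather than tantivy's real `BlockSegmentPostings`:

struct ToyBlockCursor {
    blocks: Vec<Vec<u32>>, // pre-decoded blocks of doc ids
    pos: usize,
}

impl ToyBlockCursor {
    fn docs(&self) -> &[u32] {
        self.blocks.get(self.pos).map(Vec::as_slice).unwrap_or(&[])
    }
    fn advance(&mut self) {
        self.pos += 1;
    }
}

fn main() {
    let mut cursor = ToyBlockCursor { blocks: vec![vec![0, 2], vec![5, 7]], pos: 0 };
    loop {
        let docs = cursor.docs();
        if docs.is_empty() {
            break; // an empty block signals exhaustion
        }
        println!("Docs {:?}", docs);
        cursor.advance();
    }
}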

View File

@@ -24,8 +24,10 @@ use crate::IndexWriter;
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::fmt;
#[cfg(feature = "mmap")]
use std::path::{Path, PathBuf};
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
fn load_metas(
@@ -281,7 +283,7 @@ impl Index {
TantivyError::LockFailure(
err,
Some(
"Failed to acquire index lock. If you are using\
"Failed to acquire index lock. If you are using \
a regular directory, this means there is already an \
`IndexWriter` working on this `Directory`, in this process \
or in a different process."

View File

@@ -213,7 +213,7 @@ pub struct IndexMeta {
#[serde(skip_serializing_if = "Option::is_none")]
/// Payload associated to the last commit.
///
/// Upon commit, clients can optionally add a small `Striing` payload to their commit
/// Upon commit, clients can optionally add a small `String` payload to their commit
/// to help identify this commit.
/// This payload is entirely unused by tantivy.
pub payload: Option<String>,

View File

@@ -8,7 +8,7 @@ use crate::directory::ReadOnlySource;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
@@ -48,7 +48,7 @@ pub struct SegmentReader {
positions_composite: CompositeFile,
positions_idx_composite: CompositeFile,
fast_fields_readers: Arc<FastFieldReaders>,
fieldnorms_composite: CompositeFile,
fieldnorm_readers: FieldNormReaders,
store_source: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>,
@@ -126,8 +126,8 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
FieldNormReader::open(fieldnorm_source)
if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
fieldnorm_reader
} else {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
@@ -178,8 +178,8 @@ impl SegmentReader {
let fast_field_readers =
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?;
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -195,7 +195,7 @@ impl SegmentReader {
termdict_composite,
postings_composite,
fast_fields_readers: fast_field_readers,
fieldnorms_composite,
fieldnorm_readers,
segment_id: segment.id(),
store_source,
delete_bitset_opt,
@@ -295,8 +295,8 @@ impl SegmentReader {
}
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
SegmentReaderAliveDocsIterator::new(&self)
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
}
/// Summarize total space usage of this segment.
@@ -308,7 +308,7 @@ impl SegmentReader {
self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(),
self.fast_fields_readers.space_usage(),
self.fieldnorms_composite.space_usage(),
self.fieldnorm_readers.space_usage(),
self.get_store_reader().space_usage(),
self.delete_bitset_opt
.as_ref()
@@ -324,52 +324,6 @@ impl fmt::Debug for SegmentReader {
}
}
/// Implements the iterator trait to allow easy iteration
/// over non-deleted ("alive") DocIds in a SegmentReader
pub struct SegmentReaderAliveDocsIterator<'a> {
reader: &'a SegmentReader,
max_doc: DocId,
current: DocId,
}
impl<'a> SegmentReaderAliveDocsIterator<'a> {
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
SegmentReaderAliveDocsIterator {
reader,
max_doc: reader.max_doc(),
current: 0,
}
}
}
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
type Item = DocId;
fn next(&mut self) -> Option<Self::Item> {
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
if self.current >= self.max_doc {
return None;
}
// find the next alive doc id
while self.reader.is_deleted(self.current) {
self.current += 1;
if self.current >= self.max_doc {
return None;
}
}
// capture the current alive DocId
let result = Some(self.current);
// move down the chain
self.current += 1;
result
}
}
#[cfg(test)]
mod test {
use crate::core::Index;
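
The hand-written `SegmentReaderAliveDocsIterator` above is replaced by a plain range-plus-filter iterator. A self-contained sketch of the same pattern, with a `HashSet` standing in for tantivy's delete bitset:

use std::collections::HashSet;

// Alive doc ids are produced by filtering 0..max_doc against the delete set.
fn doc_ids_alive<'a>(max_doc: u32, deleted: &'a HashSet<u32>) -> impl Iterator<Item = u32> + 'a {
    (0u32..max_doc).filter(move |doc| !deleted.contains(doc))
}

fn main() {
    let deleted: HashSet<u32> = [1u32, 3].iter().copied().collect();
    let alive: Vec<u32> = doc_ids_alive(5, &deleted).collect();
    assert_eq!(alive, vec![0, 2, 4]);
}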

View File

@@ -38,6 +38,7 @@ pub trait DocSet {
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a DocSet.
fn seek(&mut self, target: DocId) -> DocId {
let mut doc = self.doc();
debug_assert!(doc <= target);
while doc < target {
doc = self.advance();
}
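
The new `debug_assert!(doc <= target)` documents that the default `seek` only moves forward. A toy sketch of that contract, using a simplified `ToyDocSet` trait rather than tantivy's:

const TERMINATED: u32 = u32::MAX;

trait ToyDocSet {
    fn doc(&self) -> u32;
    fn advance(&mut self) -> u32;
    fn seek(&mut self, target: u32) -> u32 {
        let mut doc = self.doc();
        // Seeking behind the cursor is a bug; surface it in debug builds.
        debug_assert!(doc <= target, "seek target must not be behind the cursor");
        while doc < target {
            doc = self.advance();
        }
        doc
    }
}

struct ToyVecDocSet { docs: Vec<u32>, cursor: usize }

impl ToyDocSet for ToyVecDocSet {
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn advance(&mut self) -> u32 {
        self.cursor += 1;
        self.doc()
    }
}

fn main() {
    let mut docset = ToyVecDocSet { docs: vec![1, 4, 9], cursor: 0 };
    assert_eq!(docset.seek(4), 4);
    assert_eq!(docset.seek(5), 9); // lands on the first doc >= target
}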

View File

@@ -21,7 +21,7 @@ mod reader;
mod serializer;
mod writer;
pub use self::reader::FieldNormReader;
pub use self::reader::{FieldNormReader, FieldNormReaders};
pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter;

View File

@@ -1,6 +1,41 @@
use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::ReadOnlySource;
use crate::schema::Field;
use crate::space_usage::PerFieldSpaceUsage;
use crate::DocId;
use std::sync::Arc;
/// Reader for the fieldnorm (for each document, the number of tokens indexed in the
/// field) of all indexed fields in the index.
///
Each fieldnorm is compressed to approximately one byte. We refer to this byte as
the `fieldnorm_id`.
The mapping from `fieldnorm` to `fieldnorm_id` is given by a monotonic function.
#[derive(Clone)]
pub struct FieldNormReaders {
data: Arc<CompositeFile>,
}
impl FieldNormReaders {
/// Creates a field norm reader.
pub fn new(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
let data = CompositeFile::open(&source)?;
Ok(FieldNormReaders {
data: Arc::new(data),
})
}
/// Returns the FieldNormReader for a specific field.
pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
self.data.open_read(field).map(FieldNormReader::open)
}
/// Return a break down of the space usage per field.
pub fn space_usage(&self) -> PerFieldSpaceUsage {
self.data.space_usage()
}
}
/// Reads the fieldnorm associated to a document.
/// The fieldnorm represents the length associated to
@@ -19,6 +54,7 @@ use crate::DocId;
/// Apart from compression, this scale also makes it possible to
/// precompute computationally expensive functions of the fieldnorm
/// in a very short array.
#[derive(Clone)]
pub struct FieldNormReader {
data: ReadOnlySource,
}
@@ -29,6 +65,11 @@ impl FieldNormReader {
FieldNormReader { data }
}
/// Returns the number of documents in this segment.
pub fn num_docs(&self) -> u32 {
self.data.len() as u32
}
/// Returns the `fieldnorm` associated to a doc id.
/// The fieldnorm is a value approximating the number
/// of tokens in a given field of the `doc_id`.
@@ -65,10 +106,11 @@ impl FieldNormReader {
}
#[cfg(test)]
impl From<Vec<u32>> for FieldNormReader {
fn from(field_norms: Vec<u32>) -> FieldNormReader {
impl From<&[u32]> for FieldNormReader {
fn from(field_norms: &[u32]) -> FieldNormReader {
let field_norms_id = field_norms
.into_iter()
.iter()
.cloned()
.map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>();
let field_norms_data = ReadOnlySource::from(field_norms_id);
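
To illustrate the docstring above, here is a toy sketch of a monotonic one-byte fieldnorm encoding; the lookup table is made up and far smaller than the one tantivy actually uses:

// Each fieldnorm (token count) is quantized to a single byte through a
// monotonic mapping, so comparisons on ids agree with comparisons on values.
const TOY_TABLE: [u32; 6] = [0, 1, 2, 4, 8, 16];

fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    // Largest id whose decoded value does not exceed the fieldnorm.
    match TOY_TABLE.binary_search(&fieldnorm) {
        Ok(idx) => idx as u8,
        Err(idx) => (idx - 1) as u8,
    }
}

fn id_to_fieldnorm(id: u8) -> u32 {
    TOY_TABLE[id as usize]
}

fn main() {
    assert_eq!(fieldnorm_to_id(5), 3); // 5 tokens rounds down to 4
    assert_eq!(id_to_fieldnorm(3), 4);
    // Monotonic: larger fieldnorms never map to smaller ids.
    assert!(fieldnorm_to_id(3) <= fieldnorm_to_id(9));
}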

View File

@@ -78,11 +78,12 @@ impl FieldNormsWriter {
}
/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
}
fieldnorms_serializer.close()?;
Ok(())
}
}

View File

@@ -167,7 +167,7 @@ impl IndexMerger {
fn write_fieldnorms(
&self,
fieldnorms_serializer: &mut FieldNormsSerializer,
mut fieldnorms_serializer: FieldNormsSerializer,
) -> crate::Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
@@ -182,6 +182,7 @@ impl IndexMerger {
}
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
}
fieldnorms_serializer.close()?;
Ok(())
}
@@ -589,48 +590,45 @@ impl IndexMerger {
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
if !segment_postings.is_empty() {
// If not, the `term` will be entirely removed.
// We know that there is at least one document containing
// the term, so we add it.
let to_term_ord = field_serializer.new_term(term_bytes)?;
if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
}
}
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
let mut doc = segment_postings.doc();
while doc != TERMINATED {
// deleted doc are skipped as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term iff
// there is at least one document.
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(
remapped_doc_id,
term_freq,
delta_positions,
)?;
}
doc = segment_postings.advance();
}
}
// closing the term.
field_serializer.close_term()?;
if segment_postings.is_empty() {
continue;
}
// If not, the `term` will be entirely removed.
// We know that there is at least one document containing
// the term, so we add it.
let to_term_ord = field_serializer.new_term(term_bytes)?;
if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
}
}
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
let mut doc = segment_postings.doc();
while doc != TERMINATED {
// deleted doc are skipped as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term iff
// there is at least one document.
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions)?;
}
doc = segment_postings.advance();
}
}
// closing the term.
field_serializer.close_term()?;
}
field_serializer.close()?;
Ok(term_ord_mapping_opt)
@@ -671,8 +669,10 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
self.write_fieldnorms(fieldnorms_serializer)?;
}
let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?;
@@ -1507,12 +1507,9 @@ mod tests {
for i in 0..100 {
let mut doc = Document::new();
doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27);
writer.add_document(doc);
if i % 5 == 0 {
writer.commit()?;
}
@@ -1524,7 +1521,6 @@ mod tests {
// If a merging thread fails, we should end up with more
// than one segment here
assert_eq!(1, index.searchable_segments()?.len());
Ok(())
}
}

View File

@@ -8,15 +8,16 @@ use crate::store::StoreWriter;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
pub struct SegmentSerializer {
segment: Segment,
store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: FieldNormsSerializer,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(segment: &mut Segment) -> crate::Result<SegmentSerializer> {
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
let store_write = segment.open_write(SegmentComponent::STORE)?;
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
@@ -25,15 +26,21 @@ impl SegmentSerializer {
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
let postings_serializer = InvertedIndexSerializer::open(segment)?;
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write),
fast_field_serializer,
fieldnorms_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
})
}
#[allow(dead_code)]
pub fn segment(&self) -> &Segment {
&self.segment
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer
@@ -44,9 +51,11 @@ impl SegmentSerializer {
&mut self.fast_field_serializer
}
/// Accessor to the field norm serializer.
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
&mut self.fieldnorms_serializer
/// Extract the field norm serializer.
///
/// Note the fieldnorms serializer can only be extracted once.
pub fn extract_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
self.fieldnorms_serializer.take()
}
/// Accessor to the `StoreWriter`.
@@ -55,11 +64,13 @@ impl SegmentSerializer {
}
/// Finalize the segment serialization.
pub fn close(self) -> crate::Result<()> {
pub fn close(mut self) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?;
self.postings_serializer.close()?;
self.store_writer.close()?;
self.fieldnorms_serializer.close()?;
Ok(())
}
}
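
The fieldnorms serializer is now stored as an `Option` and handed out at most once via `Option::take`. A self-contained sketch of that ownership pattern, with stand-in types (`ToySegmentSerializer`, `ToyFieldNormsSerializer`) rather than tantivy's:

struct ToyFieldNormsSerializer;

impl ToyFieldNormsSerializer {
    fn close(self) -> std::io::Result<()> {
        println!("fieldnorms closed");
        Ok(())
    }
}

struct ToySegmentSerializer {
    fieldnorms_serializer: Option<ToyFieldNormsSerializer>,
}

impl ToySegmentSerializer {
    fn extract_fieldnorms_serializer(&mut self) -> Option<ToyFieldNormsSerializer> {
        self.fieldnorms_serializer.take() // can only succeed once
    }
    fn close(mut self) -> std::io::Result<()> {
        // Only finalize the fieldnorms if nobody extracted them earlier.
        if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
            fieldnorms_serializer.close()?;
        }
        Ok(())
    }
}

fn main() -> std::io::Result<()> {
    let mut serializer = ToySegmentSerializer { fieldnorms_serializer: Some(ToyFieldNormsSerializer) };
    // The writer path takes the serializer by value and is responsible for closing it.
    if let Some(fieldnorms) = serializer.extract_fieldnorms_serializer() {
        fieldnorms.close()?;
    }
    assert!(serializer.extract_fieldnorms_serializer().is_none());
    serializer.close()
}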

View File

@@ -112,7 +112,7 @@ fn merge(
target_opstamp: Opstamp,
) -> crate::Result<SegmentEntry> {
// first we need to apply deletes to our segment.
let mut merged_segment = index.new_segment();
let merged_segment = index.new_segment();
// First we apply all of the deletes to the merged segment, up to the target opstamp.
for segment_entry in &mut segment_entries {
@@ -131,12 +131,13 @@ fn merge(
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
// ... we just serialize this index merger in our new segment to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
let num_docs = merger.write(segment_serializer)?;
let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
let merged_segment_id = merged_segment.id();
let segment_meta = index.new_segment_meta(merged_segment_id, num_docs);
Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
}
@@ -521,7 +522,7 @@ impl SegmentUpdater {
///
/// Upon termination of the current merging threads,
/// merge opportunity may appear.
//
///
/// We keep waiting until the merge policy judges that
/// no opportunity is available.
///

View File

@@ -62,11 +62,12 @@ impl SegmentWriter {
/// - schema
pub fn for_segment(
memory_budget: usize,
mut segment: Segment,
segment: Segment,
schema: &Schema,
) -> crate::Result<SegmentWriter> {
let tokenizer_manager = segment.index().tokenizers().clone();
let table_num_bits = initial_table_size(memory_budget)?;
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let segment_serializer = SegmentSerializer::for_segment(segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let tokenizers = schema
.fields()
@@ -76,7 +77,7 @@ impl SegmentWriter {
.get_indexing_options()
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
segment.index().tokenizers().get(tokenizer_name)
tokenizer_manager.get(tokenizer_name)
}),
_ => None,
},
@@ -280,9 +281,11 @@ fn write(
fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer,
) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
fieldnorms_writer.serialize(fieldnorms_serializer)?;
}
let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
serializer.close()?;
Ok(())
}

View File

@@ -298,17 +298,26 @@ mod tests {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
pub fn assert_nearly_equals(expected: f32, val: f32) {
assert!(
nearly_equals(val, expected),
"Got {}, expected {}.",
val,
expected
);
}
pub fn nearly_equals(a: f32, b: f32) -> bool {
(a - b).abs() < 0.0005 * (a + b).abs()
/// Checks if left and right are close to each other.
/// Panics if the two values are more than 0.5% apart.
#[macro_export]
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{
match (&$left, &$right) {
(left_val, right_val) => {
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
panic!(
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
)
}
}
}
}};
}
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
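
For call sites, the change is from the `assert_nearly_equals(a, b)` helper function to the `assert_nearly_equals!(a, b)` macro. A standalone sketch of the same relative-tolerance check and how it is used:

// Simplified standalone copy of the check above, for illustration only.
macro_rules! assert_nearly_equals {
    ($left:expr, $right:expr) => {{
        let (left_val, right_val) = ($left, $right);
        let diff = (left_val - right_val).abs();
        let add = left_val.abs() + right_val.abs();
        if diff > 0.0005 * add {
            panic!("assertion failed: `(left ~= right)` left: `{:?}`, right: `{:?}`", left_val, right_val);
        }
    }};
}

fn main() {
    let score: f32 = 0.6931473;
    assert_nearly_equals!(score, 0.6931472f32); // passes: well within the macro's tolerance
}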

View File

@@ -47,7 +47,6 @@ fn decode_vint_block(
doc_offset: DocId,
num_vint_docs: usize,
) {
doc_decoder.clear();
let num_consumed_bytes = doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs);
if let Some(freq_decoder) = freq_decoder_opt {
freq_decoder.uncompress_vint_unsorted(&data[num_consumed_bytes..], num_vint_docs);
@@ -99,7 +98,7 @@ impl BlockSegmentPostings {
data: postings_data,
skip_reader,
};
block_segment_postings.advance();
block_segment_postings.load_block();
block_segment_postings
}
@@ -117,13 +116,13 @@ impl BlockSegmentPostings {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
self.data = ReadOnlySource::new(postings_data);
self.loaded_offset = std::usize::MAX;
self.loaded_offset = std::usize::MAX;
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq);
} else {
self.skip_reader.reset(ReadOnlySource::empty(), doc_freq);
}
self.doc_freq = doc_freq as usize;
self.load_block();
}
/// Returns the document frequency associated to this block postings.
@@ -215,6 +214,10 @@ impl BlockSegmentPostings {
);
}
BlockInfo::VInt(num_vint_docs) => {
self.doc_decoder.clear();
if num_vint_docs == 0 {
return;
}
decode_vint_block(
&mut self.doc_decoder,
if let FreqReadingOption::ReadFreq = self.freq_reading_option {
@@ -233,12 +236,9 @@ impl BlockSegmentPostings {
/// Advance to the next block.
///
/// Returns false iff there was no remaining blocks.
pub fn advance(&mut self) -> bool {
if !self.skip_reader.advance() {
return false;
}
pub fn advance(&mut self) {
self.skip_reader.advance();
self.load_block();
true
}
/// Returns an empty segment postings object
@@ -294,7 +294,8 @@ mod tests {
#[test]
fn test_empty_block_segment_postings() {
let mut postings = BlockSegmentPostings::empty();
assert!(!postings.advance());
postings.advance();
assert!(postings.docs().is_empty());
assert_eq!(postings.doc_freq(), 0);
}
@@ -306,13 +307,14 @@ mod tests {
assert_eq!(block_segments.doc_freq(), 100_000);
loop {
let block = block_segments.docs();
if block.is_empty() {
break;
}
for (i, doc) in block.iter().cloned().enumerate() {
assert_eq!(offset + (i as u32), doc);
}
offset += block.len() as u32;
if block_segments.advance() {
break;
}
block_segments.advance();
}
}
@@ -421,7 +423,6 @@ mod tests {
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
}
assert!(block_segments.advance());
assert_eq!(block_segments.docs(), &[1, 3, 5]);
}
}

View File

@@ -109,6 +109,7 @@ impl BlockDecoder {
}
pub fn clear(&mut self) {
self.output_len = 0;
self.output.0.iter_mut().for_each(|el| *el = TERMINATED);
}
}
@@ -244,6 +245,19 @@ pub mod tests {
}
}
#[test]
fn test_clearing() {
let mut encoder = BlockEncoder::new();
let vals = (0u32..128u32).map(|i| i * 3).collect::<Vec<_>>();
let (num_bits, compressed) = encoder.compress_block_sorted(&vals[..], 0u32);
let mut decoder = BlockDecoder::default();
decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
assert_eq!(decoder.output_len, 128);
assert_eq!(decoder.output_array(), &vals[..]);
decoder.clear();
assert!(decoder.output_array().is_empty());
}
#[test]
fn test_encode_unsorted_block_with_junk() {
let mut compressed: Vec<u8> = Vec::new();
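
`clear()` now also fills the decoder's output buffer with `TERMINATED`. A toy sketch of why that padding matters: a scan for the first doc greater than or equal to a target always stops inside the buffer (sizes and types here are simplified stand-ins):

const TERMINATED: u32 = u32::MAX;
const BLOCK_SIZE: usize = 8; // tantivy uses 128; shrunk here for the example

struct ToyBlockDecoder {
    output: [u32; BLOCK_SIZE],
    output_len: usize,
}

impl ToyBlockDecoder {
    fn new() -> Self {
        ToyBlockDecoder { output: [TERMINATED; BLOCK_SIZE], output_len: 0 }
    }
    fn clear(&mut self) {
        self.output_len = 0;
        self.output.iter_mut().for_each(|el| *el = TERMINATED);
    }
    fn load(&mut self, docs: &[u32]) {
        self.clear();
        self.output[..docs.len()].copy_from_slice(docs);
        self.output_len = docs.len();
    }
    fn search(&self, target: u32) -> usize {
        // Always finds an index because the padding is TERMINATED (= u32::MAX).
        self.output.iter().position(|&doc| doc >= target).unwrap()
    }
}

fn main() {
    let mut decoder = ToyBlockDecoder::new();
    decoder.load(&[3, 7, 12]);
    assert_eq!(decoder.search(8), 2);   // lands on 12
    assert_eq!(decoder.search(100), 3); // lands on the TERMINATED padding
    decoder.clear();
    assert_eq!(decoder.output_len, 0);
}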

View File

@@ -582,6 +582,9 @@ pub mod tests {
) {
for target in targets {
let mut postings_opt = postings_factory();
if target < postings_opt.doc() {
continue;
}
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
let skip_result_opt = postings_opt.seek(target);
let skip_result_unopt = postings_unopt.seek(target);

View File

@@ -100,14 +100,15 @@ impl DocSet for SegmentPostings {
}
fn seek(&mut self, target: DocId) -> DocId {
if self.doc() == target {
return target;
debug_assert!(self.doc() <= target);
if self.doc() >= target {
return self.doc();
}
self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document.
let output = self.block_cursor.docs_aligned();
self.cur = self.block_searcher.search_in_block(&output, target);
// The last block is not full and padded with the value TERMINATED,
@@ -123,6 +124,7 @@ impl DocSet for SegmentPostings {
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output.0[self.cur];
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
}

View File

@@ -81,25 +81,41 @@ impl Default for BlockInfo {
impl SkipReader {
pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader {
SkipReader {
last_doc_in_block: 0u32,
let mut skip_reader = SkipReader {
last_doc_in_block: if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
0
} else {
TERMINATED
},
last_doc_in_previous_block: 0u32,
owned_read: OwnedRead::new(data),
skip_info,
block_info: BlockInfo::default(),
block_info: BlockInfo::VInt(doc_freq),
byte_offset: 0,
remaining_docs: doc_freq,
position_offset: 0u64,
};
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
skip_reader.read_block_info();
}
skip_reader
}
pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) {
self.last_doc_in_block = 0u32;
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
0
} else {
TERMINATED
};
self.last_doc_in_previous_block = 0u32;
self.owned_read = OwnedRead::new(data);
self.block_info = BlockInfo::default();
self.block_info = BlockInfo::VInt(doc_freq);
self.byte_offset = 0;
self.remaining_docs = doc_freq;
self.position_offset = 0u64;
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info();
}
}
#[cfg(test)]
@@ -165,7 +181,7 @@ impl SkipReader {
}
}
pub fn advance(&mut self) -> bool {
pub fn advance(&mut self) {
match self.block_info {
BlockInfo::BitPacked {
doc_num_bits,
@@ -177,17 +193,17 @@ impl SkipReader {
self.position_offset += tf_sum as u64;
}
BlockInfo::VInt(num_vint_docs) => {
self.remaining_docs -= num_vint_docs;
debug_assert_eq!(num_vint_docs, self.remaining_docs);
self.remaining_docs = 0;
self.byte_offset = std::usize::MAX;
}
}
self.last_doc_in_previous_block = self.last_doc_in_block;
if self.remaining_docs >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info();
true
} else {
self.last_doc_in_block = TERMINATED;
self.block_info = BlockInfo::VInt(self.remaining_docs);
self.remaining_docs > 0
}
}
}
@@ -217,7 +233,6 @@ mod tests {
doc_freq,
IndexRecordOption::WithFreqs,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
@@ -227,7 +242,7 @@ mod tests {
tf_sum: 0
}
);
assert!(skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!(
skip_reader.block_info(),
@@ -237,9 +252,12 @@ mod tests {
tf_sum: 0
}
);
assert!(skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32));
assert!(!skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
}
#[test]
@@ -256,7 +274,6 @@ mod tests {
doc_freq,
IndexRecordOption::Basic,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
@@ -266,7 +283,7 @@ mod tests {
tf_sum: 0u32
}
);
assert!(skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!(
skip_reader.block_info(),
@@ -276,9 +293,12 @@ mod tests {
tf_sum: 0u32
}
);
assert!(skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32));
assert!(!skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
}
#[test]
@@ -294,7 +314,6 @@ mod tests {
doc_freq,
IndexRecordOption::Basic,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
@@ -304,6 +323,7 @@ mod tests {
tf_sum: 0u32
}
);
assert!(!skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
}
}
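
The `SkipReader` is now positioned on its first block at construction time, so the tests above no longer begin with an `advance()` call. A rough, simplified model of that state machine (toy block size, no real decoding):

const BLOCK: u32 = 4; // stands in for COMPRESSION_BLOCK_SIZE (128)

#[derive(Debug, PartialEq)]
enum BlockInfo {
    BitPacked { docs: u32 },
    VInt(u32),
}

struct ToySkipReader {
    remaining_docs: u32,
    block_info: BlockInfo,
}

impl ToySkipReader {
    fn new(doc_freq: u32) -> ToySkipReader {
        // Pre-positioned on the first block instead of a neutral "before start" state.
        let block_info = if doc_freq >= BLOCK {
            BlockInfo::BitPacked { docs: BLOCK }
        } else {
            BlockInfo::VInt(doc_freq)
        };
        ToySkipReader { remaining_docs: doc_freq, block_info }
    }
    fn advance(&mut self) {
        match self.block_info {
            BlockInfo::BitPacked { docs } => self.remaining_docs -= docs,
            BlockInfo::VInt(docs) => self.remaining_docs -= docs,
        }
        self.block_info = if self.remaining_docs >= BLOCK {
            BlockInfo::BitPacked { docs: BLOCK }
        } else {
            BlockInfo::VInt(self.remaining_docs)
        };
    }
}

fn main() {
    let mut reader = ToySkipReader::new(9); // 4 + 4 packed docs, then 1 doc as a VInt tail
    assert_eq!(reader.block_info, BlockInfo::BitPacked { docs: BLOCK });
    reader.advance();
    assert_eq!(reader.block_info, BlockInfo::BitPacked { docs: BLOCK });
    reader.advance();
    assert_eq!(reader.block_info, BlockInfo::VInt(1));
    reader.advance();
    assert_eq!(reader.block_info, BlockInfo::VInt(0)); // stays empty, like the new tests
}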

View File

@@ -43,7 +43,6 @@ where
fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field);
let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict);
@@ -52,12 +51,14 @@ where
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
loop {
for &doc in block_segment_postings.docs() {
doc_bitset.insert(doc);
}
if !block_segment_postings.advance() {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
for &doc in docs {
doc_bitset.insert(doc);
}
block_segment_postings.advance();
}
}
let doc_bitset = BitSetDocSet::from(doc_bitset);

View File

@@ -139,10 +139,10 @@ impl BM25Weight {
mod tests {
use super::idf;
use crate::tests::assert_nearly_equals;
use crate::assert_nearly_equals;
#[test]
fn test_idf() {
assert_nearly_equals(idf(1, 2), 0.6931472);
assert_nearly_equals!(idf(1, 2), 0.6931472);
}
}

View File

@@ -94,7 +94,7 @@ impl BooleanWeight {
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot)
.map(scorer_union::<TScoreCombiner>)
.map(scorer_union::<DoNothingCombiner>)
.map(Into::into);
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
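
The one-line change above routes `Occur::MustNot` scorers through a combiner that ignores score contributions, so excluded terms can filter documents but never change the score. A self-contained sketch of the idea, with simplified stand-ins for tantivy's `ScoreCombiner` implementations:

trait ScoreCombiner: Default {
    fn update(&mut self, score: f32);
    fn score(&self) -> f32;
}

#[derive(Default)]
struct SumCombiner { sum: f32 }

impl ScoreCombiner for SumCombiner {
    fn update(&mut self, score: f32) { self.sum += score; }
    fn score(&self) -> f32 { self.sum }
}

#[derive(Default)]
struct DoNothingCombiner;

impl ScoreCombiner for DoNothingCombiner {
    fn update(&mut self, _score: f32) {} // excluded scorers contribute nothing
    fn score(&self) -> f32 { 0.0 }
}

fn combine<C: ScoreCombiner>(scores: &[f32]) -> f32 {
    let mut combiner = C::default();
    for &score in scores {
        combiner.update(score);
    }
    combiner.score()
}

fn main() {
    let summed = combine::<SumCombiner>(&[0.4, 0.2]);
    assert!((summed - 0.6).abs() < 1e-6);
    assert_eq!(combine::<DoNothingCombiner>(&[0.4, 0.2]), 0.0);
}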

View File

@@ -7,7 +7,9 @@ pub use self::boolean_query::BooleanQuery;
mod tests {
use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::collector::TopDocs;
use crate::query::score_combiner::SumWithCoordsCombiner;
use crate::query::term_query::TermScorer;
use crate::query::Intersection;
@@ -18,9 +20,8 @@ mod tests {
use crate::query::Scorer;
use crate::query::TermQuery;
use crate::schema::*;
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::{DocAddress, DocId};
use crate::{DocAddress, DocId, Score};
fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder();
@@ -140,7 +141,6 @@ mod tests {
.map(|doc| doc.1)
.collect::<Vec<DocId>>()
};
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
@@ -177,6 +177,54 @@ mod tests {
}
}
#[test]
pub fn test_boolean_query_two_excluded() {
let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, text),
IndexRecordOption::Basic,
);
let query: Box<dyn Query> = Box::new(term_query);
query
};
let reader = index.reader().unwrap();
let matching_topdocs = |query: &dyn Query| {
reader
.searcher()
.search(query, &TopDocs::with_limit(3))
.unwrap()
};
let score_doc_4: Score; // score of doc 4 should not be influenced by exclusion
{
let boolean_query_no_excluded =
BooleanQuery::from(vec![(Occur::Must, make_term_query("d"))]);
let topdocs_no_excluded = matching_topdocs(&boolean_query_no_excluded);
assert_eq!(topdocs_no_excluded.len(), 2);
let (top_score, top_doc) = topdocs_no_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(topdocs_no_excluded[1].1, DocAddress(0, 3)); // ignore score of doc 3.
score_doc_4 = top_score;
}
{
let boolean_query_two_excluded = BooleanQuery::from(vec![
(Occur::Must, make_term_query("d")),
(Occur::MustNot, make_term_query("a")),
(Occur::MustNot, make_term_query("b")),
]);
let topdocs_excluded = matching_topdocs(&boolean_query_two_excluded);
assert_eq!(topdocs_excluded.len(), 1);
let (top_score, top_doc) = topdocs_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(top_score, score_doc_4);
}
}
#[test]
pub fn test_boolean_query_with_weight() {
let mut schema_builder = Schema::builder();
@@ -208,14 +256,14 @@ mod tests {
.scorer(searcher.segment_reader(0u32), 1.0f32)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals(boolean_scorer.score(), 0.84163445f32);
assert_nearly_equals!(boolean_scorer.score(), 0.84163445f32);
}
{
let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 2.0f32)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals(boolean_scorer.score(), 1.6832689f32);
assert_nearly_equals!(boolean_scorer.score(), 1.6832689f32);
}
}
@@ -274,7 +322,7 @@ mod tests {
index_writer.add_document(doc!(
// tf = 1 1
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
// tf = 0 0
// tf = 0 0
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
));
for _ in 0..1_000 {

View File

@@ -3,6 +3,11 @@ use crate::query::Scorer;
use crate::DocId;
use crate::Score;
#[inline(always)]
fn is_within<TDocSetExclude: DocSet>(docset: &mut TDocSetExclude, doc: DocId) -> bool {
docset.doc() <= doc && docset.seek(doc) == doc
}
/// Filters a given `DocSet` by removing the docs from a given `DocSet`.
///
/// The excluding docset has no impact on scoring.
@@ -23,8 +28,7 @@ where
) -> Exclude<TDocSet, TDocSetExclude> {
while underlying_docset.doc() != TERMINATED {
let target = underlying_docset.doc();
if excluding_docset.seek(target) != target {
// this document is not excluded.
if !is_within(&mut excluding_docset, target) {
break;
}
underlying_docset.advance();
@@ -36,42 +40,30 @@ where
}
}
impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
where
TDocSet: DocSet,
TDocSetExclude: DocSet,
{
/// Returns true iff the doc is not removed.
///
/// The method has to be called with non strictly
/// increasing `doc`.
fn accept(&mut self) -> bool {
let doc = self.underlying_docset.doc();
self.excluding_docset.seek(doc) != doc
}
}
impl<TDocSet, TDocSetExclude> DocSet for Exclude<TDocSet, TDocSetExclude>
where
TDocSet: DocSet,
TDocSetExclude: DocSet,
{
fn advance(&mut self) -> DocId {
while self.underlying_docset.advance() != TERMINATED {
if self.accept() {
return self.doc();
loop {
let candidate = self.underlying_docset.advance();
if candidate == TERMINATED {
return TERMINATED;
}
if !is_within(&mut self.excluding_docset, candidate) {
return candidate;
}
}
TERMINATED
}
fn seek(&mut self, target: DocId) -> DocId {
let underlying_seek_result = self.underlying_docset.seek(target);
if underlying_seek_result == TERMINATED {
let candidate = self.underlying_docset.seek(target);
if candidate == TERMINATED {
return TERMINATED;
}
if self.accept() {
return underlying_seek_result;
if !is_within(&mut self.excluding_docset, candidate) {
return candidate;
}
self.advance()
}
@@ -129,7 +121,7 @@ mod tests {
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
))
},
vec![1, 2, 5, 8, 10, 15, 24],
vec![5, 8, 10, 15, 24],
);
}
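
The new `is_within` helper checks the excluding docset's current position before seeking, because `seek` must never be called with a target behind the cursor (the same contract enforced by the new `debug_assert` in `DocSet::seek`). A self-contained sketch using a toy sorted-Vec docset:

const TERMINATED: u32 = u32::MAX;

struct SortedDocs { docs: Vec<u32>, cursor: usize }

impl SortedDocs {
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn seek(&mut self, target: u32) -> u32 {
        debug_assert!(self.doc() <= target); // the contract the guard protects
        while self.doc() < target {
            self.cursor += 1;
        }
        self.doc()
    }
}

fn is_within(docset: &mut SortedDocs, doc: u32) -> bool {
    docset.doc() <= doc && docset.seek(doc) == doc
}

fn main() {
    let mut excluded = SortedDocs { docs: vec![2, 5, 9], cursor: 0 };
    assert!(is_within(&mut excluded, 5));  // 5 is excluded
    assert!(!is_within(&mut excluded, 3)); // 3 is behind the cursor, so seek is skipped
    assert!(!is_within(&mut excluded, 7)); // 7 is not in the excluding set
}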

View File

@@ -163,10 +163,10 @@ impl Query for FuzzyTermQuery {
#[cfg(test)]
mod test {
use super::FuzzyTermQuery;
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::schema::Schema;
use crate::schema::TEXT;
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::Term;
@@ -199,7 +199,7 @@ mod test {
.unwrap();
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
let (score, _) = top_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
// fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
@@ -223,7 +223,7 @@ mod test {
.unwrap();
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
let (score, _) = top_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
}
}

View File

@@ -53,7 +53,8 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
}
fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
let mut candidate = 0;
assert!(!docsets.is_empty());
let mut candidate = docsets.iter().map(TDocSet::doc).max().unwrap();
'outer: loop {
for docset in docsets.iter_mut() {
let seek_doc = docset.seek(candidate);
@@ -119,6 +120,9 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
}
}
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate;
}
}
@@ -129,7 +133,10 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
for docset in &mut self.others {
docsets.push(docset);
}
go_to_first_doc(&mut docsets[..])
let doc = go_to_first_doc(&mut docsets[..]);
debug_assert!(docsets.iter().all(|docset| docset.doc() == doc));
debug_assert!(doc >= target);
doc
}
fn doc(&self) -> DocId {
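
`go_to_first_doc` now starts from the maximum of the docsets' current positions instead of 0, so none of them is ever asked to seek backward. A toy sketch of the leapfrog loop with simplified docsets:

const TERMINATED: u32 = u32::MAX;

struct SortedDocs { docs: Vec<u32>, cursor: usize }

impl SortedDocs {
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn seek(&mut self, target: u32) -> u32 {
        debug_assert!(self.doc() <= target);
        while self.doc() < target {
            self.cursor += 1;
        }
        self.doc()
    }
}

fn go_to_first_doc(docsets: &mut [SortedDocs]) -> u32 {
    assert!(!docsets.is_empty());
    // Start from the furthest cursor instead of 0.
    let mut candidate = docsets.iter().map(SortedDocs::doc).max().unwrap();
    'outer: loop {
        for docset in docsets.iter_mut() {
            let seek_doc = docset.seek(candidate);
            if seek_doc > candidate {
                candidate = seek_doc;
                continue 'outer; // restart with the new, larger candidate
            }
        }
        return candidate; // every docset agreed on the candidate
    }
}

fn main() {
    let mut docsets = vec![
        SortedDocs { docs: vec![1, 4, 6, 9], cursor: 1 }, // already positioned on 4
        SortedDocs { docs: vec![2, 4, 9], cursor: 0 },
    ];
    assert_eq!(go_to_first_doc(&mut docsets), 4);
}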

View File

@@ -10,12 +10,13 @@ pub use self::phrase_weight::PhraseWeight;
pub mod tests {
use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
use crate::core::Index;
use crate::query::Weight;
use crate::schema::{Schema, Term, TEXT};
use crate::tests::assert_nearly_equals;
use crate::DocAddress;
use crate::DocId;
use crate::{DocAddress, TERMINATED};
pub fn create_index(texts: &[&'static str]) -> Index {
let mut schema_builder = Schema::builder();
@@ -67,6 +68,23 @@ pub mod tests {
assert!(test_query(vec!["g", "a"]).is_empty());
}
#[test]
pub fn test_phrase_query_simple() -> crate::Result<()> {
let index = create_index(&["a b b d c g c", "a b a b c"]);
let text_field = index.schema().get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let terms: Vec<Term> = vec!["a", "b", "c"]
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::new(terms);
let phrase_weight = phrase_query.phrase_weight(&searcher, false)?;
let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
#[test]
pub fn test_phrase_query_no_score() {
let index = create_index(&[
@@ -157,8 +175,8 @@ pub mod tests {
.to_vec()
};
let scores = test_query(vec!["a", "b"]);
assert_nearly_equals(scores[0], 0.40618482);
assert_nearly_equals(scores[1], 0.46844664);
assert_nearly_equals!(scores[0], 0.40618482);
assert_nearly_equals!(scores[1], 0.46844664);
}
#[test] // motivated by #234

View File

@@ -239,6 +239,7 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
}
fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(target >= self.doc());
let doc = self.intersection_docset.seek(target);
if doc == TERMINATED || self.phrase_match() {
return doc;
@@ -266,7 +267,6 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
#[cfg(test)]
mod tests {
use super::{intersection, intersection_count};
fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {

View File

@@ -113,7 +113,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
/// The language covered by the current parser is extremely simple.
///
/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using
/// tantivy's [`SimpleTokenizer`](tantivy::tokenizer::SimpleTokenizer), hence
/// tantivy's [`SimpleTokenizer`](../tokenizer/struct.SimpleTokenizer.html), hence
/// becoming `["barack", "obama"]`. The terms are then searched within
/// the default terms of the query parser.
///

View File

@@ -301,12 +301,14 @@ impl Weight for RangeWeight {
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
for &doc in block_segment_postings.docs() {
doc_bitset.insert(doc);
}
if !block_segment_postings.advance() {
break;
}
block_segment_postings.advance();
}
}
let doc_bitset = BitSetDocSet::from(doc_bitset);

View File

@@ -89,10 +89,10 @@ impl Query for RegexQuery {
#[cfg(test)]
mod test {
use super::RegexQuery;
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::schema::TEXT;
use crate::schema::{Field, Schema};
use crate::tests::assert_nearly_equals;
use crate::{Index, IndexReader};
use std::sync::Arc;
use tantivy_fst::Regex;
@@ -129,7 +129,7 @@ mod test {
.unwrap();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
let top_docs = searcher
.search(&query_matching_zero, &TopDocs::with_limit(2))

View File

@@ -72,7 +72,7 @@ where
let doc = self.doc();
let mut score_combiner = TScoreCombiner::default();
score_combiner.update(&mut self.req_scorer);
if self.opt_scorer.seek(doc) == doc {
if self.opt_scorer.doc() <= doc && self.opt_scorer.seek(doc) == doc {
score_combiner.update(&mut self.opt_scorer);
}
let score = score_combiner.score();

View File

@@ -9,13 +9,14 @@ pub use self::term_weight::TermWeight;
#[cfg(test)]
mod tests {
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::docset::DocSet;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::Term;
use crate::{Index, TERMINATED};
#[test]
pub fn test_term_query_no_freq() {
@@ -42,6 +43,41 @@ mod tests {
assert_eq!(term_scorer.score(), 0.28768212);
}
#[test]
pub fn test_term_query_multiple_of_block_len() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
}
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
IndexRecordOption::Basic,
);
let term_weight = term_query.weight(&searcher, true)?;
let segment_reader = searcher.segment_reader(0);
let mut term_scorer = term_weight.scorer(segment_reader, 1.0f32)?;
for i in 0u32..COMPRESSION_BLOCK_SIZE as u32 {
assert_eq!(term_scorer.doc(), i);
if i == COMPRESSION_BLOCK_SIZE as u32 - 1u32 {
assert_eq!(term_scorer.advance(), TERMINATED);
} else {
assert_eq!(term_scorer.advance(), i + 1);
}
}
assert_eq!(term_scorer.doc(), TERMINATED);
Ok(())
}
#[test]
pub fn test_term_weight() {
let mut schema_builder = Schema::builder();
@@ -69,7 +105,7 @@ mod tests {
.unwrap();
assert_eq!(topdocs.len(), 1);
let (score, _) = topdocs[0];
assert_nearly_equals(0.77802235, score);
assert_nearly_equals!(0.77802235, score);
}
{
let term = Term::from_field_text(left_field, "left1");
@@ -79,9 +115,9 @@ mod tests {
.unwrap();
assert_eq!(top_docs.len(), 2);
let (score1, _) = top_docs[0];
assert_nearly_equals(0.27101856, score1);
assert_nearly_equals!(0.27101856, score1);
let (score2, _) = top_docs[1];
assert_nearly_equals(0.13736556, score2);
assert_nearly_equals!(0.13736556, score2);
}
{
let query_parser = QueryParser::for_index(&index, vec![]);
@@ -89,9 +125,9 @@ mod tests {
let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
assert_eq!(top_docs.len(), 2);
let (score1, _) = top_docs[0];
assert_nearly_equals(0.9153879, score1);
assert_nearly_equals!(0.9153879, score1);
let (score2, _) = top_docs[1];
assert_nearly_equals(0.27101856, score2);
assert_nearly_equals!(0.27101856, score2);
}
}
@@ -112,6 +148,27 @@ mod tests {
assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
}
#[test]
fn test_term_query_simple_seek() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a"));
index_writer.commit()?;
let term_a = Term::from_field_text(text_field, "a");
let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
let searcher = index.reader()?.searcher();
let term_weight = term_query.weight(&searcher, false)?;
let mut term_scorer = term_weight.scorer(searcher.segment_reader(0u32), 1.0f32)?;
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1u32);
assert_eq!(term_scorer.doc(), 1u32);
Ok(())
}
#[test]
fn test_term_query_debug() {
let term_query = TermQuery::new(

View File

@@ -20,12 +20,12 @@ pub struct TermWeight {
impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> {
let term_scorer = self.scorer_specialized(reader, boost)?;
let term_scorer = self.specialized_scorer(reader, boost)?;
Ok(Box::new(term_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
let mut scorer = self.specialized_scorer(reader, 1.0f32)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
@@ -52,7 +52,7 @@ impl Weight for TermWeight {
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
let mut scorer = self.specialized_scorer(reader, 1.0f32)?;
for_each_scorer(&mut scorer, callback);
Ok(())
}
@@ -92,7 +92,7 @@ impl TermWeight {
}
}
fn scorer_specialized(&self, reader: &SegmentReader, boost: f32) -> Result<TermScorer> {
fn specialized_scorer(&self, reader: &SegmentReader, boost: f32) -> Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);

View File

@@ -183,7 +183,10 @@ where
// advance all docsets to a doc >= to the target.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
unordered_drain_filter(&mut self.docsets, |docset| {
docset.seek(target) == TERMINATED
if docset.doc() < target {
docset.seek(target);
}
docset.doc() == TERMINATED
});
// at this point all of the docsets

View File

@@ -22,7 +22,7 @@ pub enum ReloadPolicy {
/// The index is entirely reloaded manually.
/// All updates of the index should be manual.
///
/// No change is reflected automatically. You are required to call `.load_seacher()` manually.
/// No change is reflected automatically. You are required to call `IndexReader::reload()` manually.
Manual,
/// The index is reloaded within milliseconds after a new commit is available.
/// This is made possible by watching changes in the `meta.json` file.

View File

@@ -14,7 +14,7 @@ use std::fmt;
/// - a field name
/// - a field type, itself wrapping up options describing
/// how the field should be indexed.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub struct FieldEntry {
name: String,
field_type: FieldType,

View File

@@ -48,7 +48,7 @@ pub enum Type {
/// A `FieldType` describes the type (text, u64) of a field as well as
/// how it should be handled by tantivy.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub enum FieldType {
/// String field type configuration
Str(TextOptions),

View File

@@ -6,7 +6,7 @@ use std::borrow::Cow;
use std::ops::BitOr;
/// Define how a text field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TextOptions {
indexing: Option<TextFieldIndexing>,
stored: bool,
@@ -51,7 +51,7 @@ impl Default for TextOptions {
/// - the amount of information that should be stored about the presence of a term in a document.
/// Essentially, should we store the term frequency and/or the positions (See [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct TextFieldIndexing {
record: IndexRecordOption,
tokenizer: Cow<'static, str>,