Compare commits


1 Commit

Author         SHA1        Message                               Date
Paul Masurel   e4759b1d82  Fixes build for no-default-features   2020-06-05 19:40:32 +09:00
38 changed files with 207 additions and 401 deletions


@@ -117,16 +117,11 @@ fn main() -> tantivy::Result<()> {
if let Some(mut block_segment_postings) = if let Some(mut block_segment_postings) =
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
{ {
loop { while block_segment_postings.advance() {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
// Once again these docs MAY contain deleted documents as well. // Once again these docs MAY contain deleted documents as well.
let docs = block_segment_postings.docs(); let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].` // Prints `Docs [0, 2].`
println!("Docs {:?}", docs); println!("Docs {:?}", docs);
block_segment_postings.advance();
} }
} }
} }
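For reference, a minimal consumption sketch matching the variant in which `advance()` returns a bool (the other variant instead loops until `docs()` comes back empty); it mirrors the example hunk above and is illustrative only:

    // Assumes the `advance() -> bool` form of BlockSegmentPostings shown above.
    if let Some(mut postings) =
        inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
    {
        while postings.advance() {
            // The block may still contain deleted documents.
            for &doc in postings.docs() {
                println!("Doc {}", doc);
            }
        }
    }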


@@ -283,7 +283,7 @@ impl Index {
TantivyError::LockFailure( TantivyError::LockFailure(
err, err,
Some( Some(
"Failed to acquire index lock. If you are using \ "Failed to acquire index lock. If you are using\
a regular directory, this means there is already an \ a regular directory, this means there is already an \
`IndexWriter` working on this `Directory`, in this process \ `IndexWriter` working on this `Directory`, in this process \
or in a different process." or in a different process."


@@ -213,7 +213,7 @@ pub struct IndexMeta {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
/// Payload associated to the last commit. /// Payload associated to the last commit.
/// ///
/// Upon commit, clients can optionally add a small `String` payload to their commit /// Upon commit, clients can optionally add a small `Striing` payload to their commit
/// to help identify this commit. /// to help identify this commit.
/// This payload is entirely unused by tantivy. /// This payload is entirely unused by tantivy.
pub payload: Option<String>, pub payload: Option<String>,
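As a usage sketch of this payload, assuming tantivy's `prepare_commit()` / `set_payload()` API (the payload string below is a hypothetical identifier):

    let mut prepared = index_writer.prepare_commit()?;
    prepared.set_payload("ingest-batch-42"); // hypothetical payload, unused by tantivy itself
    prepared.commit()?;
    // After the commit, `IndexMeta::payload` would hold Some("ingest-batch-42".to_string()).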


@@ -8,7 +8,7 @@ use crate::directory::ReadOnlySource;
use crate::fastfield::DeleteBitSet; use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader; use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders; use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::fieldnorm::FieldNormReader;
use crate::schema::Field; use crate::schema::Field;
use crate::schema::FieldType; use crate::schema::FieldType;
use crate::schema::Schema; use crate::schema::Schema;
@@ -48,7 +48,7 @@ pub struct SegmentReader {
positions_composite: CompositeFile, positions_composite: CompositeFile,
positions_idx_composite: CompositeFile, positions_idx_composite: CompositeFile,
fast_fields_readers: Arc<FastFieldReaders>, fast_fields_readers: Arc<FastFieldReaders>,
fieldnorm_readers: FieldNormReaders, fieldnorms_composite: CompositeFile,
store_source: ReadOnlySource, store_source: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>, delete_bitset_opt: Option<DeleteBitSet>,
@@ -126,8 +126,8 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in /// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment. /// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader { pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) { if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
fieldnorm_reader FieldNormReader::open(fieldnorm_source)
} else { } else {
let field_name = self.schema.get_field_name(field); let field_name = self.schema.get_field_name(field);
let err_msg = format!( let err_msg = format!(
@@ -178,8 +178,8 @@ impl SegmentReader {
let fast_field_readers = let fast_field_readers =
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?); Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?; let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?; let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
let delete_bitset_opt = if segment.meta().has_deletes() { let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?; let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -195,7 +195,7 @@ impl SegmentReader {
termdict_composite, termdict_composite,
postings_composite, postings_composite,
fast_fields_readers: fast_field_readers, fast_fields_readers: fast_field_readers,
fieldnorm_readers, fieldnorms_composite,
segment_id: segment.id(), segment_id: segment.id(),
store_source, store_source,
delete_bitset_opt, delete_bitset_opt,
@@ -295,8 +295,8 @@ impl SegmentReader {
} }
/// Returns an iterator that will iterate over the alive document ids /// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a { pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc)) SegmentReaderAliveDocsIterator::new(&self)
} }
/// Summarize total space usage of this segment. /// Summarize total space usage of this segment.
@@ -308,7 +308,7 @@ impl SegmentReader {
self.positions_composite.space_usage(), self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(), self.positions_idx_composite.space_usage(),
self.fast_fields_readers.space_usage(), self.fast_fields_readers.space_usage(),
self.fieldnorm_readers.space_usage(), self.fieldnorms_composite.space_usage(),
self.get_store_reader().space_usage(), self.get_store_reader().space_usage(),
self.delete_bitset_opt self.delete_bitset_opt
.as_ref() .as_ref()
@@ -324,6 +324,52 @@ impl fmt::Debug for SegmentReader {
} }
} }
/// Implements the iterator trait to allow easy iteration
/// over non-deleted ("alive") DocIds in a SegmentReader
pub struct SegmentReaderAliveDocsIterator<'a> {
reader: &'a SegmentReader,
max_doc: DocId,
current: DocId,
}
impl<'a> SegmentReaderAliveDocsIterator<'a> {
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
SegmentReaderAliveDocsIterator {
reader,
max_doc: reader.max_doc(),
current: 0,
}
}
}
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
type Item = DocId;
fn next(&mut self) -> Option<Self::Item> {
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
if self.current >= self.max_doc {
return None;
}
// find the next alive doc id
while self.reader.is_deleted(self.current) {
self.current += 1;
if self.current >= self.max_doc {
return None;
}
}
// capture the current alive DocId
let result = Some(self.current);
// move down the chain
self.current += 1;
result
}
}
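A minimal usage sketch; it applies to either form of `doc_ids_alive` shown above, since both yield an iterator over `DocId`:

    // Collect the alive (non-deleted) document ids of a segment.
    fn alive_docs(reader: &SegmentReader) -> Vec<DocId> {
        reader.doc_ids_alive().collect()
    }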
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use crate::core::Index; use crate::core::Index;


@@ -38,7 +38,6 @@ pub trait DocSet {
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a DocSet. /// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a DocSet.
fn seek(&mut self, target: DocId) -> DocId { fn seek(&mut self, target: DocId) -> DocId {
let mut doc = self.doc(); let mut doc = self.doc();
debug_assert!(doc <= target);
while doc < target { while doc < target {
doc = self.advance(); doc = self.advance();
} }


@@ -21,7 +21,7 @@ mod reader;
mod serializer; mod serializer;
mod writer; mod writer;
pub use self::reader::{FieldNormReader, FieldNormReaders}; pub use self::reader::FieldNormReader;
pub use self::serializer::FieldNormsSerializer; pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter; pub use self::writer::FieldNormsWriter;


@@ -1,41 +1,6 @@
use super::{fieldnorm_to_id, id_to_fieldnorm}; use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::ReadOnlySource; use crate::directory::ReadOnlySource;
use crate::schema::Field;
use crate::space_usage::PerFieldSpaceUsage;
use crate::DocId; use crate::DocId;
use std::sync::Arc;
/// Reader for the fieldnorm (for each document, the number of tokens indexed in the
/// field) of all indexed fields in the index.
///
/// Each fieldnorm is approximately compressed over one byte. We refer to this byte as
/// `fieldnorm_id`.
/// The mapping from `fieldnorm` to `fieldnorm_id` is given by monotonic.
#[derive(Clone)]
pub struct FieldNormReaders {
data: Arc<CompositeFile>,
}
impl FieldNormReaders {
/// Creates a field norm reader.
pub fn new(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
let data = CompositeFile::open(&source)?;
Ok(FieldNormReaders {
data: Arc::new(data),
})
}
/// Returns the FieldNormReader for a specific field.
pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
self.data.open_read(field).map(FieldNormReader::open)
}
/// Return a break down of the space usage per field.
pub fn space_usage(&self) -> PerFieldSpaceUsage {
self.data.space_usage()
}
}
/// Reads the fieldnorm associated to a document. /// Reads the fieldnorm associated to a document.
/// The fieldnorm represents the length associated to /// The fieldnorm represents the length associated to
@@ -54,7 +19,6 @@ impl FieldNormReaders {
/// Apart from compression, this scale also makes it possible to /// Apart from compression, this scale also makes it possible to
/// precompute computationally expensive functions of the fieldnorm /// precompute computationally expensive functions of the fieldnorm
/// in a very short array. /// in a very short array.
#[derive(Clone)]
pub struct FieldNormReader { pub struct FieldNormReader {
data: ReadOnlySource, data: ReadOnlySource,
} }
@@ -65,11 +29,6 @@ impl FieldNormReader {
FieldNormReader { data } FieldNormReader { data }
} }
/// Returns the number of documents in this segment.
pub fn num_docs(&self) -> u32 {
self.data.len() as u32
}
/// Returns the `fieldnorm` associated to a doc id. /// Returns the `fieldnorm` associated to a doc id.
/// The fieldnorm is a value approximating the number /// The fieldnorm is a value approximating the number
/// of tokens in a given field of the `doc_id`. /// of tokens in a given field of the `doc_id`.
@@ -106,11 +65,10 @@ impl FieldNormReader {
} }
#[cfg(test)] #[cfg(test)]
impl From<&[u32]> for FieldNormReader { impl From<Vec<u32>> for FieldNormReader {
fn from(field_norms: &[u32]) -> FieldNormReader { fn from(field_norms: Vec<u32>) -> FieldNormReader {
let field_norms_id = field_norms let field_norms_id = field_norms
.iter() .into_iter()
.cloned()
.map(FieldNormReader::fieldnorm_to_id) .map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>(); .collect::<Vec<u8>>();
let field_norms_data = ReadOnlySource::from(field_norms_id); let field_norms_data = ReadOnlySource::from(field_norms_id);


@@ -78,12 +78,11 @@ impl FieldNormsWriter {
} }
/// Serialize the seen fieldnorm values to the serializer for all fields. /// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> { pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() { for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..]; let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?; fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
} }
fieldnorms_serializer.close()?;
Ok(()) Ok(())
} }
} }
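For orientation, the two signatures above are driven differently by their callers (condensed from the SegmentSerializer and SegmentWriter hunks later in this diff; comments only):

    // By-value form: the serializer is consumed and closed inside serialize().
    //     fieldnorms_writer.serialize(fieldnorms_serializer)?;
    // By-&mut form: the serializer stays owned by SegmentSerializer and is closed in its close().
    //     fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;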


@@ -167,7 +167,7 @@ impl IndexMerger {
fn write_fieldnorms( fn write_fieldnorms(
&self, &self,
mut fieldnorms_serializer: FieldNormsSerializer, fieldnorms_serializer: &mut FieldNormsSerializer,
) -> crate::Result<()> { ) -> crate::Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema); let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize); let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
@@ -182,7 +182,6 @@ impl IndexMerger {
} }
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?; fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
} }
fieldnorms_serializer.close()?;
Ok(()) Ok(())
} }
@@ -669,10 +668,8 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger { impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> { fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
self.write_fieldnorms(fieldnorms_serializer)?;
}
let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?; let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?; self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
self.write_storable_fields(serializer.get_store_writer())?; self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?; serializer.close()?;
@@ -1507,9 +1504,12 @@ mod tests {
for i in 0..100 { for i in 0..100 {
let mut doc = Document::new(); let mut doc = Document::new();
doc.add_f64(field, 42.0); doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24); doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27); doc.add_f64(multi_field, 0.27);
writer.add_document(doc); writer.add_document(doc);
if i % 5 == 0 { if i % 5 == 0 {
writer.commit()?; writer.commit()?;
} }
@@ -1521,6 +1521,7 @@ mod tests {
// If a merging thread fails, we should end up with more // If a merging thread fails, we should end up with more
// than one segment here // than one segment here
assert_eq!(1, index.searchable_segments()?.len()); assert_eq!(1, index.searchable_segments()?.len());
Ok(()) Ok(())
} }
} }


@@ -8,16 +8,15 @@ use crate::store::StoreWriter;
/// Segment serializer is in charge of laying out on disk /// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`. /// the data accumulated and sorted by the `SegmentWriter`.
pub struct SegmentSerializer { pub struct SegmentSerializer {
segment: Segment,
store_writer: StoreWriter, store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer, fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: Option<FieldNormsSerializer>, fieldnorms_serializer: FieldNormsSerializer,
postings_serializer: InvertedIndexSerializer, postings_serializer: InvertedIndexSerializer,
} }
impl SegmentSerializer { impl SegmentSerializer {
/// Creates a new `SegmentSerializer`. /// Creates a new `SegmentSerializer`.
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> { pub fn for_segment(segment: &mut Segment) -> crate::Result<SegmentSerializer> {
let store_write = segment.open_write(SegmentComponent::STORE)?; let store_write = segment.open_write(SegmentComponent::STORE)?;
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?; let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
@@ -26,21 +25,15 @@ impl SegmentSerializer {
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?; let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?; let postings_serializer = InvertedIndexSerializer::open(segment)?;
Ok(SegmentSerializer { Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write), store_writer: StoreWriter::new(store_write),
fast_field_serializer, fast_field_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer), fieldnorms_serializer,
postings_serializer, postings_serializer,
}) })
} }
#[allow(dead_code)]
pub fn segment(&self) -> &Segment {
&self.segment
}
/// Accessor to the `PostingsSerializer`. /// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer { pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer &mut self.postings_serializer
@@ -51,11 +44,9 @@ impl SegmentSerializer {
&mut self.fast_field_serializer &mut self.fast_field_serializer
} }
/// Extract the field norm serializer. /// Accessor to the field norm serializer.
/// pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
/// Note the fieldnorms serializer can only be extracted once. &mut self.fieldnorms_serializer
pub fn extract_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
self.fieldnorms_serializer.take()
} }
/// Accessor to the `StoreWriter`. /// Accessor to the `StoreWriter`.
@@ -64,13 +55,11 @@ impl SegmentSerializer {
} }
/// Finalize the segment serialization. /// Finalize the segment serialization.
pub fn close(mut self) -> crate::Result<()> { pub fn close(self) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?; self.fast_field_serializer.close()?;
self.postings_serializer.close()?; self.postings_serializer.close()?;
self.store_writer.close()?; self.store_writer.close()?;
self.fieldnorms_serializer.close()?;
Ok(()) Ok(())
} }
} }
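A condensed view of the two `close()` orderings shown above (comments only, no new behavior):

    // Option-based variant: fieldnorms (if still present) -> fast fields -> postings -> store
    // Plain-field variant:  fast fields -> postings -> store -> fieldnorms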


@@ -112,7 +112,7 @@ fn merge(
target_opstamp: Opstamp, target_opstamp: Opstamp,
) -> crate::Result<SegmentEntry> { ) -> crate::Result<SegmentEntry> {
// first we need to apply deletes to our segment. // first we need to apply deletes to our segment.
let merged_segment = index.new_segment(); let mut merged_segment = index.new_segment();
// First we apply all of the deletes to the merged segment, up to the target opstamp. // First we apply all of the deletes to the merged segment, up to the target opstamp.
for segment_entry in &mut segment_entries { for segment_entry in &mut segment_entries {
@@ -131,13 +131,12 @@ fn merge(
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?; let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
// ... we just serialize this index merger in our new segment to merge the two segments. // ... we just serialize this index merger in our new segment to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?; let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
let num_docs = merger.write(segment_serializer)?; let num_docs = merger.write(segment_serializer)?;
let merged_segment_id = merged_segment.id(); let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
let segment_meta = index.new_segment_meta(merged_segment_id, num_docs);
Ok(SegmentEntry::new(segment_meta, delete_cursor, None)) Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
} }
@@ -522,7 +521,7 @@ impl SegmentUpdater {
/// ///
/// Upon termination of the current merging threads, /// Upon termination of the current merging threads,
/// merge opportunity may appear. /// merge opportunity may appear.
/// //
/// We keep waiting until the merge policy judges that /// We keep waiting until the merge policy judges that
/// no opportunity is available. /// no opportunity is available.
/// ///


@@ -62,12 +62,11 @@ impl SegmentWriter {
/// - schema /// - schema
pub fn for_segment( pub fn for_segment(
memory_budget: usize, memory_budget: usize,
segment: Segment, mut segment: Segment,
schema: &Schema, schema: &Schema,
) -> crate::Result<SegmentWriter> { ) -> crate::Result<SegmentWriter> {
let tokenizer_manager = segment.index().tokenizers().clone();
let table_num_bits = initial_table_size(memory_budget)?; let table_num_bits = initial_table_size(memory_budget)?;
let segment_serializer = SegmentSerializer::for_segment(segment)?; let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits); let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let tokenizers = schema let tokenizers = schema
.fields() .fields()
@@ -77,7 +76,7 @@ impl SegmentWriter {
.get_indexing_options() .get_indexing_options()
.and_then(|text_index_option| { .and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer(); let tokenizer_name = &text_index_option.tokenizer();
tokenizer_manager.get(tokenizer_name) segment.index().tokenizers().get(tokenizer_name)
}), }),
_ => None, _ => None,
}, },
@@ -281,11 +280,9 @@ fn write(
fieldnorms_writer: &FieldNormsWriter, fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer, mut serializer: SegmentSerializer,
) -> crate::Result<()> { ) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
fieldnorms_writer.serialize(fieldnorms_serializer)?;
}
let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?; let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?; fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
serializer.close()?; serializer.close()?;
Ok(()) Ok(())
} }


@@ -298,26 +298,17 @@ mod tests {
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng}; use rand::{Rng, SeedableRng};
/// Checks if left and right are close one to each other. pub fn assert_nearly_equals(expected: f32, val: f32) {
/// Panics if the two values are more than 0.5% apart. assert!(
#[macro_export] nearly_equals(val, expected),
macro_rules! assert_nearly_equals { "Got {}, expected {}.",
($left:expr, $right:expr) => {{ val,
match (&$left, &$right) { expected
(left_val, right_val) => { );
let diff = (left_val - right_val).abs(); }
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add { pub fn nearly_equals(a: f32, b: f32) -> bool {
panic!( (a - b).abs() < 0.0005 * (a + b).abs()
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
)
}
}
}
}};
} }
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> { pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
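As a worked example of the tolerance used here, whether via the macro or the helper function, two values compare as nearly equal when their absolute difference is under 0.05% of the sum of their magnitudes:

    // 0.6931472 vs 0.69314 differ by ~7.2e-6, well below 0.0005 * 1.3863 ≈ 6.9e-4.
    assert!((0.6931472f32 - 0.69314f32).abs() < 0.0005 * (0.6931472f32 + 0.69314f32).abs());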


@@ -47,6 +47,7 @@ fn decode_vint_block(
doc_offset: DocId, doc_offset: DocId,
num_vint_docs: usize, num_vint_docs: usize,
) { ) {
doc_decoder.clear();
let num_consumed_bytes = doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs); let num_consumed_bytes = doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs);
if let Some(freq_decoder) = freq_decoder_opt { if let Some(freq_decoder) = freq_decoder_opt {
freq_decoder.uncompress_vint_unsorted(&data[num_consumed_bytes..], num_vint_docs); freq_decoder.uncompress_vint_unsorted(&data[num_consumed_bytes..], num_vint_docs);
@@ -98,7 +99,7 @@ impl BlockSegmentPostings {
data: postings_data, data: postings_data,
skip_reader, skip_reader,
}; };
block_segment_postings.load_block(); block_segment_postings.advance();
block_segment_postings block_segment_postings
} }
@@ -116,13 +117,13 @@ impl BlockSegmentPostings {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data); let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
self.data = ReadOnlySource::new(postings_data); self.data = ReadOnlySource::new(postings_data);
self.loaded_offset = std::usize::MAX; self.loaded_offset = std::usize::MAX;
self.loaded_offset = std::usize::MAX;
if let Some(skip_data) = skip_data_opt { if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq); self.skip_reader.reset(skip_data, doc_freq);
} else { } else {
self.skip_reader.reset(ReadOnlySource::empty(), doc_freq); self.skip_reader.reset(ReadOnlySource::empty(), doc_freq);
} }
self.doc_freq = doc_freq as usize; self.doc_freq = doc_freq as usize;
self.load_block();
} }
/// Returns the document frequency associated to this block postings. /// Returns the document frequency associated to this block postings.
@@ -214,10 +215,6 @@ impl BlockSegmentPostings {
); );
} }
BlockInfo::VInt(num_vint_docs) => { BlockInfo::VInt(num_vint_docs) => {
self.doc_decoder.clear();
if num_vint_docs == 0 {
return;
}
decode_vint_block( decode_vint_block(
&mut self.doc_decoder, &mut self.doc_decoder,
if let FreqReadingOption::ReadFreq = self.freq_reading_option { if let FreqReadingOption::ReadFreq = self.freq_reading_option {
@@ -236,9 +233,12 @@ impl BlockSegmentPostings {
/// Advance to the next block. /// Advance to the next block.
/// ///
/// Returns false iff there were no remaining blocks. /// Returns false iff there were no remaining blocks.
pub fn advance(&mut self) { pub fn advance(&mut self) -> bool {
self.skip_reader.advance(); if !self.skip_reader.advance() {
return false;
}
self.load_block(); self.load_block();
true
} }
/// Returns an empty segment postings object /// Returns an empty segment postings object
@@ -294,8 +294,7 @@ mod tests {
#[test] #[test]
fn test_empty_block_segment_postings() { fn test_empty_block_segment_postings() {
let mut postings = BlockSegmentPostings::empty(); let mut postings = BlockSegmentPostings::empty();
postings.advance(); assert!(!postings.advance());
assert!(postings.docs().is_empty());
assert_eq!(postings.doc_freq(), 0); assert_eq!(postings.doc_freq(), 0);
} }
@@ -307,14 +306,13 @@ mod tests {
assert_eq!(block_segments.doc_freq(), 100_000); assert_eq!(block_segments.doc_freq(), 100_000);
loop { loop {
let block = block_segments.docs(); let block = block_segments.docs();
if block.is_empty() {
break;
}
for (i, doc) in block.iter().cloned().enumerate() { for (i, doc) in block.iter().cloned().enumerate() {
assert_eq!(offset + (i as u32), doc); assert_eq!(offset + (i as u32), doc);
} }
offset += block.len() as u32; offset += block.len() as u32;
block_segments.advance(); if block_segments.advance() {
break;
}
} }
} }
@@ -423,6 +421,7 @@ mod tests {
let term_info = inverted_index.get_term_info(&term).unwrap(); let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments); inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
} }
assert!(block_segments.advance());
assert_eq!(block_segments.docs(), &[1, 3, 5]); assert_eq!(block_segments.docs(), &[1, 3, 5]);
} }
} }

View File

@@ -109,7 +109,6 @@ impl BlockDecoder {
} }
pub fn clear(&mut self) { pub fn clear(&mut self) {
self.output_len = 0;
self.output.0.iter_mut().for_each(|el| *el = TERMINATED); self.output.0.iter_mut().for_each(|el| *el = TERMINATED);
} }
} }
@@ -245,19 +244,6 @@ pub mod tests {
} }
} }
#[test]
fn test_clearing() {
let mut encoder = BlockEncoder::new();
let vals = (0u32..128u32).map(|i| i * 3).collect::<Vec<_>>();
let (num_bits, compressed) = encoder.compress_block_sorted(&vals[..], 0u32);
let mut decoder = BlockDecoder::default();
decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
assert_eq!(decoder.output_len, 128);
assert_eq!(decoder.output_array(), &vals[..]);
decoder.clear();
assert!(decoder.output_array().is_empty());
}
#[test] #[test]
fn test_encode_unsorted_block_with_junk() { fn test_encode_unsorted_block_with_junk() {
let mut compressed: Vec<u8> = Vec::new(); let mut compressed: Vec<u8> = Vec::new();


@@ -582,9 +582,6 @@ pub mod tests {
) { ) {
for target in targets { for target in targets {
let mut postings_opt = postings_factory(); let mut postings_opt = postings_factory();
if target < postings_opt.doc() {
continue;
}
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory()); let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
let skip_result_opt = postings_opt.seek(target); let skip_result_opt = postings_opt.seek(target);
let skip_result_unopt = postings_unopt.seek(target); let skip_result_unopt = postings_unopt.seek(target);


@@ -100,15 +100,14 @@ impl DocSet for SegmentPostings {
} }
fn seek(&mut self, target: DocId) -> DocId { fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(self.doc() <= target); if self.doc() == target {
if self.doc() >= target { return target;
return self.doc();
} }
self.block_cursor.seek(target); self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document. // At this point we are on the block, that might contain our document.
let output = self.block_cursor.docs_aligned(); let output = self.block_cursor.docs_aligned();
self.cur = self.block_searcher.search_in_block(&output, target); self.cur = self.block_searcher.search_in_block(&output, target);
// The last block is not full and padded with the value TERMINATED, // The last block is not full and padded with the value TERMINATED,
@@ -124,7 +123,6 @@ impl DocSet for SegmentPostings {
// After the search, the cursor should point to the first value of TERMINATED. // After the search, the cursor should point to the first value of TERMINATED.
let doc = output.0[self.cur]; let doc = output.0[self.cur];
debug_assert!(doc >= target); debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc doc
} }


@@ -81,41 +81,25 @@ impl Default for BlockInfo {
impl SkipReader { impl SkipReader {
pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader { pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader {
let mut skip_reader = SkipReader { SkipReader {
last_doc_in_block: if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { last_doc_in_block: 0u32,
0
} else {
TERMINATED
},
last_doc_in_previous_block: 0u32, last_doc_in_previous_block: 0u32,
owned_read: OwnedRead::new(data), owned_read: OwnedRead::new(data),
skip_info, skip_info,
block_info: BlockInfo::VInt(doc_freq), block_info: BlockInfo::default(),
byte_offset: 0, byte_offset: 0,
remaining_docs: doc_freq, remaining_docs: doc_freq,
position_offset: 0u64, position_offset: 0u64,
};
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
skip_reader.read_block_info();
} }
skip_reader
} }
pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) { pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) {
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { self.last_doc_in_block = 0u32;
0
} else {
TERMINATED
};
self.last_doc_in_previous_block = 0u32; self.last_doc_in_previous_block = 0u32;
self.owned_read = OwnedRead::new(data); self.owned_read = OwnedRead::new(data);
self.block_info = BlockInfo::VInt(doc_freq); self.block_info = BlockInfo::default();
self.byte_offset = 0; self.byte_offset = 0;
self.remaining_docs = doc_freq; self.remaining_docs = doc_freq;
self.position_offset = 0u64;
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info();
}
} }
#[cfg(test)] #[cfg(test)]
@@ -181,7 +165,7 @@ impl SkipReader {
} }
} }
pub fn advance(&mut self) { pub fn advance(&mut self) -> bool {
match self.block_info { match self.block_info {
BlockInfo::BitPacked { BlockInfo::BitPacked {
doc_num_bits, doc_num_bits,
@@ -193,17 +177,17 @@ impl SkipReader {
self.position_offset += tf_sum as u64; self.position_offset += tf_sum as u64;
} }
BlockInfo::VInt(num_vint_docs) => { BlockInfo::VInt(num_vint_docs) => {
debug_assert_eq!(num_vint_docs, self.remaining_docs); self.remaining_docs -= num_vint_docs;
self.remaining_docs = 0;
self.byte_offset = std::usize::MAX;
} }
} }
self.last_doc_in_previous_block = self.last_doc_in_block; self.last_doc_in_previous_block = self.last_doc_in_block;
if self.remaining_docs >= COMPRESSION_BLOCK_SIZE as u32 { if self.remaining_docs >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info(); self.read_block_info();
true
} else { } else {
self.last_doc_in_block = TERMINATED; self.last_doc_in_block = TERMINATED;
self.block_info = BlockInfo::VInt(self.remaining_docs); self.block_info = BlockInfo::VInt(self.remaining_docs);
self.remaining_docs > 0
} }
} }
} }
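A small driving sketch for the variant in which `SkipReader::advance()` returns a bool; `block_info()` and the `BlockInfo` variants are as used in the tests below:

    while skip_reader.advance() {
        match skip_reader.block_info() {
            BlockInfo::BitPacked { doc_num_bits, .. } => {
                // A full bit-packed block; `last_doc_in_block()` bounds its doc ids.
                let _ = doc_num_bits;
            }
            BlockInfo::VInt(num_docs) => {
                // The final block, variable-int encoded, holding `num_docs` documents.
                let _ = num_docs;
            }
        }
    }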
@@ -233,6 +217,7 @@ mod tests {
doc_freq, doc_freq,
IndexRecordOption::WithFreqs, IndexRecordOption::WithFreqs,
); );
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!( assert_eq!(
skip_reader.block_info(), skip_reader.block_info(),
@@ -242,7 +227,7 @@ mod tests {
tf_sum: 0 tf_sum: 0
} }
); );
skip_reader.advance(); assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 5u32); assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!( assert_eq!(
skip_reader.block_info(), skip_reader.block_info(),
@@ -252,12 +237,9 @@ mod tests {
tf_sum: 0 tf_sum: 0
} }
); );
skip_reader.advance(); assert!(skip_reader.advance());
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32)); assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32));
skip_reader.advance(); assert!(!skip_reader.advance());
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
} }
#[test] #[test]
@@ -274,6 +256,7 @@ mod tests {
doc_freq, doc_freq,
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!( assert_eq!(
skip_reader.block_info(), skip_reader.block_info(),
@@ -283,7 +266,7 @@ mod tests {
tf_sum: 0u32 tf_sum: 0u32
} }
); );
skip_reader.advance(); assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 5u32); assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!( assert_eq!(
skip_reader.block_info(), skip_reader.block_info(),
@@ -293,12 +276,9 @@ mod tests {
tf_sum: 0u32 tf_sum: 0u32
} }
); );
skip_reader.advance(); assert!(skip_reader.advance());
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32)); assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32));
skip_reader.advance(); assert!(!skip_reader.advance());
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
} }
#[test] #[test]
@@ -314,6 +294,7 @@ mod tests {
doc_freq, doc_freq,
IndexRecordOption::Basic, IndexRecordOption::Basic,
); );
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!( assert_eq!(
skip_reader.block_info(), skip_reader.block_info(),
@@ -323,7 +304,6 @@ mod tests {
tf_sum: 0u32 tf_sum: 0u32
} }
); );
skip_reader.advance(); assert!(!skip_reader.advance());
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
} }
} }


@@ -43,6 +43,7 @@ where
fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc(); let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc); let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field); let inverted_index = reader.inverted_index(self.field);
let term_dict = inverted_index.terms(); let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict); let mut term_stream = self.automaton_stream(term_dict);
@@ -51,14 +52,12 @@ where
let mut block_segment_postings = inverted_index let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic); .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
loop { loop {
let docs = block_segment_postings.docs(); for &doc in block_segment_postings.docs() {
if docs.is_empty() {
break;
}
for &doc in docs {
doc_bitset.insert(doc); doc_bitset.insert(doc);
} }
block_segment_postings.advance(); if !block_segment_postings.advance() {
break;
}
} }
} }
let doc_bitset = BitSetDocSet::from(doc_bitset); let doc_bitset = BitSetDocSet::from(doc_bitset);
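Putting this hunk in context, a sketch of the overall pattern: every document matched by a term produced by the automaton is inserted into a `BitSet`, which is then wrapped into a `BitSetDocSet`. The term-stream calls are assumptions about the surrounding (unshown) code, and the loop uses the `advance() -> bool` form:

    let mut doc_bitset = BitSet::with_max_value(reader.max_doc());
    let mut term_stream = self.automaton_stream(term_dict);
    while term_stream.advance() {
        let term_info = term_stream.value();
        let mut postings = inverted_index
            .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
        loop {
            for &doc in postings.docs() {
                doc_bitset.insert(doc);
            }
            if !postings.advance() {
                break;
            }
        }
    }
    let docset = BitSetDocSet::from(doc_bitset);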


@@ -139,10 +139,10 @@ impl BM25Weight {
mod tests { mod tests {
use super::idf; use super::idf;
use crate::assert_nearly_equals; use crate::tests::assert_nearly_equals;
#[test] #[test]
fn test_idf() { fn test_idf() {
assert_nearly_equals!(idf(1, 2), 0.6931472); assert_nearly_equals(idf(1, 2), 0.6931472);
} }
} }


@@ -94,7 +94,7 @@ impl BooleanWeight {
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot) .remove(&Occur::MustNot)
.map(scorer_union::<DoNothingCombiner>) .map(scorer_union::<TScoreCombiner>)
.map(Into::into); .map(Into::into);
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers


@@ -7,9 +7,7 @@ pub use self::boolean_query::BooleanQuery;
mod tests { mod tests {
use super::*; use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::collector::TopDocs;
use crate::query::score_combiner::SumWithCoordsCombiner; use crate::query::score_combiner::SumWithCoordsCombiner;
use crate::query::term_query::TermScorer; use crate::query::term_query::TermScorer;
use crate::query::Intersection; use crate::query::Intersection;
@@ -20,8 +18,9 @@ mod tests {
use crate::query::Scorer; use crate::query::Scorer;
use crate::query::TermQuery; use crate::query::TermQuery;
use crate::schema::*; use crate::schema::*;
use crate::tests::assert_nearly_equals;
use crate::Index; use crate::Index;
use crate::{DocAddress, DocId, Score}; use crate::{DocAddress, DocId};
fn aux_test_helper() -> (Index, Field) { fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -141,6 +140,7 @@ mod tests {
.map(|doc| doc.1) .map(|doc| doc.1)
.collect::<Vec<DocId>>() .collect::<Vec<DocId>>()
}; };
{ {
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]); let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
@@ -177,54 +177,6 @@ mod tests {
} }
} }
#[test]
pub fn test_boolean_query_two_excluded() {
let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, text),
IndexRecordOption::Basic,
);
let query: Box<dyn Query> = Box::new(term_query);
query
};
let reader = index.reader().unwrap();
let matching_topdocs = |query: &dyn Query| {
reader
.searcher()
.search(query, &TopDocs::with_limit(3))
.unwrap()
};
let score_doc_4: Score; // score of doc 4 should not be influenced by exclusion
{
let boolean_query_no_excluded =
BooleanQuery::from(vec![(Occur::Must, make_term_query("d"))]);
let topdocs_no_excluded = matching_topdocs(&boolean_query_no_excluded);
assert_eq!(topdocs_no_excluded.len(), 2);
let (top_score, top_doc) = topdocs_no_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(topdocs_no_excluded[1].1, DocAddress(0, 3)); // ignore score of doc 3.
score_doc_4 = top_score;
}
{
let boolean_query_two_excluded = BooleanQuery::from(vec![
(Occur::Must, make_term_query("d")),
(Occur::MustNot, make_term_query("a")),
(Occur::MustNot, make_term_query("b")),
]);
let topdocs_excluded = matching_topdocs(&boolean_query_two_excluded);
assert_eq!(topdocs_excluded.len(), 1);
let (top_score, top_doc) = topdocs_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(top_score, score_doc_4);
}
}
#[test] #[test]
pub fn test_boolean_query_with_weight() { pub fn test_boolean_query_with_weight() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -256,14 +208,14 @@ mod tests {
.scorer(searcher.segment_reader(0u32), 1.0f32) .scorer(searcher.segment_reader(0u32), 1.0f32)
.unwrap(); .unwrap();
assert_eq!(boolean_scorer.doc(), 0u32); assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 0.84163445f32); assert_nearly_equals(boolean_scorer.score(), 0.84163445f32);
} }
{ {
let mut boolean_scorer = boolean_weight let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 2.0f32) .scorer(searcher.segment_reader(0u32), 2.0f32)
.unwrap(); .unwrap();
assert_eq!(boolean_scorer.doc(), 0u32); assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals!(boolean_scorer.score(), 1.6832689f32); assert_nearly_equals(boolean_scorer.score(), 1.6832689f32);
} }
} }
@@ -322,7 +274,7 @@ mod tests {
index_writer.add_document(doc!( index_writer.add_document(doc!(
// tf = 1 1 // tf = 1 1
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)", title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
// tf = 0 0 // tf = 0 0
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212" text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
)); ));
for _ in 0..1_000 { for _ in 0..1_000 {


@@ -3,11 +3,6 @@ use crate::query::Scorer;
use crate::DocId; use crate::DocId;
use crate::Score; use crate::Score;
#[inline(always)]
fn is_within<TDocSetExclude: DocSet>(docset: &mut TDocSetExclude, doc: DocId) -> bool {
docset.doc() <= doc && docset.seek(doc) == doc
}
/// Filters a given `DocSet` by removing the docs from a given `DocSet`. /// Filters a given `DocSet` by removing the docs from a given `DocSet`.
/// ///
/// The excluding docset has no impact on scoring. /// The excluding docset has no impact on scoring.
@@ -28,7 +23,8 @@ where
) -> Exclude<TDocSet, TDocSetExclude> { ) -> Exclude<TDocSet, TDocSetExclude> {
while underlying_docset.doc() != TERMINATED { while underlying_docset.doc() != TERMINATED {
let target = underlying_docset.doc(); let target = underlying_docset.doc();
if !is_within(&mut excluding_docset, target) { if excluding_docset.seek(target) != target {
// this document is not excluded.
break; break;
} }
underlying_docset.advance(); underlying_docset.advance();
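A worked trace of the constructor loop above (illustrative sets; the behavior is the same in both variants):

    // underlying = {1, 2, 5, ...}, excluding = {1, 2, 9}
    // doc 1 is excluded     -> advance
    // doc 2 is excluded     -> advance
    // doc 5 is not excluded -> stop; the Exclude docset starts positioned on 5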
@@ -40,30 +36,42 @@ where
} }
} }
impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
where
TDocSet: DocSet,
TDocSetExclude: DocSet,
{
/// Returns true iff the doc is not removed.
///
/// The method has to be called with non strictly
/// increasing `doc`.
fn accept(&mut self) -> bool {
let doc = self.underlying_docset.doc();
self.excluding_docset.seek(doc) != doc
}
}
impl<TDocSet, TDocSetExclude> DocSet for Exclude<TDocSet, TDocSetExclude> impl<TDocSet, TDocSetExclude> DocSet for Exclude<TDocSet, TDocSetExclude>
where where
TDocSet: DocSet, TDocSet: DocSet,
TDocSetExclude: DocSet, TDocSetExclude: DocSet,
{ {
fn advance(&mut self) -> DocId { fn advance(&mut self) -> DocId {
loop { while self.underlying_docset.advance() != TERMINATED {
let candidate = self.underlying_docset.advance(); if self.accept() {
if candidate == TERMINATED { return self.doc();
return TERMINATED;
}
if !is_within(&mut self.excluding_docset, candidate) {
return candidate;
} }
} }
TERMINATED
} }
fn seek(&mut self, target: DocId) -> DocId { fn seek(&mut self, target: DocId) -> DocId {
let candidate = self.underlying_docset.seek(target); let underlying_seek_result = self.underlying_docset.seek(target);
if candidate == TERMINATED { if underlying_seek_result == TERMINATED {
return TERMINATED; return TERMINATED;
} }
if !is_within(&mut self.excluding_docset, candidate) { if self.accept() {
return candidate; return underlying_seek_result;
} }
self.advance() self.advance()
} }
@@ -121,7 +129,7 @@ mod tests {
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]), VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
)) ))
}, },
vec![5, 8, 10, 15, 24], vec![1, 2, 5, 8, 10, 15, 24],
); );
} }


@@ -163,10 +163,10 @@ impl Query for FuzzyTermQuery {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::FuzzyTermQuery; use super::FuzzyTermQuery;
use crate::assert_nearly_equals;
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::schema::Schema; use crate::schema::Schema;
use crate::schema::TEXT; use crate::schema::TEXT;
use crate::tests::assert_nearly_equals;
use crate::Index; use crate::Index;
use crate::Term; use crate::Term;
@@ -199,7 +199,7 @@ mod test {
.unwrap(); .unwrap();
assert_eq!(top_docs.len(), 1, "Expected only 1 document"); assert_eq!(top_docs.len(), 1, "Expected only 1 document");
let (score, _) = top_docs[0]; let (score, _) = top_docs[0];
assert_nearly_equals!(1f32, score); assert_nearly_equals(1f32, score);
} }
// fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n') // fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
@@ -223,7 +223,7 @@ mod test {
.unwrap(); .unwrap();
assert_eq!(top_docs.len(), 1, "Expected only 1 document"); assert_eq!(top_docs.len(), 1, "Expected only 1 document");
let (score, _) = top_docs[0]; let (score, _) = top_docs[0];
assert_nearly_equals!(1f32, score); assert_nearly_equals(1f32, score);
} }
} }
} }


@@ -53,8 +53,7 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
} }
fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId { fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
assert!(!docsets.is_empty()); let mut candidate = 0;
let mut candidate = docsets.iter().map(TDocSet::doc).max().unwrap();
'outer: loop { 'outer: loop {
for docset in docsets.iter_mut() { for docset in docsets.iter_mut() {
let seek_doc = docset.seek(candidate); let seek_doc = docset.seek(candidate);
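A worked trace of the leapfrog performed by `go_to_first_doc` (illustrative values):

    // docsets positioned on their current docs: A = {1, 4, 7}, B = {4, 9}, C = {2, 4, 11}
    // candidate starts at max(1, 4, 2) = 4 in one variant, or at 0 in the other;
    // either way A.seek(4) = 4, B.seek(4) = 4, C.seek(4) = 4, so every docset agrees and 4 is returned.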
@@ -120,9 +119,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
} }
} }
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate; return candidate;
} }
} }
@@ -133,10 +129,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
for docset in &mut self.others { for docset in &mut self.others {
docsets.push(docset); docsets.push(docset);
} }
let doc = go_to_first_doc(&mut docsets[..]); go_to_first_doc(&mut docsets[..])
debug_assert!(docsets.iter().all(|docset| docset.doc() == doc));
debug_assert!(doc >= target);
doc
} }
fn doc(&self) -> DocId { fn doc(&self) -> DocId {


@@ -10,13 +10,12 @@ pub use self::phrase_weight::PhraseWeight;
pub mod tests { pub mod tests {
use super::*; use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE}; use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
use crate::core::Index; use crate::core::Index;
use crate::query::Weight;
use crate::schema::{Schema, Term, TEXT}; use crate::schema::{Schema, Term, TEXT};
use crate::tests::assert_nearly_equals;
use crate::DocAddress;
use crate::DocId; use crate::DocId;
use crate::{DocAddress, TERMINATED};
pub fn create_index(texts: &[&'static str]) -> Index { pub fn create_index(texts: &[&'static str]) -> Index {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -68,23 +67,6 @@ pub mod tests {
assert!(test_query(vec!["g", "a"]).is_empty()); assert!(test_query(vec!["g", "a"]).is_empty());
} }
#[test]
pub fn test_phrase_query_simple() -> crate::Result<()> {
let index = create_index(&["a b b d c g c", "a b a b c"]);
let text_field = index.schema().get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let terms: Vec<Term> = vec!["a", "b", "c"]
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::new(terms);
let phrase_weight = phrase_query.phrase_weight(&searcher, false)?;
let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
#[test] #[test]
pub fn test_phrase_query_no_score() { pub fn test_phrase_query_no_score() {
let index = create_index(&[ let index = create_index(&[
@@ -175,8 +157,8 @@ pub mod tests {
.to_vec() .to_vec()
}; };
let scores = test_query(vec!["a", "b"]); let scores = test_query(vec!["a", "b"]);
assert_nearly_equals!(scores[0], 0.40618482); assert_nearly_equals(scores[0], 0.40618482);
assert_nearly_equals!(scores[1], 0.46844664); assert_nearly_equals(scores[1], 0.46844664);
} }
#[test] // motivated by #234 #[test] // motivated by #234


@@ -239,7 +239,6 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
} }
fn seek(&mut self, target: DocId) -> DocId { fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(target >= self.doc());
let doc = self.intersection_docset.seek(target); let doc = self.intersection_docset.seek(target);
if doc == TERMINATED || self.phrase_match() { if doc == TERMINATED || self.phrase_match() {
return doc; return doc;
@@ -267,6 +266,7 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{intersection, intersection_count}; use super::{intersection, intersection_count};
fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) { fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {


@@ -113,7 +113,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
/// The language covered by the current parser is extremely simple. /// The language covered by the current parser is extremely simple.
/// ///
/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using /// * simple terms: "e.g.: `Barack Obama` are simply tokenized using
/// tantivy's [`SimpleTokenizer`](../tokenizer/struct.SimpleTokenizer.html), hence /// tantivy's [`SimpleTokenizer`](tantivy::tokenizer::SimpleTokenizer), hence
/// becoming `["barack", "obama"]`. The terms are then searched within /// becoming `["barack", "obama"]`. The terms are then searched within
/// the default terms of the query parser. /// the default terms of the query parser.
/// ///


@@ -301,14 +301,12 @@ impl Weight for RangeWeight {
let mut block_segment_postings = inverted_index let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic); .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
loop { loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
for &doc in block_segment_postings.docs() { for &doc in block_segment_postings.docs() {
doc_bitset.insert(doc); doc_bitset.insert(doc);
} }
block_segment_postings.advance(); if !block_segment_postings.advance() {
break;
}
} }
} }
let doc_bitset = BitSetDocSet::from(doc_bitset); let doc_bitset = BitSetDocSet::from(doc_bitset);


@@ -89,10 +89,10 @@ impl Query for RegexQuery {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::RegexQuery; use super::RegexQuery;
use crate::assert_nearly_equals;
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::schema::TEXT; use crate::schema::TEXT;
use crate::schema::{Field, Schema}; use crate::schema::{Field, Schema};
use crate::tests::assert_nearly_equals;
use crate::{Index, IndexReader}; use crate::{Index, IndexReader};
use std::sync::Arc; use std::sync::Arc;
use tantivy_fst::Regex; use tantivy_fst::Regex;
@@ -129,7 +129,7 @@ mod test {
.unwrap(); .unwrap();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document"); assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0]; let (score, _) = scored_docs[0];
assert_nearly_equals!(1f32, score); assert_nearly_equals(1f32, score);
} }
let top_docs = searcher let top_docs = searcher
.search(&query_matching_zero, &TopDocs::with_limit(2)) .search(&query_matching_zero, &TopDocs::with_limit(2))


@@ -72,7 +72,7 @@ where
let doc = self.doc(); let doc = self.doc();
let mut score_combiner = TScoreCombiner::default(); let mut score_combiner = TScoreCombiner::default();
score_combiner.update(&mut self.req_scorer); score_combiner.update(&mut self.req_scorer);
if self.opt_scorer.doc() <= doc && self.opt_scorer.seek(doc) == doc { if self.opt_scorer.seek(doc) == doc {
score_combiner.update(&mut self.opt_scorer); score_combiner.update(&mut self.opt_scorer);
} }
let score = score_combiner.score(); let score = score_combiner.score();


@@ -9,14 +9,13 @@ pub use self::term_weight::TermWeight;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::assert_nearly_equals;
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::docset::DocSet; use crate::docset::DocSet;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{Query, QueryParser, Scorer, TermQuery}; use crate::query::{Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT}; use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::Term; use crate::Term;
use crate::{Index, TERMINATED};
#[test] #[test]
pub fn test_term_query_no_freq() { pub fn test_term_query_no_freq() {
@@ -43,41 +42,6 @@ mod tests {
assert_eq!(term_scorer.score(), 0.28768212); assert_eq!(term_scorer.score(), 0.28768212);
} }
#[test]
pub fn test_term_query_multiple_of_block_len() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
}
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
IndexRecordOption::Basic,
);
let term_weight = term_query.weight(&searcher, true)?;
let segment_reader = searcher.segment_reader(0);
let mut term_scorer = term_weight.scorer(segment_reader, 1.0f32)?;
for i in 0u32..COMPRESSION_BLOCK_SIZE as u32 {
assert_eq!(term_scorer.doc(), i);
if i == COMPRESSION_BLOCK_SIZE as u32 - 1u32 {
assert_eq!(term_scorer.advance(), TERMINATED);
} else {
assert_eq!(term_scorer.advance(), i + 1);
}
}
assert_eq!(term_scorer.doc(), TERMINATED);
Ok(())
}
#[test] #[test]
pub fn test_term_weight() { pub fn test_term_weight() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -105,7 +69,7 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(topdocs.len(), 1); assert_eq!(topdocs.len(), 1);
let (score, _) = topdocs[0]; let (score, _) = topdocs[0];
assert_nearly_equals!(0.77802235, score); assert_nearly_equals(0.77802235, score);
} }
{ {
let term = Term::from_field_text(left_field, "left1"); let term = Term::from_field_text(left_field, "left1");
@@ -115,9 +79,9 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(top_docs.len(), 2); assert_eq!(top_docs.len(), 2);
let (score1, _) = top_docs[0]; let (score1, _) = top_docs[0];
assert_nearly_equals!(0.27101856, score1); assert_nearly_equals(0.27101856, score1);
let (score2, _) = top_docs[1]; let (score2, _) = top_docs[1];
assert_nearly_equals!(0.13736556, score2); assert_nearly_equals(0.13736556, score2);
} }
{ {
let query_parser = QueryParser::for_index(&index, vec![]); let query_parser = QueryParser::for_index(&index, vec![]);
@@ -125,9 +89,9 @@ mod tests {
let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap(); let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
assert_eq!(top_docs.len(), 2); assert_eq!(top_docs.len(), 2);
let (score1, _) = top_docs[0]; let (score1, _) = top_docs[0];
assert_nearly_equals!(0.9153879, score1); assert_nearly_equals(0.9153879, score1);
let (score2, _) = top_docs[1]; let (score2, _) = top_docs[1];
assert_nearly_equals!(0.27101856, score2); assert_nearly_equals(0.27101856, score2);
} }
} }
@@ -148,27 +112,6 @@ mod tests {
assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1); assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
} }
#[test]
fn test_term_query_simple_seek() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a"));
index_writer.commit()?;
let term_a = Term::from_field_text(text_field, "a");
let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
let searcher = index.reader()?.searcher();
let term_weight = term_query.weight(&searcher, false)?;
let mut term_scorer = term_weight.scorer(searcher.segment_reader(0u32), 1.0f32)?;
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1u32);
assert_eq!(term_scorer.doc(), 1u32);
Ok(())
}
#[test] #[test]
fn test_term_query_debug() { fn test_term_query_debug() {
let term_query = TermQuery::new( let term_query = TermQuery::new(

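The tests removed above drive a TermScorer cursor by hand with seek()/advance(). As a hedged sketch under the same index setup, the equivalent match count can also be obtained through the Count collector, without touching the scorer API directly:

use tantivy::collector::Count;
use tantivy::query::TermQuery;
use tantivy::schema::{IndexRecordOption, Schema, STRING};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", STRING);
    let index = Index::create_in_ram(schema_builder.build());
    // Write a single document containing the term "a".
    let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
    index_writer.add_document(doc!(text_field => "a"));
    index_writer.commit()?;
    // Count the documents matching the term instead of stepping a scorer manually.
    let searcher = index.reader()?.searcher();
    let term_query = TermQuery::new(
        Term::from_field_text(text_field, "a"),
        IndexRecordOption::Basic,
    );
    let count = searcher.search(&term_query, &Count)?;
    assert_eq!(count, 1);
    Ok(())
}
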
View File

@@ -20,12 +20,12 @@ pub struct TermWeight {
impl Weight for TermWeight { impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> {
let term_scorer = self.specialized_scorer(reader, boost)?; let term_scorer = self.scorer_specialized(reader, boost)?;
Ok(Box::new(term_scorer)) Ok(Box::new(term_scorer))
} }
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> { fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.specialized_scorer(reader, 1.0f32)?; let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
if scorer.seek(doc) != doc { if scorer.seek(doc) != doc {
return Err(does_not_match(doc)); return Err(does_not_match(doc));
} }
@@ -52,7 +52,7 @@ impl Weight for TermWeight {
reader: &SegmentReader, reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score), callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> { ) -> crate::Result<()> {
let mut scorer = self.specialized_scorer(reader, 1.0f32)?; let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
for_each_scorer(&mut scorer, callback); for_each_scorer(&mut scorer, callback);
Ok(()) Ok(())
} }
@@ -92,7 +92,7 @@ impl TermWeight {
} }
} }
fn specialized_scorer(&self, reader: &SegmentReader, boost: f32) -> Result<TermScorer> { fn scorer_specialized(&self, reader: &SegmentReader, boost: f32) -> Result<TermScorer> {
let field = self.term.field(); let field = self.term.field();
let inverted_index = reader.inverted_index(field); let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field); let fieldnorm_reader = reader.get_fieldnorms_reader(field);

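The rename above is internal; the public path remains Query::weight followed by Weight::scorer per segment. A hedged sketch of driving a Weight manually, assuming the TERMINATED-based cursor API used on the left-hand side of this diff:

use tantivy::query::{Query, Scorer};
use tantivy::{DocSet, Searcher, TERMINATED};

// Hedged sketch: build a Weight once, then ask each segment for a scorer and
// walk it until TERMINATED (assumed to be re-exported at the crate root, as on
// the left-hand side of this diff).
fn walk_matches(searcher: &Searcher, query: &dyn Query) -> tantivy::Result<()> {
    let weight = query.weight(searcher, /* scoring enabled */ true)?;
    for segment_reader in searcher.segment_readers() {
        let mut scorer = weight.scorer(segment_reader, 1.0f32)?;
        // A freshly built scorer is already positioned on its first document.
        while scorer.doc() != TERMINATED {
            println!("doc={} score={}", scorer.doc(), scorer.score());
            scorer.advance();
        }
    }
    Ok(())
}
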
View File

@@ -183,10 +183,7 @@ where
// advance all docsets to a doc >= to the target. // advance all docsets to a doc >= to the target.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))] #[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
unordered_drain_filter(&mut self.docsets, |docset| { unordered_drain_filter(&mut self.docsets, |docset| {
if docset.doc() < target { docset.seek(target) == TERMINATED
docset.seek(target);
}
docset.doc() == TERMINATED
}); });
// at this point all of the docsets // at this point all of the docsets

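For context on the helper called above, a hedged sketch of what an unordered_drain_filter-style routine can look like (illustrative only; the actual tantivy implementation may differ). It drops every element matching the predicate in O(n) via swap_remove, so element order is not preserved:

// Hedged sketch of an `unordered_drain_filter`-style helper: remove elements for
// which the predicate returns true by swapping the last element into the freed slot.
fn unordered_drain_filter<T>(items: &mut Vec<T>, mut predicate: impl FnMut(&mut T) -> bool) {
    let mut i = 0;
    while i < items.len() {
        if predicate(&mut items[i]) {
            items.swap_remove(i); // fill the hole with the last element, do not advance i
        } else {
            i += 1;
        }
    }
}

// Usage mirroring the hunk above: drop every docset that is exhausted after
// seeking to `target` (those names are assumed to be in scope there).
// unordered_drain_filter(&mut docsets, |docset| docset.seek(target) == TERMINATED);
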
View File

@@ -22,7 +22,7 @@ pub enum ReloadPolicy {
/// The index is entirely reloaded manually. /// The index is entirely reloaded manually.
/// All updates of the index should be manual. /// All updates of the index should be manual.
/// ///
/// No change is reflected automatically. You are required to call `IndexReader::reload()` manually. /// No change is reflected automatically. You are required to call `.load_seacher()` manually.
Manual, Manual,
/// The index is reloaded within milliseconds after a new commit is available. /// The index is reloaded within milliseconds after a new commit is available.
/// This is made possible by watching changes in the `meta.json` file. /// This is made possible by watching changes in the `meta.json` file.

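A hedged sketch of the Manual policy described in this doc comment, assuming the IndexReader builder API: new commits stay invisible to the searcher until reload() is called explicitly.

use std::convert::TryInto;
use tantivy::{Index, IndexReader, ReloadPolicy};

// Hedged sketch: `index` is assumed to be an existing tantivy Index.
fn manual_reload(index: &Index) -> tantivy::Result<()> {
    let reader: IndexReader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::Manual)
        .try_into()?;
    // Commits made by an IndexWriter elsewhere are not picked up yet...
    reader.reload()?; // ...until we reload explicitly.
    let searcher = reader.searcher();
    println!("{} documents visible", searcher.num_docs());
    Ok(())
}
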
View File

@@ -14,7 +14,7 @@ use std::fmt;
/// - a field name /// - a field name
/// - a field type, itself wrapping up options describing /// - a field type, itself wrapping up options describing
/// how the field should be indexed. /// how the field should be indexed.
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, Eq, PartialEq)]
pub struct FieldEntry { pub struct FieldEntry {
name: String, name: String,
field_type: FieldType, field_type: FieldType,

View File

@@ -48,7 +48,7 @@ pub enum Type {
/// A `FieldType` describes the type (text, u64) of a field as well as /// A `FieldType` describes the type (text, u64) of a field as well as
/// how it should be handled by tantivy. /// how it should be handled by tantivy.
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, Eq, PartialEq)]
pub enum FieldType { pub enum FieldType {
/// String field type configuration /// String field type configuration
Str(TextOptions), Str(TextOptions),

View File

@@ -6,7 +6,7 @@ use std::borrow::Cow;
use std::ops::BitOr; use std::ops::BitOr;
/// Define how a text field should be handled by tantivy. /// Define how a text field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextOptions { pub struct TextOptions {
indexing: Option<TextFieldIndexing>, indexing: Option<TextFieldIndexing>,
stored: bool, stored: bool,
@@ -51,7 +51,7 @@ impl Default for TextOptions {
/// - the amount of information that should be stored about the presence of a term in a document. /// - the amount of information that should be stored about the presence of a term in a document.
/// Essentially, should we store the term frequency and/or the positions (See [`IndexRecordOption`](./enum.IndexRecordOption.html)). /// Essentially, should we store the term frequency and/or the positions (See [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field. /// - the name of the `Tokenizer` that should be used to process the field.
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] #[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
pub struct TextFieldIndexing { pub struct TextFieldIndexing {
record: IndexRecordOption, record: IndexRecordOption,
tokenizer: Cow<'static, str>, tokenizer: Cow<'static, str>,
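A hedged sketch of how TextFieldIndexing and TextOptions combine when declaring a schema field; the tokenizer name and record option chosen here are illustrative:

use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

// Hedged sketch: a text field indexed with an explicit tokenizer, with term
// frequencies and positions recorded, and the original value stored.
fn build_schema() -> Schema {
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("en_stem")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let options = TextOptions::default()
        .set_indexing_options(indexing)
        .set_stored();
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", options);
    schema_builder.build()
}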