From 75ea74e4650ea372302e46cf81d39412c646f7ad Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 12 Jun 2020 11:27:22 +0900 Subject: [PATCH] added blockwand information --- Cargo.toml | 1 + src/common/serialize.rs | 13 ++ src/common/vint.rs | 2 +- src/directory/footer.rs | 65 ++++-- src/fieldnorm/reader.rs | 34 ++- src/indexer/merger.rs | 117 ++++++++-- src/indexer/segment_serializer.rs | 8 +- src/indexer/segment_writer.rs | 13 +- src/lib.rs | 33 ++- src/postings/block_segment_postings.rs | 102 +++++++-- src/postings/mod.rs | 6 +- src/postings/postings_writer.rs | 14 +- src/postings/recorder.rs | 21 ++ src/postings/segment_postings.rs | 76 +++++-- src/postings/serializer.rs | 108 +++++++-- src/postings/skip.rs | 116 +++++++--- src/query/bm25.rs | 60 +++-- src/query/boolean_query/block_wand.rs | 254 ++++++++++++++++------ src/query/boolean_query/boolean_weight.rs | 79 ++++--- src/query/boolean_query/mod.rs | 6 +- src/query/explanation.rs | 7 + src/query/fuzzy_query.rs | 6 +- src/query/mod.rs | 2 +- src/query/phrase_query/mod.rs | 6 +- src/query/regex_query.rs | 4 +- src/query/term_query/mod.rs | 12 +- src/query/term_query/term_scorer.rs | 163 +++++++++++++- src/query/term_query/term_weight.rs | 8 +- src/schema/text_options.rs | 12 - 29 files changed, 1006 insertions(+), 342 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 070f3104e..a74bf61ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,6 +58,7 @@ winapi = "0.3" rand = "0.7" maplit = "1" matches = "0.1.8" +proptest = "0.10" [dev-dependencies.fail] version = "0.4" diff --git a/src/common/serialize.rs b/src/common/serialize.rs index a9680a72b..6b89bbe70 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -89,6 +89,19 @@ impl FixedSize for u64 { const SIZE_IN_BYTES: usize = 8; } +impl BinarySerializable for f32 { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + writer.write_f32::(*self) + } + fn deserialize(reader: &mut R) -> io::Result { + reader.read_f32::() + } +} + +impl FixedSize for f32 { + const SIZE_IN_BYTES: usize = 4; +} + impl BinarySerializable for i64 { fn serialize(&self, writer: &mut W) -> io::Result<()> { writer.write_i64::(*self) diff --git a/src/common/vint.rs b/src/common/vint.rs index b7f52d612..0fc4ed232 100644 --- a/src/common/vint.rs +++ b/src/common/vint.rs @@ -5,7 +5,7 @@ use std::io::Read; use std::io::Write; /// Wrapper over a `u64` that serializes as a variable int. -#[derive(Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct VInt(pub u64); const STOP_BIT: u8 = 128; diff --git a/src/directory/footer.rs b/src/directory/footer.rs index d11519f5c..80b88d64b 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -94,12 +94,24 @@ impl Footer { match &self.versioned_footer { VersionedFooter::V1 { crc32: _crc, - store_compression: compression, + store_compression, } => { - if &library_version.store_compression != compression { + if &library_version.store_compression != store_compression { return Err(Incompatibility::CompressionMismatch { library_compression_format: library_version.store_compression.to_string(), - index_compression_format: compression.to_string(), + index_compression_format: store_compression.to_string(), + }); + } + Ok(()) + } + VersionedFooter::V2 { + crc32: _crc, + store_compression, + } => { + if &library_version.store_compression != store_compression { + return Err(Incompatibility::CompressionMismatch { + library_compression_format: library_version.store_compression.to_string(), + index_compression_format: store_compression.to_string(), }); } Ok(()) @@ -120,24 +132,29 @@ pub enum VersionedFooter { crc32: CrcHashU32, store_compression: String, }, + // Introduction of the Block WAND information. + V2 { + crc32: CrcHashU32, + store_compression: String, + }, } impl BinarySerializable for VersionedFooter { fn serialize(&self, writer: &mut W) -> io::Result<()> { let mut buf = Vec::new(); match self { - VersionedFooter::V1 { + VersionedFooter::V2 { crc32, store_compression: compression, } => { // Serializes a valid `VersionedFooter` or panics if the version is unknown // [ version | crc_hash | compression_mode ] // [ 0..4 | 4..8 | variable ] - BinarySerializable::serialize(&1u32, &mut buf)?; + BinarySerializable::serialize(&2u32, &mut buf)?; BinarySerializable::serialize(crc32, &mut buf)?; BinarySerializable::serialize(compression, &mut buf)?; } - VersionedFooter::UnknownVersion => { + VersionedFooter::V1 { .. } | VersionedFooter::UnknownVersion => { return Err(io::Error::new( io::ErrorKind::InvalidInput, "Cannot serialize an unknown versioned footer ", @@ -168,10 +185,17 @@ impl BinarySerializable for VersionedFooter { let version = u32::deserialize(&mut cursor)?; if version == 1 { let crc32 = u32::deserialize(&mut cursor)?; - let compression = String::deserialize(&mut cursor)?; + let store_compression = String::deserialize(&mut cursor)?; Ok(VersionedFooter::V1 { crc32, - store_compression: compression, + store_compression, + }) + } else if version == 2 { + let crc32 = u32::deserialize(&mut cursor)?; + let store_compression = String::deserialize(&mut cursor)?; + Ok(VersionedFooter::V2 { + crc32, + store_compression, }) } else { Ok(VersionedFooter::UnknownVersion) @@ -182,6 +206,7 @@ impl BinarySerializable for VersionedFooter { impl VersionedFooter { pub fn crc(&self) -> Option { match self { + VersionedFooter::V2 { crc32, .. } => Some(*crc32), VersionedFooter::V1 { crc32, .. } => Some(*crc32), VersionedFooter::UnknownVersion { .. } => None, } @@ -219,7 +244,7 @@ impl Write for FooterProxy { impl TerminatingWrite for FooterProxy { fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> { let crc32 = self.hasher.take().unwrap().finalize(); - let footer = Footer::new(VersionedFooter::V1 { + let footer = Footer::new(VersionedFooter::V2 { crc32, store_compression: crate::store::COMPRESSION.to_string(), }); @@ -248,15 +273,11 @@ mod tests { assert!(footer_proxy.terminate().is_ok()); assert_eq!(vec.len(), 167); let footer = Footer::deserialize(&mut &vec[..]).unwrap(); - if let VersionedFooter::V1 { - crc32: _, - store_compression, - } = footer.versioned_footer - { - assert_eq!(store_compression, crate::store::COMPRESSION); - } else { - panic!("Versioned footer should be V1."); - } + assert!(matches!( + footer.versioned_footer, + VersionedFooter::V2 { store_compression, .. } + if store_compression == crate::store::COMPRESSION + )); assert_eq!(&footer.version, crate::version()); } @@ -264,7 +285,7 @@ mod tests { fn test_serialize_deserialize_footer() { let mut buffer = Vec::new(); let crc32 = 123456u32; - let footer: Footer = Footer::new(VersionedFooter::V1 { + let footer: Footer = Footer::new(VersionedFooter::V2 { crc32, store_compression: "lz4".to_string(), }); @@ -276,7 +297,7 @@ mod tests { #[test] fn footer_length() { let crc32 = 1111111u32; - let versioned_footer = VersionedFooter::V1 { + let versioned_footer = VersionedFooter::V2 { crc32, store_compression: "lz4".to_string(), }; @@ -297,7 +318,7 @@ mod tests { // versionned footer length 12 | 128, // index format version - 1, + 2, 0, 0, 0, @@ -316,7 +337,7 @@ mod tests { let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap(); assert!(cursor.is_empty()); let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32; - let expected_versioned_footer: VersionedFooter = VersionedFooter::V1 { + let expected_versioned_footer: VersionedFooter = VersionedFooter::V2 { crc32: expected_crc, store_compression: "lz4".to_string(), }; diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index 087384e52..193dda5f8 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -1,30 +1,37 @@ use super::{fieldnorm_to_id, id_to_fieldnorm}; -use crate::directory::ReadOnlySource; -use crate::DocId; use crate::common::CompositeFile; +use crate::directory::ReadOnlySource; use crate::schema::Field; -use std::sync::Arc; use crate::space_usage::PerFieldSpaceUsage; +use crate::DocId; +use std::sync::Arc; +/// Reader for the fieldnorm (for each document, the number of tokens indexed in the +/// field) of all indexed fields in the index. +/// +/// Each fieldnorm is approximately compressed over one byte. We refer to this byte as +/// `fieldnorm_id`. +/// The mapping from `fieldnorm` to `fieldnorm_id` is given by monotonic. #[derive(Clone)] pub struct FieldNormReaders { data: Arc, } impl FieldNormReaders { + /// Creates a field norm reader. pub fn new(source: ReadOnlySource) -> crate::Result { let data = CompositeFile::open(&source)?; Ok(FieldNormReaders { - data: Arc::new(data) + data: Arc::new(data), }) } + /// Returns the FieldNormReader for a specific field. pub fn get_field(&self, field: Field) -> Option { - self.data - .open_read(field) - .map(FieldNormReader::open) + self.data.open_read(field).map(FieldNormReader::open) } + /// Return a break down of the space usage per field. pub fn space_usage(&self) -> PerFieldSpaceUsage { self.data.space_usage() } @@ -47,6 +54,7 @@ impl FieldNormReaders { /// Apart from compression, this scale also makes it possible to /// precompute computationally expensive functions of the fieldnorm /// in a very short array. +#[derive(Clone)] pub struct FieldNormReader { data: ReadOnlySource, } @@ -57,6 +65,11 @@ impl FieldNormReader { FieldNormReader { data } } + /// Returns the number of documents in this segment. + pub fn num_docs(&self) -> u32 { + self.data.len() as u32 + } + /// Returns the `fieldnorm` associated to a doc id. /// The fieldnorm is a value approximating the number /// of tokens in a given field of the `doc_id`. @@ -93,10 +106,11 @@ impl FieldNormReader { } #[cfg(test)] -impl From> for FieldNormReader { - fn from(field_norms: Vec) -> FieldNormReader { +impl From<&[u32]> for FieldNormReader { + fn from(field_norms: &[u32]) -> FieldNormReader { let field_norms_id = field_norms - .into_iter() + .iter() + .cloned() .map(FieldNormReader::fieldnorm_to_id) .collect::>(); let field_norms_data = ReadOnlySource::from(field_norms_id); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index c2a1886b3..b13705369 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -8,9 +8,9 @@ use crate::fastfield::DeleteBitSet; use crate::fastfield::FastFieldReader; use crate::fastfield::FastFieldSerializer; use crate::fastfield::MultiValueIntFastFieldReader; -use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::fieldnorm::FieldNormsSerializer; use crate::fieldnorm::FieldNormsWriter; +use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::indexer::SegmentSerializer; use crate::postings::InvertedIndexSerializer; use crate::postings::Postings; @@ -181,9 +181,9 @@ impl IndexMerger { } } fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?; - } - fieldnorms_serializer.close()?; - Ok(()) + } + fieldnorms_serializer.close()?; + Ok(()) } fn write_fast_fields( @@ -493,7 +493,7 @@ impl IndexMerger { indexed_field: Field, field_type: &FieldType, serializer: &mut InvertedIndexSerializer, - fieldnorm_reader: Option + fieldnorm_reader: Option, ) -> crate::Result> { let mut positions_buffer: Vec = Vec::with_capacity(1_000); let mut delta_computer = DeltaComputer::new(); @@ -552,7 +552,8 @@ impl IndexMerger { // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, // seg0.max_doc + seg1.max_doc + seg2.max_doc] // ... - let mut field_serializer = serializer.new_field(indexed_field, total_num_tokens, fieldnorm_reader)?; + let mut field_serializer = + serializer.new_field(indexed_field, total_num_tokens, fieldnorm_reader)?; let field_entry = self.schema.get_field_entry(indexed_field); @@ -596,7 +597,11 @@ impl IndexMerger { // We know that there is at least one document containing // the term, so we add it. - let to_term_ord = field_serializer.new_term(term_bytes)?; + let term_doc_freq = segment_postings + .iter() + .map(|(_, segment_posting)| segment_posting.doc_freq()) + .sum(); + let to_term_ord = field_serializer.new_term(term_bytes, term_doc_freq)?; if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt { for (segment_ord, from_term_ord) in merged_terms.matching_segments() { @@ -617,8 +622,7 @@ impl IndexMerger { // there is at least one document. let term_freq = segment_postings.term_freq(); segment_postings.positions(&mut positions_buffer); - let delta_positions = - delta_computer.compute_delta(&positions_buffer); + let delta_positions = delta_computer.compute_delta(&positions_buffer); field_serializer.write_doc( remapped_doc_id, term_freq, @@ -641,15 +645,18 @@ impl IndexMerger { fn write_postings( &self, serializer: &mut InvertedIndexSerializer, - fieldnorm_readers: FieldNormReaders + fieldnorm_readers: FieldNormReaders, ) -> crate::Result> { let mut term_ordinal_mappings = HashMap::new(); for (field, field_entry) in self.schema.fields() { let fieldnorm_reader = fieldnorm_readers.get_field(field); if field_entry.is_indexed() { - if let Some(term_ordinal_mapping) = - self.write_postings_for_field(field, field_entry.field_type(), serializer, fieldnorm_reader)? - { + if let Some(term_ordinal_mapping) = self.write_postings_for_field( + field, + field_entry.field_type(), + serializer, + fieldnorm_reader, + )? { term_ordinal_mappings.insert(field, term_ordinal_mapping); } } @@ -675,12 +682,15 @@ impl IndexMerger { impl SerializableSegment for IndexMerger { fn write(&self, mut serializer: SegmentSerializer) -> crate::Result { - if let Some(fieldnorms_serializer) = serializer.get_fieldnorms_serializer() { + if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() { self.write_fieldnorms(fieldnorms_serializer)?; } - let fieldnorm_data = serializer.segment().open_read(SegmentComponent::FIELDNORMS)?; + let fieldnorm_data = serializer + .segment() + .open_read(SegmentComponent::FIELDNORMS)?; let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?; - let term_ord_mappings = self.write_postings(serializer.get_postings_serializer(), fieldnorm_readers)?; + let term_ord_mappings = + self.write_postings(serializer.get_postings_serializer(), fieldnorm_readers)?; self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?; self.write_storable_fields(serializer.get_store_writer())?; serializer.close()?; @@ -690,15 +700,15 @@ impl SerializableSegment for IndexMerger { #[cfg(test)] mod tests { + use crate::assert_nearly_equals; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector}; use crate::collector::{Count, FacetCollector}; use crate::core::Index; use crate::query::AllQuery; use crate::query::BooleanQuery; + use crate::query::Scorer; use crate::query::TermQuery; - use crate::schema; - use crate::schema::Cardinality; use crate::schema::Document; use crate::schema::Facet; use crate::schema::IndexRecordOption; @@ -706,9 +716,11 @@ mod tests { use crate::schema::Term; use crate::schema::TextFieldIndexing; use crate::schema::INDEXED; + use crate::schema::{Cardinality, TEXT}; use crate::DocAddress; use crate::IndexWriter; use crate::Searcher; + use crate::{schema, DocSet, SegmentId}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use futures::executor::block_on; use std::io::Cursor; @@ -1515,12 +1527,9 @@ mod tests { for i in 0..100 { let mut doc = Document::new(); doc.add_f64(field, 42.0); - doc.add_f64(multi_field, 0.24); doc.add_f64(multi_field, 0.27); - writer.add_document(doc); - if i % 5 == 0 { writer.commit()?; } @@ -1532,6 +1541,72 @@ mod tests { // If a merging thread fails, we should end up with more // than one segment here assert_eq!(1, index.searchable_segments()?.len()); + Ok(()) + } + + #[test] + fn test_merged_index_has_blockwand() -> crate::Result<()> { + let mut builder = schema::SchemaBuilder::new(); + let text = builder.add_text_field("text", TEXT); + let index = Index::create_in_ram(builder.build()); + let mut writer = index.writer_with_num_threads(1, 3_000_000)?; + let happy_term = Term::from_field_text(text, "happy"); + let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs); + for _ in 0..62 { + writer.add_document(doc!(text=>"hello happy tax payer")); + } + writer.commit()?; + let reader = index.reader()?; + let searcher = reader.searcher(); + let mut term_scorer = term_query + .specialized_weight(&searcher, true) + .specialized_scorer(searcher.segment_reader(0u32), 1.0f32)?; + assert_eq!(term_scorer.doc(), 0); + assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855); + assert_nearly_equals!(term_scorer.score(), 0.0079681855); + for _ in 0..81 { + writer.add_document(doc!(text=>"hello happy tax payer")); + } + writer.commit()?; + reader.reload()?; + let searcher = reader.searcher(); + + assert_eq!(searcher.segment_readers().len(), 2); + for segment_reader in searcher.segment_readers() { + let mut term_scorer = term_query + .specialized_weight(&searcher, true) + .specialized_scorer(segment_reader, 1.0f32)?; + // the difference compared to before is instrinsic to the bm25 formula. no worries there. + for doc in segment_reader.doc_ids_alive() { + assert_eq!(term_scorer.doc(), doc); + assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312); + assert_nearly_equals!(term_scorer.score(), 0.003478312); + term_scorer.advance(); + } + } + + let segment_ids: Vec = searcher + .segment_readers() + .iter() + .map(|reader| reader.segment_id()) + .collect(); + block_on(writer.merge(&segment_ids[..]))?; + + reader.reload()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + + let segment_reader = searcher.segment_reader(0u32); + let mut term_scorer = term_query + .specialized_weight(&searcher, true) + .specialized_scorer(segment_reader, 1.0f32)?; + // the difference compared to before is instrinsic to the bm25 formula. no worries there. + for doc in segment_reader.doc_ids_alive() { + assert_eq!(term_scorer.doc(), doc); + assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312); + assert_nearly_equals!(term_scorer.score(), 0.003478312); + term_scorer.advance(); + } Ok(()) } diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 03385db3d..a4ac9d677 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -50,8 +50,10 @@ impl SegmentSerializer { &mut self.fast_field_serializer } - /// Accessor to the field norm serializer. - pub fn get_fieldnorms_serializer(&mut self) -> Option { + /// Extract the field norm serializer. + /// + /// Note the fieldnorms serializer can only be extracted once. + pub fn extract_fieldnorms_serializer(&mut self) -> Option { self.fieldnorms_serializer.take() } @@ -62,7 +64,7 @@ impl SegmentSerializer { /// Finalize the segment serialization. pub fn close(mut self) -> crate::Result<()> { - if let Some(fieldnorms_serializer) = self.get_fieldnorms_serializer() { + if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() { fieldnorms_serializer.close()?; } self.fast_field_serializer.close()?; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index bae33e99d..d4bc4a0d1 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -2,7 +2,7 @@ use super::operation::AddOperation; use crate::core::Segment; use crate::core::SerializableSegment; use crate::fastfield::FastFieldsWriter; -use crate::fieldnorm::{FieldNormsWriter, FieldNormReaders}; +use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; use crate::indexer::segment_serializer::SegmentSerializer; use crate::postings::compute_table_size; use crate::postings::MultiFieldPostingsWriter; @@ -14,8 +14,8 @@ use crate::schema::{Field, FieldEntry}; use crate::tokenizer::{BoxTokenStream, PreTokenizedStream}; use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{TokenStreamChain, Tokenizer}; -use crate::{DocId, SegmentComponent}; use crate::Opstamp; +use crate::{DocId, SegmentComponent}; use std::io; use std::str; @@ -281,12 +281,15 @@ fn write( fieldnorms_writer: &FieldNormsWriter, mut serializer: SegmentSerializer, ) -> crate::Result<()> { - if let Some(fieldnorms_serializer) = serializer.get_fieldnorms_serializer() { + if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() { fieldnorms_writer.serialize(fieldnorms_serializer)?; } - let fieldnorm_data = serializer.segment().open_read(SegmentComponent::FIELDNORMS)?; + let fieldnorm_data = serializer + .segment() + .open_read(SegmentComponent::FIELDNORMS)?; let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?; - let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer(), fieldnorm_readers)?; + let term_ord_map = + multifield_postings.serialize(serializer.get_postings_serializer(), fieldnorm_readers)?; fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?; serializer.close()?; Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 7367050ce..9f9856f0a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -173,7 +173,7 @@ use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; /// Index format version. -const INDEX_FORMAT_VERSION: u32 = 1; +const INDEX_FORMAT_VERSION: u32 = 2; /// Structure version for the index. #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -298,17 +298,26 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; - pub fn assert_nearly_equals(expected: f32, val: f32) { - assert!( - nearly_equals(val, expected), - "Got {}, expected {}.", - val, - expected - ); - } - - pub fn nearly_equals(a: f32, b: f32) -> bool { - (a - b).abs() < 0.0005 * (a + b).abs() + /// Checks if left and right are close one to each other. + /// Panics if the two values are more than 0.5% apart. + #[macro_export] + macro_rules! assert_nearly_equals { + ($left:expr, $right:expr) => {{ + match (&$left, &$right) { + (left_val, right_val) => { + let diff = (left_val - right_val).abs(); + let add = left_val.abs() + right_val.abs(); + if diff > 0.0005 * add { + panic!( + r#"assertion failed: `(left ~= right)` + left: `{:?}`, + right: `{:?}`"#, + &*left_val, &*right_val + ) + } + } + } + }}; } pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec { diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 9c55e52c3..8ca63bd1e 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -1,11 +1,21 @@ use crate::common::{BinarySerializable, VInt}; use crate::directory::ReadOnlySource; +use crate::fieldnorm::FieldNormReader; use crate::postings::compression::{ AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE, }; use crate::postings::{BlockInfo, FreqReadingOption, SkipReader}; +use crate::query::BM25Weight; use crate::schema::IndexRecordOption; -use crate::{DocId, TERMINATED}; +use crate::{DocId, Score, TERMINATED}; + +fn max_f32>(mut it: I) -> Option { + if let Some(first) = it.next() { + Some(it.fold(first, f32::max)) + } else { + None + } +} /// `BlockSegmentPostings` is a cursor iterating over blocks /// of documents. @@ -19,6 +29,7 @@ pub struct BlockSegmentPostings { loaded_offset: usize, freq_decoder: BlockDecoder, freq_reading_option: FreqReadingOption, + block_max_score_cache: Option, doc_freq: u32, @@ -94,14 +105,52 @@ impl BlockSegmentPostings { loaded_offset: std::usize::MAX, freq_decoder: BlockDecoder::with_val(1), freq_reading_option, + block_max_score_cache: None, doc_freq, data: postings_data, skip_reader, }; - block_segment_postings.advance(); + block_segment_postings.load_block(); block_segment_postings } + /// Returns the block_max_score for the current block. + /// It does not require the block to be loaded. For instance, it is ok to call this method + /// after having called `.shallow_advance(..)`. + /// + /// See `TermScorer::block_max_score(..)` for more information. + pub fn block_max_score( + &mut self, + fieldnorm_reader: &FieldNormReader, + bm25_weight: &BM25Weight, + ) -> Score { + let (block_max_score_cache, skip_reader, doc_decoder, freq_decoder) = ( + &mut self.block_max_score_cache, + &self.skip_reader, + &self.doc_decoder, + &self.freq_decoder, + ); + *block_max_score_cache.get_or_insert_with(|| { + skip_reader + .block_max_score(bm25_weight) + .or_else(|| { + let docs = doc_decoder.output_array(); + let freqs = freq_decoder.output_array(); + max_f32(docs.iter().cloned().zip(freqs.iter().cloned()).map( + |(doc, term_freq)| { + let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); + bm25_weight.score(fieldnorm_id, term_freq) + }, + )) + }) + .unwrap_or(0f32) + }) + } + + pub(crate) fn freq_reading_option(&self) -> FreqReadingOption { + self.freq_reading_option + } + // Resets the block segment postings on another position // in the postings file. // @@ -125,7 +174,8 @@ impl BlockSegmentPostings { self.doc_freq = doc_freq; } - /// Returns the document frequency associated to this block postings. + /// Returns the overall number of documents in the block postings. + /// It does not take in account whether documents are deleted or not. /// /// This `doc_freq` is simply the sum of the length of all of the blocks /// length, and it does not take in account deleted documents. @@ -139,29 +189,41 @@ impl BlockSegmentPostings { /// returned by `.docs()` is empty. #[inline] pub fn docs(&self) -> &[DocId] { + debug_assert!(self.block_is_loaded()); self.doc_decoder.output_array() } + /// Returns a full block, regardless of whetehr the block is complete or incomplete ( + /// as it happens for the last block of the posting list). + /// + /// In the latter case, the block is guaranteed to be padded with the sentinel value: + /// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits. + /// + /// This method is useful to run SSE2 linear search. #[inline(always)] pub(crate) fn docs_aligned(&self) -> &AlignedBuffer { + debug_assert!(self.block_is_loaded()); self.doc_decoder.output_aligned() } /// Return the document at index `idx` of the block. #[inline(always)] pub fn doc(&self, idx: usize) -> u32 { + debug_assert!(self.block_is_loaded()); self.doc_decoder.output(idx) } /// Return the array of `term freq` in the block. #[inline] pub fn freqs(&self) -> &[u32] { + debug_assert!(self.block_is_loaded()); self.freq_decoder.output_array() } /// Return the frequency at index `idx` of the block. #[inline] pub fn freq(&self, idx: usize) -> u32 { + debug_assert!(self.block_is_loaded()); self.freq_decoder.output(idx) } @@ -172,6 +234,7 @@ impl BlockSegmentPostings { /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1` #[inline] pub fn block_len(&self) -> usize { + debug_assert!(self.block_is_loaded()); self.doc_decoder.output_len } @@ -179,16 +242,22 @@ impl BlockSegmentPostings { self.skip_reader.position_offset() } - /// Position on a block that may contains `target_doc`. + /// Dangerous API! This calls seek on the skip list, + /// but does not `.load_block()` afterwards. /// + /// `.load_block()` needs to be called manually afterwards. /// If all docs are smaller than target, the block loaded may be empty, /// or be the last an incomplete VInt block. - pub fn seek(&mut self, target_doc: DocId) { + pub(crate) fn shallow_seek(&mut self, target_doc: DocId) { self.skip_reader.seek(target_doc); - self.load_block(); } - fn load_block(&mut self) { + pub(crate) fn block_is_loaded(&self) -> bool { + self.loaded_offset == self.skip_reader.byte_offset() + } + + pub(crate) fn load_block(&mut self) { + self.block_max_score_cache = None; let offset = self.skip_reader.byte_offset(); if self.loaded_offset == offset { return; @@ -213,7 +282,7 @@ impl BlockSegmentPostings { tf_num_bits, ); } - BlockInfo::VInt(num_vint_docs) => { + BlockInfo::VInt { num_docs, .. } => { decode_vint_block( &mut self.doc_decoder, if let FreqReadingOption::ReadFreq = self.freq_reading_option { @@ -223,7 +292,7 @@ impl BlockSegmentPostings { }, &self.data.as_slice()[offset..], self.skip_reader.last_doc_in_previous_block, - num_vint_docs as usize, + num_docs as usize, ); } } @@ -234,6 +303,8 @@ impl BlockSegmentPostings { /// Returns false iff there was no remaining blocks. pub fn advance(&mut self) -> bool { if !self.skip_reader.advance() { + self.doc_decoder.clear(); + self.freq_decoder.clear(); return false; } self.load_block(); @@ -244,9 +315,10 @@ impl BlockSegmentPostings { pub fn empty() -> BlockSegmentPostings { BlockSegmentPostings { doc_decoder: BlockDecoder::with_val(TERMINATED), - loaded_offset: std::usize::MAX, + loaded_offset: 0, freq_decoder: BlockDecoder::with_val(1), freq_reading_option: FreqReadingOption::NoFreq, + block_max_score_cache: None, doc_freq: 0, data: ReadOnlySource::new(vec![]), skip_reader: SkipReader::new(ReadOnlySource::new(vec![]), 0, IndexRecordOption::Basic), @@ -272,8 +344,10 @@ mod tests { #[test] fn test_empty_segment_postings() { let mut postings = SegmentPostings::empty(); + assert_eq!(postings.doc(), TERMINATED); assert_eq!(postings.advance(), TERMINATED); assert_eq!(postings.advance(), TERMINATED); + assert_eq!(postings.doc_freq(), 0); assert_eq!(postings.len(), 0); } @@ -372,19 +446,21 @@ mod tests { } #[test] - fn test_block_segment_postings_skip2() { + fn test_block_segment_postings_seek2() { let mut docs = vec![0]; for i in 0..1300 { docs.push((i * i / 100) + i); } let mut block_postings = build_block_postings(&docs[..]); for i in vec![0, 424, 10000] { - block_postings.seek(i); + block_postings.shallow_seek(i); + block_postings.load_block(); let docs = block_postings.docs(); assert!(docs[0] <= i); assert!(docs.last().cloned().unwrap_or(0u32) >= i); } - block_postings.seek(100_000); + block_postings.shallow_seek(100_000); + block_postings.load_block(); assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED); } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 463d6ce0d..5d99594b6 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -73,8 +73,10 @@ pub mod tests { let mut segment = index.new_segment(); let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap(); { - let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4, None).unwrap(); - field_serializer.new_term("abc".as_bytes()).unwrap(); + let mut field_serializer = posting_serializer + .new_field(text_field, 120 * 4, None) + .unwrap(); + field_serializer.new_term("abc".as_bytes(), 12u32).unwrap(); for doc_id in 0u32..120u32 { let delta_positions = vec![1, 2, 3, 2]; field_serializer diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 10433c0f7..65072eecf 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -1,5 +1,6 @@ use super::stacker::{Addr, MemoryArena, TermHashMap}; +use crate::fieldnorm::FieldNormReaders; use crate::postings::recorder::{ BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder, }; @@ -16,7 +17,6 @@ use std::collections::HashMap; use std::io; use std::marker::PhantomData; use std::ops::DerefMut; -use crate::fieldnorm::FieldNormReaders; fn posting_from_field_entry(field_entry: &FieldEntry) -> Box { match *field_entry.field_type() { @@ -129,7 +129,7 @@ impl MultiFieldPostingsWriter { pub fn serialize( &self, serializer: &mut InvertedIndexSerializer, - fieldnorm_readers: FieldNormReaders + fieldnorm_readers: FieldNormReaders, ) -> crate::Result>> { let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index.iter().collect(); @@ -164,8 +164,11 @@ impl MultiFieldPostingsWriter { let postings_writer = &self.per_field_postings_writers[field.field_id() as usize]; let fieldnorm_reader = fieldnorm_readers.get_field(field); - let mut field_serializer = - serializer.new_field(field, postings_writer.total_num_tokens(), fieldnorm_reader)?; + let mut field_serializer = serializer.new_field( + field, + postings_writer.total_num_tokens(), + fieldnorm_reader, + )?; postings_writer.serialize( &term_offsets[start..stop], &mut field_serializer, @@ -300,7 +303,8 @@ impl PostingsWriter for SpecializedPostingsWriter let mut buffer_lender = BufferLender::default(); for &(term_bytes, addr, _) in term_addrs { let recorder: Rec = termdict_heap.read(addr); - serializer.new_term(&term_bytes[4..])?; + let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32); + serializer.new_term(&term_bytes[4..], term_doc_freq)?; recorder.serialize(&mut buffer_lender, serializer, heap)?; serializer.close_term()?; } diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 0a1dd9217..72ab12f22 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -75,6 +75,8 @@ pub(crate) trait Recorder: Copy + 'static { serializer: &mut FieldSerializer<'_>, heap: &MemoryArena, ) -> io::Result<()>; + /// Returns the number of document containg this term. + fn term_doc_freq(&self) -> Option; } /// Only records the doc ids @@ -113,11 +115,16 @@ impl Recorder for NothingRecorder { ) -> io::Result<()> { let buffer = buffer_lender.lend_u8(); self.stack.read_to_end(heap, buffer); + // TODO avoid reading twice. for doc in VInt32Reader::new(&buffer[..]) { serializer.write_doc(doc as u32, 0u32, &[][..])?; } Ok(()) } + + fn term_doc_freq(&self) -> Option { + None + } } /// Recorder encoding document ids, and term frequencies @@ -126,6 +133,7 @@ pub struct TermFrequencyRecorder { stack: ExpUnrolledLinkedList, current_doc: DocId, current_tf: u32, + term_doc_freq: u32, } impl Recorder for TermFrequencyRecorder { @@ -134,6 +142,7 @@ impl Recorder for TermFrequencyRecorder { stack: ExpUnrolledLinkedList::new(), current_doc: u32::max_value(), current_tf: 0u32, + term_doc_freq: 0u32, } } @@ -142,6 +151,7 @@ impl Recorder for TermFrequencyRecorder { } fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) { + self.term_doc_freq += 1; self.current_doc = doc; let _ = write_u32_vint(doc, &mut self.stack.writer(heap)); } @@ -172,6 +182,10 @@ impl Recorder for TermFrequencyRecorder { Ok(()) } + + fn term_doc_freq(&self) -> Option { + Some(self.term_doc_freq) + } } /// Recorder encoding term frequencies as well as positions. @@ -179,12 +193,14 @@ impl Recorder for TermFrequencyRecorder { pub struct TFAndPositionRecorder { stack: ExpUnrolledLinkedList, current_doc: DocId, + term_doc_freq: u32, } impl Recorder for TFAndPositionRecorder { fn new() -> Self { TFAndPositionRecorder { stack: ExpUnrolledLinkedList::new(), current_doc: u32::max_value(), + term_doc_freq: 0u32, } } @@ -194,6 +210,7 @@ impl Recorder for TFAndPositionRecorder { fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) { self.current_doc = doc; + self.term_doc_freq += 1u32; let _ = write_u32_vint(doc, &mut self.stack.writer(heap)); } @@ -233,6 +250,10 @@ impl Recorder for TFAndPositionRecorder { } Ok(()) } + + fn term_doc_freq(&self) -> Option { + Some(self.term_doc_freq) + } } #[cfg(test)] diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index a7268fe44..2c587a661 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -10,9 +10,10 @@ use crate::postings::BlockSearcher; use crate::postings::Postings; use crate::schema::IndexRecordOption; -use crate::DocId; +use crate::{DocId, TERMINATED}; use crate::directory::ReadOnlySource; +use crate::fieldnorm::FieldNormReader; use crate::postings::BlockSegmentPostings; /// `SegmentPostings` represents the inverted list or postings associated to @@ -38,6 +39,8 @@ impl SegmentPostings { } } + /// Returns the overall number of documents in the block postings. + /// It does not take in account whether documents are deleted or not. pub fn doc_freq(&self) -> u32 { self.block_cursor.doc_freq() } @@ -54,6 +57,7 @@ impl SegmentPostings { let mut buffer = Vec::new(); { let mut postings_serializer = PostingsSerializer::new(&mut buffer, false, false, None); + postings_serializer.new_term(docs.len() as u32); for &doc in docs { postings_serializer.write_doc(doc, 1u32); } @@ -70,6 +74,29 @@ impl SegmentPostings { SegmentPostings::from_block_postings(block_segment_postings, None) } + /// Helper functions to create `SegmentPostings` for tests. + pub fn create_from_docs_and_tfs( + doc_and_tfs: &[(u32, u32)], + fieldnorm_reader: Option, + ) -> crate::Result { + let mut buffer = Vec::new(); + let mut postings_serializer = + PostingsSerializer::new(&mut buffer, true, false, fieldnorm_reader); + postings_serializer.new_term(doc_and_tfs.len() as u32); + for &(doc, tf) in doc_and_tfs { + postings_serializer.write_doc(doc, tf); + } + postings_serializer + .close_term(doc_and_tfs.len() as u32)?; + let block_segment_postings = BlockSegmentPostings::from_data( + doc_and_tfs.len() as u32, + ReadOnlySource::from(buffer), + IndexRecordOption::WithFreqs, + IndexRecordOption::WithFreqs, + ); + Ok(SegmentPostings::from_block_postings(block_segment_postings, None)) + } + /// Reads a Segment postings from an &[u8] /// /// * `len` - number of document in the posting lists. @@ -87,27 +114,10 @@ impl SegmentPostings { block_searcher: BlockSearcher::default(), } } -} -impl DocSet for SegmentPostings { - // goes to the next element. - // next needs to be called a first time to point to the correct element. - #[inline] - fn advance(&mut self) -> DocId { - if self.cur == COMPRESSION_BLOCK_SIZE - 1 { - self.cur = 0; - self.block_cursor.advance(); - } else { - self.cur += 1; - } - self.doc() - } - fn seek(&mut self, target: DocId) -> DocId { - if self.doc() == target { - return target; - } - self.block_cursor.seek(target); + pub(crate) fn seek_after_shallow(&mut self, target: DocId) -> DocId { + self.block_cursor.load_block(); // At this point we are on the block, that might contain our document. let output = self.block_cursor.docs_aligned(); @@ -129,6 +139,32 @@ impl DocSet for SegmentPostings { debug_assert!(doc >= target); doc } +} + +impl DocSet for SegmentPostings { + // goes to the next element. + // next needs to be called a first time to point to the correct element. + #[inline] + fn advance(&mut self) -> DocId { + assert!(self.block_cursor.block_is_loaded()); + if self.cur == COMPRESSION_BLOCK_SIZE - 1 { + self.cur = 0; + if !self.block_cursor.advance() { + return TERMINATED; + } + } else { + self.cur += 1; + } + self.doc() + } + + fn seek(&mut self, target_doc: DocId) -> DocId { + if self.doc() == target_doc { + return target_doc; + } + self.block_cursor.shallow_seek(target_doc); + self.seek_after_shallow(target_doc) + } /// Return the current document's `DocId`. #[inline] diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 13f7c1827..2acf4bd90 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -3,15 +3,17 @@ use crate::common::{BinarySerializable, VInt}; use crate::common::{CompositeWrite, CountingWriter}; use crate::core::Segment; use crate::directory::WritePtr; +use crate::fieldnorm::FieldNormReader; use crate::positions::PositionSerializer; use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE}; use crate::postings::skip::SkipSerializer; +use crate::query::BM25Weight; use crate::schema::Schema; use crate::schema::{Field, FieldEntry, FieldType}; use crate::termdict::{TermDictionaryBuilder, TermOrdinal}; use crate::DocId; +use std::cmp::Ordering; use std::io::{self, Write}; -use crate::fieldnorm::FieldNormReader; /// `InvertedIndexSerializer` is in charge of serializing /// postings on disk, in the @@ -48,7 +50,7 @@ pub struct InvertedIndexSerializer { terms_write: CompositeWrite, postings_write: CompositeWrite, positions_write: CompositeWrite, - positions_idx_write: CompositeWrite, + positionsidx_write: CompositeWrite, schema: Schema, } @@ -58,14 +60,14 @@ impl InvertedIndexSerializer { terms_write: CompositeWrite, postings_write: CompositeWrite, positions_write: CompositeWrite, - positions_idx_write: CompositeWrite, + positionsidx_write: CompositeWrite, schema: Schema, ) -> crate::Result { Ok(InvertedIndexSerializer { terms_write, postings_write, positions_write, - positions_idx_write, + positionsidx_write, schema, }) } @@ -90,22 +92,22 @@ impl InvertedIndexSerializer { &mut self, field: Field, total_num_tokens: u64, - fieldnorm_reader: Option + fieldnorm_reader: Option, ) -> io::Result> { let field_entry: &FieldEntry = self.schema.get_field_entry(field); let term_dictionary_write = self.terms_write.for_field(field); let postings_write = self.postings_write.for_field(field); total_num_tokens.serialize(postings_write)?; let positions_write = self.positions_write.for_field(field); - let positions_idx_write = self.positions_idx_write.for_field(field); + let positionsidx_write = self.positionsidx_write.for_field(field); let field_type: FieldType = (*field_entry.field_type()).clone(); FieldSerializer::create( &field_type, term_dictionary_write, postings_write, positions_write, - positions_idx_write, - fieldnorm_reader + positionsidx_write, + fieldnorm_reader, ) } @@ -114,7 +116,7 @@ impl InvertedIndexSerializer { self.terms_write.close()?; self.postings_write.close()?; self.positions_write.close()?; - self.positions_idx_write.close()?; + self.positionsidx_write.close()?; Ok(()) } } @@ -137,7 +139,7 @@ impl<'a> FieldSerializer<'a> { postings_write: &'a mut CountingWriter, positions_write: &'a mut CountingWriter, positionsidx_write: &'a mut CountingWriter, - fieldnorm_reader: Option + fieldnorm_reader: Option, ) -> io::Result> { let (term_freq_enabled, position_enabled): (bool, bool) = match field_type { FieldType::Str(ref text_options) => { @@ -151,8 +153,12 @@ impl<'a> FieldSerializer<'a> { _ => (false, false), }; let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?; - let postings_serializer = - PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled, fieldnorm_reader); + let postings_serializer = PostingsSerializer::new( + postings_write, + term_freq_enabled, + position_enabled, + fieldnorm_reader, + ); let positions_serializer_opt = if position_enabled { Some(PositionSerializer::new(positions_write, positionsidx_write)) } else { @@ -165,7 +171,7 @@ impl<'a> FieldSerializer<'a> { positions_serializer_opt, current_term_info: TermInfo::default(), term_open: false, - num_terms: TermOrdinal::default() + num_terms: TermOrdinal::default(), }) } @@ -185,8 +191,8 @@ impl<'a> FieldSerializer<'a> { /// Starts the postings for a new term. /// * term - the term. It needs to come after the previous term according /// to the lexicographical order. - /// * doc_freq - return the number of document containing the term. - pub fn new_term(&mut self, term: &[u8]) -> io::Result { + /// * term_doc_freq - return the number of document containing the term. + pub fn new_term(&mut self, term: &[u8], term_doc_freq: u32) -> io::Result { assert!( !self.term_open, "Called new_term, while the previous term was not closed." @@ -197,6 +203,7 @@ impl<'a> FieldSerializer<'a> { self.term_dictionary_builder.insert_key(term)?; let term_ordinal = self.num_terms; self.num_terms += 1; + self.postings_serializer.new_term(term_doc_freq); Ok(term_ordinal) } @@ -310,8 +317,21 @@ pub struct PostingsSerializer { termfreq_enabled: bool, termfreq_sum_enabled: bool, - fieldnorm_reader: Option, + + bm25_weight: Option, + + num_docs: u32, // Number of docs in the segment + avg_fieldnorm: f32, // Average number of term in the field for that segment. + // this value is used to compute the block wand information. +} + +fn get_avg_fieldnorm(fieldnorm_reader: &FieldNormReader) -> f32 { + let num_docs = fieldnorm_reader.num_docs(); + let sum_fieldnorm: f32 = (0u32..num_docs) + .map(|doc| fieldnorm_reader.fieldnorm(doc) as f32) + .sum(); + sum_fieldnorm / (num_docs as f32) } impl PostingsSerializer { @@ -319,8 +339,16 @@ impl PostingsSerializer { write: W, termfreq_enabled: bool, termfreq_sum_enabled: bool, - fieldnorm_reader: Option + fieldnorm_reader: Option, ) -> PostingsSerializer { + let avg_fieldnorm: f32 = fieldnorm_reader + .as_ref() + .map(get_avg_fieldnorm) + .unwrap_or(0f32); + let num_docs = fieldnorm_reader + .as_ref() + .map(|fieldnorm_reader| fieldnorm_reader.num_docs()) + .unwrap_or(0u32); PostingsSerializer { output_write: CountingWriter::wrap(write), @@ -335,6 +363,21 @@ impl PostingsSerializer { termfreq_sum_enabled, fieldnorm_reader, + bm25_weight: None, + + num_docs, + avg_fieldnorm, + } + } + + pub fn new_term(&mut self, term_doc_freq: u32) { + if self.termfreq_enabled && self.num_docs > 0 { + let bm25_weight = BM25Weight::for_one_term( + term_doc_freq as u64, + self.num_docs as u64, + self.avg_fieldnorm, + ); + self.bm25_weight = Some(bm25_weight); } } @@ -351,7 +394,6 @@ impl PostingsSerializer { self.postings_write.extend(block_encoded); } if self.termfreq_enabled { - // encode the term_freqs let (num_bits, block_encoded): (u8, &[u8]) = self .block_encoder .compress_block_unsorted(&self.block.term_freqs()); @@ -361,10 +403,32 @@ impl PostingsSerializer { let sum_freq = self.block.term_freqs().iter().cloned().sum(); self.skip_write.write_total_term_freq(sum_freq); } - if let Some(fieldnorm_reader) = &self.fieldnorm_reader { - let docs = self.block.doc_ids; - let tfs = self.block.term_freqs; + let mut blockwand_params_opt = None; + if let Some(bm25_weight) = self.bm25_weight.as_ref() { + if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() { + let docs = self.block.doc_ids(); + let term_freqs = self.block.term_freqs(); + blockwand_params_opt = docs + .iter() + .cloned() + .map(|doc| fieldnorm_reader.fieldnorm_id(doc)) + .zip(term_freqs.iter().cloned()) + .max_by( + |(left_fieldnorm_id, left_term_freq), + (right_fieldnorm_id, right_term_freq)| { + let left_score = + bm25_weight.tf_factor(*left_fieldnorm_id, *left_term_freq); + let right_score = + bm25_weight.tf_factor(*right_fieldnorm_id, *right_term_freq); + left_score + .partial_cmp(&right_score) + .unwrap_or(Ordering::Equal) + }, + ); + } } + let (fieldnorm_id, term_freq) = blockwand_params_opt.unwrap_or((0u8, 0u32)); + self.skip_write.write_blockwand_max(fieldnorm_id, term_freq); } self.block.clear(); } @@ -413,6 +477,7 @@ impl PostingsSerializer { } self.skip_write.clear(); self.postings_write.clear(); + self.bm25_weight = None; Ok(()) } @@ -425,4 +490,3 @@ impl PostingsSerializer { self.last_doc_id_encoded = 0; } } - diff --git a/src/postings/skip.rs b/src/postings/skip.rs index 46097777c..3c1b8c4ab 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -1,6 +1,7 @@ -use crate::common::BinarySerializable; +use crate::common::{BinarySerializable, VInt}; use crate::directory::ReadOnlySource; use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE}; +use crate::query::BM25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; use owned_read::OwnedRead; @@ -40,6 +41,11 @@ impl SkipSerializer { .expect("Should never fail"); } + pub fn write_blockwand_max(&mut self, fieldnorm_id: u8, term_freq: u32) { + self.buffer.push(fieldnorm_id); + VInt(term_freq as u64).serialize_into_vec(&mut self.buffer); + } + pub fn data(&self) -> &[u8] { &self.buffer[..] } @@ -63,25 +69,29 @@ pub(crate) struct SkipReader { position_offset: u64, } -#[derive(Clone, Eq, PartialEq, Copy, Debug)] +#[derive(Clone, Copy, Debug)] pub(crate) enum BlockInfo { BitPacked { doc_num_bits: u8, tf_num_bits: u8, tf_sum: u32, + block_wand_fieldnorm_id: u8, + block_wand_term_freq: u32, + }, + VInt { + num_docs: u32, }, - VInt(u32), } impl Default for BlockInfo { fn default() -> Self { - BlockInfo::VInt(0) + BlockInfo::VInt { num_docs: 0u32 } } } impl SkipReader { pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader { - SkipReader { + let mut skip_reader = SkipReader { last_doc_in_block: 0u32, last_doc_in_previous_block: 0u32, owned_read: OwnedRead::new(data), @@ -90,7 +100,9 @@ impl SkipReader { byte_offset: 0, remaining_docs: doc_freq, position_offset: 0u64, - } + }; + skip_reader.advance(); + skip_reader } pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) { @@ -102,8 +114,15 @@ impl SkipReader { self.remaining_docs = doc_freq; } - pub fn block_max_score(&self) -> Score { - f32::MAX + pub fn block_max_score(&self, bm25_weight: &BM25Weight) -> Option { + match self.block_info { + BlockInfo::BitPacked { + block_wand_fieldnorm_id, + block_wand_term_freq, + .. + } => Some(bm25_weight.score(block_wand_fieldnorm_id, block_wand_term_freq)), + BlockInfo::VInt { .. } => None, + } } pub(crate) fn last_doc_in_block(&self) -> DocId { @@ -129,25 +148,38 @@ impl SkipReader { doc_num_bits, tf_num_bits: 0, tf_sum: 0, + block_wand_fieldnorm_id: 0, + block_wand_term_freq: 0, }; } IndexRecordOption::WithFreqs => { let tf_num_bits = self.owned_read.get(1); + let block_wand_fieldnorm_id = self.owned_read.get(2); + self.owned_read.advance(3); + let block_wand_term_freq = + VInt::deserialize_u64(&mut self.owned_read).unwrap() as u32; self.block_info = BlockInfo::BitPacked { doc_num_bits, tf_num_bits, tf_sum: 0, + block_wand_fieldnorm_id, + block_wand_term_freq, }; - self.owned_read.advance(2); } IndexRecordOption::WithFreqsAndPositions => { let tf_num_bits = self.owned_read.get(1); self.owned_read.advance(2); let tf_sum = u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum"); + let block_wand_fieldnorm_id = self.owned_read.get(0); + self.owned_read.advance(1); + let block_wand_term_freq = + VInt::deserialize_u64(&mut self.owned_read).unwrap() as u32; self.block_info = BlockInfo::BitPacked { doc_num_bits, tf_num_bits, tf_sum, + block_wand_fieldnorm_id, + block_wand_term_freq, }; } } @@ -173,13 +205,14 @@ impl SkipReader { doc_num_bits, tf_num_bits, tf_sum, + .. } => { self.remaining_docs -= COMPRESSION_BLOCK_SIZE as u32; self.byte_offset += compressed_block_size(doc_num_bits + tf_num_bits); self.position_offset += tf_sum as u64; } - BlockInfo::VInt(num_vint_docs) => { - self.remaining_docs -= num_vint_docs; + BlockInfo::VInt { num_docs, .. } => { + self.remaining_docs -= num_docs; } } self.last_doc_in_previous_block = self.last_doc_in_block; @@ -188,7 +221,9 @@ impl SkipReader { true } else { self.last_doc_in_block = TERMINATED; - self.block_info = BlockInfo::VInt(self.remaining_docs); + self.block_info = BlockInfo::VInt { + num_docs: self.remaining_docs, + }; self.remaining_docs > 0 } } @@ -209,8 +244,10 @@ mod tests { let mut skip_serializer = SkipSerializer::new(); skip_serializer.write_doc(1u32, 2u8); skip_serializer.write_term_freq(3u8); + skip_serializer.write_blockwand_max(13u8, 3u32); skip_serializer.write_doc(5u32, 5u8); skip_serializer.write_term_freq(2u8); + skip_serializer.write_blockwand_max(8u8, 2u32); skip_serializer.data().to_owned() }; let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32; @@ -219,28 +256,34 @@ mod tests { doc_freq, IndexRecordOption::WithFreqs, ); - assert!(skip_reader.advance()); assert_eq!(skip_reader.last_doc_in_block(), 1u32); - assert_eq!( - skip_reader.block_info(), + assert!(matches!( + skip_reader.block_info, BlockInfo::BitPacked { doc_num_bits: 2u8, tf_num_bits: 3u8, - tf_sum: 0 + tf_sum: 0, + block_wand_fieldnorm_id: 13, + block_wand_term_freq: 3 } - ); + )); assert!(skip_reader.advance()); assert_eq!(skip_reader.last_doc_in_block(), 5u32); - assert_eq!( + assert!(matches!( skip_reader.block_info(), BlockInfo::BitPacked { doc_num_bits: 5u8, tf_num_bits: 2u8, - tf_sum: 0 + tf_sum: 0, + block_wand_fieldnorm_id: 8, + block_wand_term_freq: 2 } - ); + )); assert!(skip_reader.advance()); - assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32)); + assert!(matches!( + skip_reader.block_info(), + BlockInfo::VInt { num_docs: 3u32 } + )); assert!(!skip_reader.advance()); } @@ -258,28 +301,34 @@ mod tests { doc_freq, IndexRecordOption::Basic, ); - assert!(skip_reader.advance()); assert_eq!(skip_reader.last_doc_in_block(), 1u32); - assert_eq!( + assert!(matches!( skip_reader.block_info(), BlockInfo::BitPacked { doc_num_bits: 2u8, tf_num_bits: 0, - tf_sum: 0u32 + tf_sum: 0u32, + block_wand_fieldnorm_id: 0, + block_wand_term_freq: 0 } - ); + )); assert!(skip_reader.advance()); assert_eq!(skip_reader.last_doc_in_block(), 5u32); - assert_eq!( + assert!(matches!( skip_reader.block_info(), BlockInfo::BitPacked { doc_num_bits: 5u8, tf_num_bits: 0, - tf_sum: 0u32 + tf_sum: 0u32, + block_wand_fieldnorm_id: 0, + block_wand_term_freq: 0 } - ); + )); assert!(skip_reader.advance()); - assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32)); + assert!(matches!( + skip_reader.block_info(), + BlockInfo::VInt { num_docs: 3u32 } + )); assert!(!skip_reader.advance()); } @@ -296,16 +345,17 @@ mod tests { doc_freq, IndexRecordOption::Basic, ); - assert!(skip_reader.advance()); assert_eq!(skip_reader.last_doc_in_block(), 1u32); - assert_eq!( + assert!(matches!( skip_reader.block_info(), BlockInfo::BitPacked { doc_num_bits: 2u8, tf_num_bits: 0, - tf_sum: 0u32 + tf_sum: 0u32, + block_wand_fieldnorm_id: 0, + block_wand_term_freq: 0 } - ); + )); assert!(!skip_reader.advance()); } } diff --git a/src/query/bm25.rs b/src/query/bm25.rs index b06aefc46..4f1c246b4 100644 --- a/src/query/bm25.rs +++ b/src/query/bm25.rs @@ -3,26 +3,27 @@ use crate::query::Explanation; use crate::Score; use crate::Searcher; use crate::Term; -use serde::Serialize; use serde::Deserialize; +use serde::Serialize; -const DEFAULT_K1: f32 = 1.2; -const DEFAULT_B: f32 = 0.75; +const K1: f32 = 1.2; +const B: f32 = 0.75; fn idf(doc_freq: u64, doc_count: u64) -> f32 { + assert!(doc_count >= doc_freq, "{} >= {}", doc_count, doc_freq); let x = ((doc_count - doc_freq) as f32 + 0.5) / (doc_freq as f32 + 0.5); (1f32 + x).ln() } -fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32, K1: f32, B: f32) -> f32 { +fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32) -> f32 { K1 * (1f32 - B + B * fieldnorm as f32 / average_fieldnorm) } -fn compute_tf_cache(average_fieldnorm: f32, K1: f32, B: f32) -> [f32; 256] { +fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] { let mut cache = [0f32; 256]; for (fieldnorm_id, cache_mut) in cache.iter_mut().enumerate() { let fieldnorm = FieldNormReader::id_to_fieldnorm(fieldnorm_id as u8); - *cache_mut = cached_tf_component(fieldnorm, average_fieldnorm, K1, B); + *cache_mut = cached_tf_component(fieldnorm, average_fieldnorm); } cache } @@ -31,8 +32,6 @@ fn compute_tf_cache(average_fieldnorm: f32, K1: f32, B: f32) -> [f32; 256] { pub struct BM25Params { pub idf: f32, pub avg_fieldnorm: f32, - pub K1: f32, - pub B: f32, } pub struct BM25Weight { @@ -40,15 +39,6 @@ pub struct BM25Weight { weight: f32, cache: [f32; 256], average_fieldnorm: f32, - K1: f32, - B: f32, -} - -impl From for BM25Weight { - fn from(bm25_params: BM25Params) -> Self { - let idf_explain = Explanation::new("no-explanation", bm25_params.idf); - BM25Weight::new(idf_explain, bm25_params.avg_fieldnorm, bm25_params.K1, bm25_params.B) - } } impl BM25Weight { @@ -58,8 +48,6 @@ impl BM25Weight { weight: self.weight * boost, cache: self.cache, average_fieldnorm: self.average_fieldnorm, - K1: self.K1, - B: self.B } } @@ -95,7 +83,7 @@ impl BM25Weight { }) .sum::(); let idf_explain = Explanation::new("idf", idf); - BM25Weight::new(idf_explain, average_fieldnorm, DEFAULT_K1, DEFAULT_B) + BM25Weight::new(idf_explain, average_fieldnorm) } } @@ -108,32 +96,38 @@ impl BM25Weight { term_doc_freq as f32, ); idf_explain.add_const("N, total number of docs", total_num_docs as f32); - BM25Weight::new(idf_explain, avg_fieldnorm, DEFAULT_K1, DEFAULT_B) + BM25Weight::new(idf_explain, avg_fieldnorm) } - fn new(idf_explain: Explanation, average_fieldnorm: f32, K1: f32, B: f32) -> BM25Weight { + fn new(idf_explain: Explanation, average_fieldnorm: f32) -> BM25Weight { let weight = idf_explain.value() * (1f32 + K1); BM25Weight { idf_explain, weight, - cache: compute_tf_cache(average_fieldnorm, K1, B), + cache: compute_tf_cache(average_fieldnorm), average_fieldnorm, - K1, - B } } #[inline(always)] pub fn score(&self, fieldnorm_id: u8, term_freq: u32) -> Score { - let norm = self.cache[fieldnorm_id as usize]; + self.weight * self.tf_factor(fieldnorm_id, term_freq) + } + + pub fn max_score(&self) -> Score { + self.score(255u8, 2_013_265_944) + } + + #[inline(always)] + pub(crate) fn tf_factor(&self, fieldnorm_id: u8, term_freq: u32) -> f32 { let term_freq = term_freq as f32; - self.weight * term_freq / (term_freq + norm) + let norm = self.cache[fieldnorm_id as usize]; + term_freq / (term_freq + norm) } pub fn explain(&self, fieldnorm_id: u8, term_freq: u32) -> Explanation { // The explain format is directly copied from Lucene's. // (So, Kudos to Lucene) - let score = self.score(fieldnorm_id, term_freq); let norm = self.cache[fieldnorm_id as usize]; @@ -146,8 +140,8 @@ impl BM25Weight { ); tf_explanation.add_const("freq, occurrences of term within document", term_freq); - tf_explanation.add_const("k1, term saturation parameter", self.K1); - tf_explanation.add_const("b, length normalization parameter", self.B); + tf_explanation.add_const("k1, term saturation parameter", K1); + tf_explanation.add_const("b, length normalization parameter", B); tf_explanation.add_const( "dl, length of field", FieldNormReader::id_to_fieldnorm(fieldnorm_id) as f32, @@ -155,7 +149,7 @@ impl BM25Weight { tf_explanation.add_const("avgdl, average length of field", self.average_fieldnorm); let mut explanation = Explanation::new("TermQuery, product of...", score); - explanation.add_detail(Explanation::new("(K1+1)", self.K1 + 1f32)); + explanation.add_detail(Explanation::new("(K1+1)", K1 + 1f32)); explanation.add_detail(self.idf_explain.clone()); explanation.add_detail(tf_explanation); explanation @@ -166,10 +160,10 @@ impl BM25Weight { mod tests { use super::idf; - use crate::tests::assert_nearly_equals; + use crate::assert_nearly_equals; #[test] fn test_idf() { - assert_nearly_equals(idf(1, 2), std::f32::consts::LN_2); + assert_nearly_equals!(idf(1, 2), std::f32::consts::LN_2); } } diff --git a/src/query/boolean_query/block_wand.rs b/src/query/boolean_query/block_wand.rs index 9df5c7bf7..29db81344 100644 --- a/src/query/boolean_query/block_wand.rs +++ b/src/query/boolean_query/block_wand.rs @@ -1,62 +1,164 @@ use crate::query::term_query::TermScorer; use crate::query::Scorer; use crate::{DocId, DocSet, Score, TERMINATED}; +use std::ops::DerefMut; +use std::ops::Deref; -/// Returns the lowest document that has a chance of exceeding the -/// threshold score. +/// Takes a term_scorers sorted by their current doc() and a threshold and returns +/// Returns (pivot_len, pivot_ord) defined as follows: +/// - `pivot_doc` lowest document that has a chance of exceeding (>) the threshold score. +/// - `before_pivot_len` number of term_scorers such that term_scorer.doc() < pivot. +/// - `pivot_len` number of term_scorers such that term_scorer.doc() <= pivot. /// -/// term_scorers are assumed sorted by .doc(). -fn find_pivot_doc(term_scorers: &[TermScorer], threshold: f32) -> Option { +/// We always have `before_pivot_len` < `pivot_len`. +/// +/// None is returned if we establish that no document can exceed the threshold. +fn find_pivot_doc(term_scorers: &[TermScorerWithMaxScore], threshold: f32) -> Option<(usize, usize, DocId)> { let mut max_score = 0.0f32; - for term_scorer in term_scorers { - max_score += term_scorer.max_score(); + let mut before_pivot_len = 0; + let mut pivot_doc = TERMINATED; + while before_pivot_len < term_scorers.len() { + let term_scorer = &term_scorers[before_pivot_len]; + max_score += term_scorer.max_score; if max_score > threshold { - return Some(term_scorer.doc()); - } - } - None -} - -fn shallow_advance(scorers: &mut Vec, pivot: DocId) -> Score { - let mut block_max_score_upperbound = 0.0f32; - for scorer in scorers { - if scorer.doc() > pivot { + pivot_doc = term_scorer.doc(); break; } - scorer.postings.block_cursor.seek(pivot); - block_max_score_upperbound += scorer.postings.block_cursor.skip_reader.block_max_score(); + before_pivot_len += 1; } - block_max_score_upperbound + if pivot_doc == TERMINATED { + return None; + } + // Right now i is an ordinal, we want a len. + let mut pivot_len = before_pivot_len + 1; + // Some other term_scorer may be positioned on the same document. + pivot_len += term_scorers[pivot_len..].iter() + .take_while(|term_scorer| term_scorer.doc() == pivot_doc) + .count(); + Some((before_pivot_len, pivot_len, pivot_doc)) } -fn compute_score(scorers: &mut Vec, doc: DocId) -> Score { +// Shallow advance all of the scorers that are positioned before or on pivot, +// and returns their number as well as the sum of their block_max_score. +// +// This gives us a tighter upperbound of DocId score, without decoding +// any blocks. +// +// After calling this method, some TermScorer might be in a state in which +// their block is not loaded. +fn shallow_advance(scorers: &mut Vec, pivot: DocId) -> Score { + scorers.iter_mut() + .map(|scorer| { + scorer.shallow_seek(pivot); + scorer.block_max_score() + }) + .sum() +} + +struct TermScorerWithMaxScore<'a> { + scorer: &'a mut TermScorer, + max_score: f32, +} + +impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> { + fn from(scorer: &'a mut TermScorer) -> Self { + let max_score = scorer.max_score(); + TermScorerWithMaxScore { + scorer, + max_score + } + } +} + +impl<'a> Deref for TermScorerWithMaxScore<'a> { + type Target = TermScorer; + + fn deref(&self) -> &Self::Target { + self.scorer + } +} + + +impl<'a> DerefMut for TermScorerWithMaxScore<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.scorer + } +} + +// Before and after calling this method, scorers need to be sorted by their `.doc()`. +fn block_max_was_too_low_advance_one_scorer(scorers: &mut Vec, pivot_len: usize) { + let mut scorer_to_seek = pivot_len - 1; + let mut doc_to_seek_after = scorers[scorer_to_seek].doc(); + for scorer_ord in (0..pivot_len - 1).rev() { + let scorer = &scorers[scorer_ord]; + if scorer.last_doc_in_block() <= doc_to_seek_after { + doc_to_seek_after = scorer.last_doc_in_block(); + scorer_to_seek = scorer_ord; + } + } + for scorer in &scorers[pivot_len..] { + if scorer.doc() <= doc_to_seek_after { + doc_to_seek_after = scorer.doc(); + } + } + scorers[scorer_to_seek].seek(doc_to_seek_after + 1); + restore_ordering(scorers, scorer_to_seek); +} + + +// Given a list of term_scorers and a `ord` and assuming that `term_scorers[ord]` is sorted +// except term_scorers[ord] that might be in advance compared to its ranks, +// bubble up term_scorers[ord] in order to restore the ordering. +fn restore_ordering(term_scorers: &mut Vec, ord: usize) { + let doc = term_scorers[ord].doc(); + for i in ord + 1..term_scorers.len() { + if term_scorers[i].doc() >= doc { + break; + } + term_scorers.swap(i, i - 1); + } +} + + +// Attempts to advance all term_scorers between `&term_scorers[0..before_len]` to the pivot. +// If this works, return true. +// If this fails (ie: one of the term_scorer does not contain `pivot_doc` and seek goes past the +// pivot), reorder the term_scorers to ensure the list is still sorted and returns `false`. +// If a term_scorer reach TERMINATED in the process return false remove the term_scorer and return. +fn align_scorers(term_scorers: &mut Vec, pivot_doc: DocId, before_pivot_len: usize) -> bool { + debug_assert_ne!(pivot_doc, TERMINATED); + for i in (0..before_pivot_len).rev() { + let new_doc = term_scorers[i].seek(pivot_doc); + if new_doc != pivot_doc { + if new_doc == TERMINATED { + term_scorers.swap_remove(i); + } + // We went past the pivot. + // We just go through the outer loop mechanic (Note that pivot is + // still a possible candidate). + // + // Termination is still guaranteed since we can only consider the same + // pivot at most term_scorers.len() - 1 times. + restore_ordering(term_scorers, i); + return false; + } + } + return true; +} + +// Assumes terms_scorers[..pivot_len] are positioned on the same doc (pivot_doc). +// Advance term_scorers[..pivot_len] and out of these removes the terminated scores. +// Restores the ordering of term_scorers. +fn advance_all_scorers_on_pivot(term_scorers: &mut Vec, pivot_len: usize) { let mut i = 0; - let mut score = 0.0f32; - while i < scorers.len() { - if scorers[i].doc() > doc { - break; - } - if scorers[i].seek(doc) == TERMINATED { - scorers.swap_remove(i); + for _ in 0..pivot_len { + if term_scorers[i].advance() == TERMINATED { + term_scorers.swap_remove(i); } else { - score += scorers[i].score(); i += 1; } } - score -} - -fn advance_all_scorers(scorers: &mut Vec, pivot: DocId) { - let mut i = 0; - while i < scorers.len() { - if scorers[i].doc() == pivot { - if scorers[i].advance() == TERMINATED { - scorers.swap_remove(i); - continue; - } - } - i += 1; - } + term_scorers.sort_by_key(|scorer| scorer.doc()); } pub fn block_wand( @@ -64,42 +166,56 @@ pub fn block_wand( mut threshold: f32, callback: &mut dyn FnMut(u32, Score) -> Score, ) { + let mut scorers: Vec = scorers.iter_mut().map(TermScorerWithMaxScore::from).collect(); + scorers.sort_by_key(|scorer| scorer.doc()); loop { - scorers.sort_by_key(|scorer| scorer.doc()); - let pivot_opt = find_pivot_doc(&scorers, threshold); - if let Some(pivot_doc) = pivot_opt { - let block_max_score_upperbound = shallow_advance(&mut scorers, pivot_doc); - // TODO bug: more than one scorer can point on the pivot. + // At this point we need to ensure that the scorers are sorted! + if let Some((before_pivot_len, pivot_len, pivot_doc)) = find_pivot_doc(&scorers[..], threshold) { + debug_assert_ne!(pivot_doc, TERMINATED); + debug_assert!(before_pivot_len < pivot_len); + let block_max_score_upperbound: Score = scorers[..pivot_len].iter_mut() + .map(|scorer| { + scorer.shallow_seek(pivot_doc); + scorer.block_max_score() + }) + .sum(); + + // Beware after shallow advance, skip readers can be in advance compared to + // the segment posting lists. + // + // `block_segment_postings.load_block()` need to be called separately. if block_max_score_upperbound <= threshold { - // TODO choose a better candidate. - if scorers[0].seek(pivot_doc + 1) == TERMINATED { - scorers.swap_remove(0); - } + // Block max condition was not reached. + // We could get away by simply advancing the scorers to DocId + 1 but it would + // be inefficient. The optimization requires proper explanation and was + // isolated in a different function. + block_max_was_too_low_advance_one_scorer(&mut scorers, pivot_len); continue; } - if scorers[0].doc() != pivot_doc { - // all scorers are not aligned on pivot_doc. - if let Some(scorer_ord) = scorers - .iter_mut() - .take_while(|scorer| scorer.doc() < pivot_doc) - .enumerate() - .min_by_key(|(_ord, scorer)| scorer.doc_freq()) - .map(|(ord, _scorer)| ord) - { - // TODOD FIX seek, right now the block will never get loaded. - if scorers[scorer_ord].seek(pivot_doc) == TERMINATED { - scorers.swap_remove(scorer_ord); - } - continue; - } + // Block max condition is observed. + // + // Let's try and advance all scorers before the pivot to the pivot. + if !align_scorers(&mut scorers, pivot_doc, before_pivot_len) { + // At least of the scorer does not contain the pivot. + // + // Let's stop scoring this pivot and go through the pivot selection again. + // Note that the current pivot is not necessarily a bad candidate and it + // may be picked again. + continue; } - // TODO no need to fully score? - let score = compute_score(&mut scorers, pivot_doc); + + // At this point, all scorers are positioned on the doc. + let score = scorers[..pivot_len] + .iter_mut() + .map(|scorer| scorer.score()) + .sum(); if score > threshold { threshold = callback(pivot_doc, score); } - advance_all_scorers(&mut scorers, pivot_doc); + // let's advance all of the scorers that are currently positioned on the pivot. + advance_all_scorers_on_pivot(&mut scorers, pivot_len); + } else { return; } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 7f75f3983..fed92c393 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -1,4 +1,5 @@ use crate::core::SegmentReader; +use crate::postings::FreqReadingOption; use crate::query::explanation::does_not_match; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner}; use crate::query::term_query::TermScorer; @@ -8,9 +9,9 @@ use crate::query::Exclude; use crate::query::Occur; use crate::query::RequiredOptionalScorer; use crate::query::Scorer; +use crate::query::Union; use crate::query::Weight; use crate::query::{intersect_scorers, Explanation}; -use crate::query::Union; use crate::{DocId, Score}; use std::collections::HashMap; @@ -35,20 +36,30 @@ where .into_iter() .map(|scorer| *(scorer.downcast::().map_err(|_| ()).unwrap())) .collect(); - return SpecializedScorer::TermUnion(scorers); + if scorers + .iter() + .all(|scorer| scorer.freq_reading_option() == FreqReadingOption::ReadFreq) + { + // Block wand is only available iff we read frequencies. + return SpecializedScorer::TermUnion(scorers); + } else { + return SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from( + scorers, + ))); + } } } SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(scorers))) } fn into_box_scorer(scorer: SpecializedScorer) -> Box { - match scorer { - SpecializedScorer::TermUnion(term_scorers) => { - let union_scorer = Union::::from(term_scorers); - Box::new(union_scorer) - }, - SpecializedScorer::Other(scorer) => scorer, + match scorer { + SpecializedScorer::TermUnion(term_scorers) => { + let union_scorer = Union::::from(term_scorers); + Box::new(union_scorer) } + SpecializedScorer::Other(scorer) => scorer, + } } pub struct BooleanWeight { @@ -94,36 +105,37 @@ impl BooleanWeight { let exclude_scorer_opt: Option> = per_occur_scorers .remove(&Occur::MustNot) .map(scorer_union::) - .map(|specialized_scorer| into_box_scorer::(specialized_scorer)); + .map(into_box_scorer::); let must_scorer_opt: Option> = per_occur_scorers .remove(&Occur::Must) .map(intersect_scorers); - let positive_scorer: SpecializedScorer = - match (should_scorer_opt, must_scorer_opt) { - (Some(should_scorer), Some(must_scorer)) => { - if self.scoring_enabled { - SpecializedScorer::Other(Box::new(RequiredOptionalScorer::< - Box, - Box, - TScoreCombiner, - >::new( - must_scorer, into_box_scorer::(should_scorer) - ))) - } else { - SpecializedScorer::Other(must_scorer) - } + let positive_scorer: SpecializedScorer = match (should_scorer_opt, must_scorer_opt) { + (Some(should_scorer), Some(must_scorer)) => { + if self.scoring_enabled { + SpecializedScorer::Other(Box::new(RequiredOptionalScorer::< + Box, + Box, + TScoreCombiner, + >::new( + must_scorer, + into_box_scorer::(should_scorer), + ))) + } else { + SpecializedScorer::Other(must_scorer) } - (None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer), - (Some(should_scorer), None) => should_scorer, - (None, None) => { - return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); - } - }; + } + (None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer), + (Some(should_scorer), None) => should_scorer, + (None, None) => { + return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); + } + }; if let Some(exclude_scorer) = exclude_scorer_opt { - let positive_scorer_boxed: Box = into_box_scorer::(positive_scorer); + let positive_scorer_boxed: Box = + into_box_scorer::(positive_scorer); Ok(SpecializedScorer::Other(Box::new(Exclude::new( positive_scorer_boxed, exclude_scorer, @@ -147,7 +159,9 @@ impl Weight for BooleanWeight { } } else if self.scoring_enabled { self.complex_scorer::(reader, boost) - .map(|specialized_scorer| into_box_scorer::(specialized_scorer)) + .map(|specialized_scorer| { + into_box_scorer::(specialized_scorer) + }) } else { self.complex_scorer::(reader, boost) .map(into_box_scorer::) @@ -182,7 +196,8 @@ impl Weight for BooleanWeight { let scorer = self.complex_scorer::(reader, 1.0f32)?; match scorer { SpecializedScorer::TermUnion(term_scorers) => { - let mut union_scorer = Union::::from(term_scorers); + let mut union_scorer = + Union::::from(term_scorers); for_each_scorer(&mut union_scorer, callback); } SpecializedScorer::Other(mut scorer) => { diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 3cc1ef58b..2e249b8f4 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -9,6 +9,7 @@ pub use self::boolean_query::BooleanQuery; mod tests { use super::*; + use crate::assert_nearly_equals; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::query::score_combiner::SumWithCoordsCombiner; use crate::query::term_query::TermScorer; @@ -20,7 +21,6 @@ mod tests { use crate::query::Scorer; use crate::query::TermQuery; use crate::schema::*; - use crate::tests::assert_nearly_equals; use crate::Index; use crate::{DocAddress, DocId}; @@ -210,14 +210,14 @@ mod tests { .scorer(searcher.segment_reader(0u32), 1.0f32) .unwrap(); assert_eq!(boolean_scorer.doc(), 0u32); - assert_nearly_equals(boolean_scorer.score(), 0.84163445f32); + assert_nearly_equals!(boolean_scorer.score(), 0.84163445f32); } { let mut boolean_scorer = boolean_weight .scorer(searcher.segment_reader(0u32), 2.0f32) .unwrap(); assert_eq!(boolean_scorer.doc(), 0u32); - assert_nearly_equals(boolean_scorer.score(), 1.6832689f32); + assert_nearly_equals!(boolean_scorer.score(), 1.6832689f32); } } diff --git a/src/query/explanation.rs b/src/query/explanation.rs index cb4048611..339f7c47c 100644 --- a/src/query/explanation.rs +++ b/src/query/explanation.rs @@ -1,5 +1,6 @@ use crate::{DocId, TantivyError}; use serde::Serialize; +use std::fmt; pub(crate) fn does_not_match(doc: DocId) -> TantivyError { TantivyError::InvalidArgument(format!("Document #({}) does not match", doc)) @@ -18,6 +19,12 @@ pub struct Explanation { details: Vec, } +impl fmt::Debug for Explanation { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Explanation({})", self.to_pretty_json()) + } +} + impl Explanation { /// Creates a new explanation object. pub fn new(description: T, value: f32) -> Explanation { diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index 35c8d5654..a7429d548 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -163,10 +163,10 @@ impl Query for FuzzyTermQuery { #[cfg(test)] mod test { use super::FuzzyTermQuery; + use crate::assert_nearly_equals; use crate::collector::TopDocs; use crate::schema::Schema; use crate::schema::TEXT; - use crate::tests::assert_nearly_equals; use crate::Index; use crate::Term; @@ -199,7 +199,7 @@ mod test { .unwrap(); assert_eq!(top_docs.len(), 1, "Expected only 1 document"); let (score, _) = top_docs[0]; - assert_nearly_equals(1f32, score); + assert_nearly_equals!(1f32, score); } // fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n') @@ -223,7 +223,7 @@ mod test { .unwrap(); assert_eq!(top_docs.len(), 1, "Expected only 1 document"); let (score, _) = top_docs[0]; - assert_nearly_equals(1f32, score); + assert_nearly_equals!(1f32, score); } } } diff --git a/src/query/mod.rs b/src/query/mod.rs index a2ea322fd..dfeeea290 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -26,6 +26,7 @@ mod weight; mod vec_docset; pub(crate) mod score_combiner; +pub(crate) use self::bm25::BM25Weight; pub use self::intersection::Intersection; pub use self::union::Union; @@ -35,7 +36,6 @@ pub use self::vec_docset::VecDocSet; pub use self::all_query::{AllQuery, AllScorer, AllWeight}; pub use self::automaton_weight::AutomatonWeight; pub use self::bitset::BitSetDocSet; -pub use self::bm25::BM25Params; pub use self::boolean_query::BooleanQuery; pub use self::boost_query::BoostQuery; pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight}; diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 95eeefd70..1560711ef 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -10,10 +10,10 @@ pub use self::phrase_weight::PhraseWeight; pub mod tests { use super::*; + use crate::assert_nearly_equals; use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE}; use crate::core::Index; use crate::schema::{Schema, Term, TEXT}; - use crate::tests::assert_nearly_equals; use crate::DocAddress; use crate::DocId; @@ -157,8 +157,8 @@ pub mod tests { .to_vec() }; let scores = test_query(vec!["a", "b"]); - assert_nearly_equals(scores[0], 0.40618482); - assert_nearly_equals(scores[1], 0.46844664); + assert_nearly_equals!(scores[0], 0.40618482); + assert_nearly_equals!(scores[1], 0.46844664); } #[test] // motivated by #234 diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index 42c521ce3..6672a8c42 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -89,10 +89,10 @@ impl Query for RegexQuery { #[cfg(test)] mod test { use super::RegexQuery; + use crate::assert_nearly_equals; use crate::collector::TopDocs; use crate::schema::TEXT; use crate::schema::{Field, Schema}; - use crate::tests::assert_nearly_equals; use crate::{Index, IndexReader}; use std::sync::Arc; use tantivy_fst::Regex; @@ -129,7 +129,7 @@ mod test { .unwrap(); assert_eq!(scored_docs.len(), 1, "Expected only 1 document"); let (score, _) = scored_docs[0]; - assert_nearly_equals(1f32, score); + assert_nearly_equals!(1f32, score); } let top_docs = searcher .search(&query_matching_zero, &TopDocs::with_limit(2)) diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 0ea904cf1..5e16c4054 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -9,11 +9,11 @@ pub use self::term_weight::TermWeight; #[cfg(test)] mod tests { + use crate::assert_nearly_equals; use crate::collector::TopDocs; use crate::docset::DocSet; use crate::query::{Query, QueryParser, Scorer, TermQuery}; use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT}; - use crate::tests::assert_nearly_equals; use crate::Index; use crate::Term; @@ -69,7 +69,7 @@ mod tests { .unwrap(); assert_eq!(topdocs.len(), 1); let (score, _) = topdocs[0]; - assert_nearly_equals(0.77802235, score); + assert_nearly_equals!(0.77802235, score); } { let term = Term::from_field_text(left_field, "left1"); @@ -79,9 +79,9 @@ mod tests { .unwrap(); assert_eq!(top_docs.len(), 2); let (score1, _) = top_docs[0]; - assert_nearly_equals(0.27101856, score1); + assert_nearly_equals!(0.27101856, score1); let (score2, _) = top_docs[1]; - assert_nearly_equals(0.13736556, score2); + assert_nearly_equals!(0.13736556, score2); } { let query_parser = QueryParser::for_index(&index, vec![]); @@ -89,9 +89,9 @@ mod tests { let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap(); assert_eq!(top_docs.len(), 2); let (score1, _) = top_docs[0]; - assert_nearly_equals(0.9153879, score1); + assert_nearly_equals!(0.9153879, score1); let (score2, _) = top_docs[1]; - assert_nearly_equals(0.27101856, score2); + assert_nearly_equals!(0.27101856, score2); } } diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index b98e82e58..c910eead8 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -4,12 +4,12 @@ use crate::DocId; use crate::Score; use crate::fieldnorm::FieldNormReader; -use crate::postings::Postings; use crate::postings::SegmentPostings; +use crate::postings::{FreqReadingOption, Postings}; use crate::query::bm25::BM25Weight; pub struct TermScorer { - pub(crate) postings: SegmentPostings, + postings: SegmentPostings, fieldnorm_reader: FieldNormReader, similarity_weight: BM25Weight, } @@ -26,9 +26,62 @@ impl TermScorer { similarity_weight, } } -} -impl TermScorer { + pub(crate) fn shallow_seek(&mut self, target_doc: DocId) { + self.postings.block_cursor.shallow_seek(target_doc) + } + + + #[cfg(test)] + pub fn create_for_test( + doc_and_tfs: &[(DocId, u32)], + fieldnorm_vals: &[u32], + similarity_weight: BM25Weight, + ) -> crate::Result { + assert!(!doc_and_tfs.is_empty()); + assert!(doc_and_tfs.len() <= fieldnorm_vals.len()); + + let doc_freq = doc_and_tfs.len(); + let max_doc = doc_and_tfs.last().unwrap().0 + 1; + let mut fieldnorms: Vec = std::iter::repeat(1).take(max_doc as usize).collect(); + + for i in 0..doc_freq { + let doc = doc_and_tfs[i].0; + let fieldnorm = fieldnorm_vals[i]; + fieldnorms[doc as usize] = fieldnorm; + } + let fieldnorm_reader = FieldNormReader::from(&fieldnorms[..]); + + let segment_postings = + SegmentPostings::create_from_docs_and_tfs(doc_and_tfs, Some(fieldnorm_reader.clone()))?; + + Ok(TermScorer::new(segment_postings, fieldnorm_reader, similarity_weight)) + } + + /// See `FreqReadingOption`. + pub(crate) fn freq_reading_option(&self) -> FreqReadingOption { + self.postings.block_cursor.freq_reading_option() + } + + /// Returns the maximum score for the current block. + /// + /// In some rare case, the result may not be exact. In this case a lower value is returned, + /// (and may lead us to return a lesser document). + /// + /// At index time, we store the (fieldnorm_id, term frequency) pair that maximizes the + /// score assuming the average fieldnorm computed on this segment. + /// + /// Though extremely rare, it is theoretically possible that the actual average fieldnorm + /// is different enough from the current segment average fieldnorm that the maximum over a + /// specific is achieved on a different document. + /// + /// (The result is on the other hand guaranteed to be correct if there is only one segment). + pub fn block_max_score(&mut self) -> Score { + self.postings + .block_cursor + .block_max_score(&self.fieldnorm_reader, &self.similarity_weight) + } + pub fn term_freq(&self) -> u32 { self.postings.term_freq() } @@ -48,7 +101,11 @@ impl TermScorer { } pub fn max_score(&self) -> f32 { - f32::MAX + self.similarity_weight.max_score() + } + + pub fn last_doc_in_block(&self) -> DocId { + self.postings.block_cursor.skip_reader.last_doc_in_block() } } @@ -77,3 +134,99 @@ impl Scorer for TermScorer { self.similarity_weight.score(fieldnorm_id, term_freq) } } + +#[cfg(test)] +mod tests { + use crate::assert_nearly_equals; + use crate::postings::compression::COMPRESSION_BLOCK_SIZE; + use crate::query::term_query::TermScorer; + use crate::query::{BM25Weight, Scorer}; + use crate::{DocId, DocSet, TERMINATED}; + use proptest::prelude::*; + + #[test] + fn test_term_scorer_max_score() -> crate::Result<()> { + let bm25_weight = BM25Weight::for_one_term(3, 6, 10f32); + let mut term_scorer = + TermScorer::create_for_test(&[(2, 3), (3, 12), (7, 8)], &[10, 12, 100], bm25_weight)?; + let max_scorer = term_scorer.max_score(); + assert_eq!(max_scorer, 1.3990127f32); + assert_eq!(term_scorer.doc(), 2); + assert_eq!(term_scorer.term_freq(), 3); + assert_nearly_equals!(term_scorer.block_max_score(), 1.3676447f32); + assert_nearly_equals!(term_scorer.score(), 1.0892314f32); + assert_eq!(term_scorer.advance(), 3); + assert_eq!(term_scorer.doc(), 3); + assert_eq!(term_scorer.term_freq(), 12); + assert_nearly_equals!(term_scorer.score(), 1.3676447f32); + assert_eq!(term_scorer.advance(), 7); + assert_eq!(term_scorer.doc(), 7); + assert_eq!(term_scorer.term_freq(), 8); + assert_nearly_equals!(term_scorer.score(), 0.72015285f32); + assert_eq!(term_scorer.advance(), TERMINATED); + Ok(()) + } + + + #[test] + fn test_term_scorer_shallow_advance() -> crate::Result<()> { + let bm25_weight = BM25Weight::for_one_term(300, 1024, 10f32); + let mut doc_and_tfs = vec![]; + for i in 0u32..300u32 { + let doc = i * 10; + doc_and_tfs.push((doc, 1u32 + doc % 3u32)); + } + let fieldnorms: Vec = std::iter::repeat(10u32).take(1024).collect(); + let mut term_scorer = + TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight)?; + assert_eq!(term_scorer.doc(), 0u32); + term_scorer.shallow_seek(1289); + assert_eq!(term_scorer.doc(), 0u32); + term_scorer.seek(1289); + assert_eq!(term_scorer.doc(), 1290); + Ok(()) + } + + proptest! { + #[test] + fn test_term_scorer_block_max_score(term_freqs_fieldnorms in proptest::collection::vec((1u32..10u32, 0u32..100u32), 80..300)) { + let term_doc_freq = term_freqs_fieldnorms.len(); + let doc_tfs: Vec<(u32, u32)> = term_freqs_fieldnorms.iter() + .cloned() + .enumerate() + .map(|(doc, (tf, _))| (doc as u32, tf)) + .collect(); + + let mut fieldnorms: Vec = vec![]; + for i in 0..term_doc_freq { + let (tf, num_extra_terms) = term_freqs_fieldnorms[i]; + fieldnorms.push(tf + num_extra_terms); + } + let average_fieldnorm = fieldnorms + .iter() + .cloned() + .sum::() as f32 / term_doc_freq as f32; + // Average fieldnorm is over the entire index, + // not necessarily the docs that are in the posting list. + // For this reason we multiply by 1.1 to make a realistic value. + let bm25_weight = BM25Weight::for_one_term(term_doc_freq as u64, + term_doc_freq as u64 * 10u64, + average_fieldnorm); + + let mut term_scorer = + TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight).unwrap(); + + let docs: Vec = (0..term_doc_freq).map(|doc| doc as DocId).collect(); + for block in docs.chunks(COMPRESSION_BLOCK_SIZE) { + let block_max_score = term_scorer.block_max_score(); + let mut block_max_score_computed = 0.0f32; + for &doc in block { + assert_eq!(term_scorer.doc(), doc); + block_max_score_computed = block_max_score_computed.max(term_scorer.score()); + term_scorer.advance(); + } + assert_nearly_equals!(block_max_score_computed, block_max_score); + } + } + } +} diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 57ca3f87e..af8ec5b09 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -20,12 +20,12 @@ pub struct TermWeight { impl Weight for TermWeight { fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result> { - let term_scorer = self.scorer_specialized(reader, boost)?; + let term_scorer = self.specialized_scorer(reader, boost)?; Ok(Box::new(term_scorer)) } fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result { - let mut scorer = self.scorer_specialized(reader, 1.0f32)?; + let mut scorer = self.specialized_scorer(reader, 1.0f32)?; if scorer.seek(doc) != doc { return Err(does_not_match(doc)); } @@ -52,7 +52,7 @@ impl Weight for TermWeight { reader: &SegmentReader, callback: &mut dyn FnMut(DocId, Score), ) -> crate::Result<()> { - let mut scorer = self.scorer_specialized(reader, 1.0f32)?; + let mut scorer = self.specialized_scorer(reader, 1.0f32)?; for_each_scorer(&mut scorer, callback); Ok(()) } @@ -92,7 +92,7 @@ impl TermWeight { } } - fn scorer_specialized(&self, reader: &SegmentReader, boost: f32) -> Result { + pub fn specialized_scorer(&self, reader: &SegmentReader, boost: f32) -> Result { let field = self.term.field(); let inverted_index = reader.inverted_index(field); let fieldnorm_reader = reader.get_fieldnorms_reader(field); diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 20be3b764..16ffe3d21 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -4,7 +4,6 @@ use crate::schema::IndexRecordOption; use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::ops::BitOr; -use crate::query::BM25Params; /// Define how a text field should be handled by tantivy. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -46,7 +45,6 @@ impl Default for TextOptions { } } - /// Configuration defining indexing for a text field. /// /// It defines @@ -57,8 +55,6 @@ impl Default for TextOptions { pub struct TextFieldIndexing { record: IndexRecordOption, tokenizer: Cow<'static, str>, - #[serde(skip_serializing_if = "Option::is_none")] - block_wand_bm25: Option, } impl Default for TextFieldIndexing { @@ -66,7 +62,6 @@ impl Default for TextFieldIndexing { TextFieldIndexing { tokenizer: Cow::Borrowed("default"), record: IndexRecordOption::Basic, - block_wand_bm25: None } } } @@ -97,11 +92,6 @@ impl TextFieldIndexing { pub fn index_option(&self) -> IndexRecordOption { self.record } - - pub fn set_blockwand_bm25(mut self, bm25_params: BM25Params) -> TextFieldIndexing { - self.block_wand_bm25 = Some(bm25_params); - self - } } /// The field will be untokenized and indexed @@ -109,7 +99,6 @@ pub const STRING: TextOptions = TextOptions { indexing: Some(TextFieldIndexing { tokenizer: Cow::Borrowed("raw"), record: IndexRecordOption::Basic, - block_wand_bm25: None }), stored: false, }; @@ -119,7 +108,6 @@ pub const TEXT: TextOptions = TextOptions { indexing: Some(TextFieldIndexing { tokenizer: Cow::Borrowed("default"), record: IndexRecordOption::WithFreqsAndPositions, - block_wand_bm25: None, }), stored: false, };