From 7df5a8a530162d4e87393995024e227a619260ff Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 5 Jun 2020 19:37:38 +0900 Subject: [PATCH] ll --- src/core/segment_reader.rs | 16 ++--- src/fieldnorm/mod.rs | 2 +- src/fieldnorm/reader.rs | 28 ++++++++ src/fieldnorm/writer.rs | 3 +- src/indexer/merger.rs | 30 ++++++--- src/indexer/segment_serializer.rs | 24 ++++--- src/indexer/segment_updater.rs | 8 ++- src/indexer/segment_writer.rs | 19 ++++-- src/postings/mod.rs | 2 +- src/postings/postings_writer.rs | 5 +- src/postings/segment_postings.rs | 2 +- src/postings/serializer.rs | 108 +++++++++++++++++++++++++++++- src/postings/skip.rs | 4 ++ src/query/bm25.rs | 2 +- 14 files changed, 208 insertions(+), 45 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index b97d498b8..a3505ff56 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -8,7 +8,7 @@ use crate::directory::ReadOnlySource; use crate::fastfield::DeleteBitSet; use crate::fastfield::FacetReader; use crate::fastfield::FastFieldReaders; -use crate::fieldnorm::FieldNormReader; +use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::schema::Field; use crate::schema::FieldType; use crate::schema::Schema; @@ -48,7 +48,7 @@ pub struct SegmentReader { positions_composite: CompositeFile, positions_idx_composite: CompositeFile, fast_fields_readers: Arc, - fieldnorms_composite: CompositeFile, + fieldnorm_readers: FieldNormReaders, store_source: ReadOnlySource, delete_bitset_opt: Option, @@ -126,8 +126,8 @@ impl SegmentReader { /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader { - if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) { - FieldNormReader::open(fieldnorm_source) + if let Some(fieldnorm_source) = self.fieldnorm_readers.get_field(field) { + fieldnorm_source } else { let field_name = self.schema.get_field_name(field); let err_msg = format!( @@ -178,8 +178,8 @@ impl SegmentReader { let fast_field_readers = Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?); - let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?; - let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?; + let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?; + let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?; let delete_bitset_opt = if segment.meta().has_deletes() { let delete_data = segment.open_read(SegmentComponent::DELETE)?; @@ -195,7 +195,7 @@ impl SegmentReader { termdict_composite, postings_composite, fast_fields_readers: fast_field_readers, - fieldnorms_composite, + fieldnorm_readers, segment_id: segment.id(), store_source, delete_bitset_opt, @@ -308,7 +308,7 @@ impl SegmentReader { self.positions_composite.space_usage(), self.positions_idx_composite.space_usage(), self.fast_fields_readers.space_usage(), - self.fieldnorms_composite.space_usage(), + self.fieldnorm_readers.space_usage(), self.get_store_reader().space_usage(), self.delete_bitset_opt .as_ref() diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index 12370608d..7450376c7 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -21,7 +21,7 @@ mod reader; mod serializer; mod writer; -pub use self::reader::FieldNormReader; +pub use self::reader::{FieldNormReader, FieldNormReaders}; pub use self::serializer::FieldNormsSerializer; pub use self::writer::FieldNormsWriter; diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index 8a57739fa..087384e52 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -1,6 +1,34 @@ use super::{fieldnorm_to_id, id_to_fieldnorm}; use crate::directory::ReadOnlySource; use crate::DocId; +use crate::common::CompositeFile; +use crate::schema::Field; +use std::sync::Arc; +use crate::space_usage::PerFieldSpaceUsage; + +#[derive(Clone)] +pub struct FieldNormReaders { + data: Arc, +} + +impl FieldNormReaders { + pub fn new(source: ReadOnlySource) -> crate::Result { + let data = CompositeFile::open(&source)?; + Ok(FieldNormReaders { + data: Arc::new(data) + }) + } + + pub fn get_field(&self, field: Field) -> Option { + self.data + .open_read(field) + .map(FieldNormReader::open) + } + + pub fn space_usage(&self) -> PerFieldSpaceUsage { + self.data.space_usage() + } +} /// Reads the fieldnorm associated to a document. /// The fieldnorm represents the length associated to diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index ceeac05d2..5c72a1362 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -78,11 +78,12 @@ impl FieldNormsWriter { } /// Serialize the seen fieldnorm values to the serializer for all fields. - pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> { + pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> { for &field in self.fields.iter() { let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..]; fieldnorms_serializer.serialize_field(field, fieldnorm_values)?; } + fieldnorms_serializer.close()?; Ok(()) } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 0bdea7047..c2a1886b3 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -8,7 +8,7 @@ use crate::fastfield::DeleteBitSet; use crate::fastfield::FastFieldReader; use crate::fastfield::FastFieldSerializer; use crate::fastfield::MultiValueIntFastFieldReader; -use crate::fieldnorm::FieldNormReader; +use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::fieldnorm::FieldNormsSerializer; use crate::fieldnorm::FieldNormsWriter; use crate::indexer::SegmentSerializer; @@ -20,7 +20,7 @@ use crate::schema::{Field, Schema}; use crate::store::StoreWriter; use crate::termdict::TermMerger; use crate::termdict::TermOrdinal; -use crate::DocId; +use crate::{DocId, SegmentComponent}; use std::cmp; use std::collections::HashMap; @@ -167,7 +167,7 @@ impl IndexMerger { fn write_fieldnorms( &self, - fieldnorms_serializer: &mut FieldNormsSerializer, + mut fieldnorms_serializer: FieldNormsSerializer, ) -> crate::Result<()> { let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema); let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize); @@ -181,8 +181,9 @@ impl IndexMerger { } } fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?; - } - Ok(()) + } + fieldnorms_serializer.close()?; + Ok(()) } fn write_fast_fields( @@ -492,6 +493,7 @@ impl IndexMerger { indexed_field: Field, field_type: &FieldType, serializer: &mut InvertedIndexSerializer, + fieldnorm_reader: Option ) -> crate::Result> { let mut positions_buffer: Vec = Vec::with_capacity(1_000); let mut delta_computer = DeltaComputer::new(); @@ -550,7 +552,7 @@ impl IndexMerger { // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, // seg0.max_doc + seg1.max_doc + seg2.max_doc] // ... - let mut field_serializer = serializer.new_field(indexed_field, total_num_tokens)?; + let mut field_serializer = serializer.new_field(indexed_field, total_num_tokens, fieldnorm_reader)?; let field_entry = self.schema.get_field_entry(indexed_field); @@ -615,8 +617,8 @@ impl IndexMerger { // there is at least one document. let term_freq = segment_postings.term_freq(); segment_postings.positions(&mut positions_buffer); - - let delta_positions = delta_computer.compute_delta(&positions_buffer); + let delta_positions = + delta_computer.compute_delta(&positions_buffer); field_serializer.write_doc( remapped_doc_id, term_freq, @@ -639,12 +641,14 @@ impl IndexMerger { fn write_postings( &self, serializer: &mut InvertedIndexSerializer, + fieldnorm_readers: FieldNormReaders ) -> crate::Result> { let mut term_ordinal_mappings = HashMap::new(); for (field, field_entry) in self.schema.fields() { + let fieldnorm_reader = fieldnorm_readers.get_field(field); if field_entry.is_indexed() { if let Some(term_ordinal_mapping) = - self.write_postings_for_field(field, field_entry.field_type(), serializer)? + self.write_postings_for_field(field, field_entry.field_type(), serializer, fieldnorm_reader)? { term_ordinal_mappings.insert(field, term_ordinal_mapping); } @@ -671,8 +675,12 @@ impl IndexMerger { impl SerializableSegment for IndexMerger { fn write(&self, mut serializer: SegmentSerializer) -> crate::Result { - let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?; - self.write_fieldnorms(serializer.get_fieldnorms_serializer())?; + if let Some(fieldnorms_serializer) = serializer.get_fieldnorms_serializer() { + self.write_fieldnorms(fieldnorms_serializer)?; + } + let fieldnorm_data = serializer.segment().open_read(SegmentComponent::FIELDNORMS)?; + let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?; + let term_ord_mappings = self.write_postings(serializer.get_postings_serializer(), fieldnorm_readers)?; self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?; self.write_storable_fields(serializer.get_store_writer())?; serializer.close()?; diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index d259b6bb5..03385db3d 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -8,15 +8,16 @@ use crate::store::StoreWriter; /// Segment serializer is in charge of laying out on disk /// the data accumulated and sorted by the `SegmentWriter`. pub struct SegmentSerializer { + segment: Segment, store_writer: StoreWriter, fast_field_serializer: FastFieldSerializer, - fieldnorms_serializer: FieldNormsSerializer, + fieldnorms_serializer: Option, postings_serializer: InvertedIndexSerializer, } impl SegmentSerializer { /// Creates a new `SegmentSerializer`. - pub fn for_segment(segment: &mut Segment) -> crate::Result { + pub fn for_segment(mut segment: Segment) -> crate::Result { let store_write = segment.open_write(SegmentComponent::STORE)?; let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?; @@ -25,15 +26,20 @@ impl SegmentSerializer { let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?; let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; - let postings_serializer = InvertedIndexSerializer::open(segment)?; + let postings_serializer = InvertedIndexSerializer::open(&mut segment)?; Ok(SegmentSerializer { + segment, store_writer: StoreWriter::new(store_write), fast_field_serializer, - fieldnorms_serializer, + fieldnorms_serializer: Some(fieldnorms_serializer), postings_serializer, }) } + pub fn segment(&self) -> &Segment { + &self.segment + } + /// Accessor to the `PostingsSerializer`. pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer { &mut self.postings_serializer @@ -45,8 +51,8 @@ impl SegmentSerializer { } /// Accessor to the field norm serializer. - pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer { - &mut self.fieldnorms_serializer + pub fn get_fieldnorms_serializer(&mut self) -> Option { + self.fieldnorms_serializer.take() } /// Accessor to the `StoreWriter`. @@ -55,11 +61,13 @@ impl SegmentSerializer { } /// Finalize the segment serialization. - pub fn close(self) -> crate::Result<()> { + pub fn close(mut self) -> crate::Result<()> { + if let Some(fieldnorms_serializer) = self.get_fieldnorms_serializer() { + fieldnorms_serializer.close()?; + } self.fast_field_serializer.close()?; self.postings_serializer.close()?; self.store_writer.close()?; - self.fieldnorms_serializer.close()?; Ok(()) } } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 14cdb628a..59b6280a3 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -112,7 +112,7 @@ fn merge( target_opstamp: Opstamp, ) -> crate::Result { // first we need to apply deletes to our segment. - let mut merged_segment = index.new_segment(); + let merged_segment = index.new_segment(); // First we apply all of the delet to the merged segment, up to the target opstamp. for segment_entry in &mut segment_entries { @@ -130,12 +130,14 @@ fn merge( // An IndexMerger is like a "view" of our merged segments. let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?; + let merged_segment_id = merged_segment.id(); + // ... we just serialize this index merger in our new segment to merge the two segments. - let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?; + let segment_serializer = SegmentSerializer::for_segment(merged_segment)?; let num_docs = merger.write(segment_serializer)?; - let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs); + let segment_meta = index.new_segment_meta(merged_segment_id, num_docs); Ok(SegmentEntry::new(segment_meta, delete_cursor, None)) } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 4b2b1f35d..bae33e99d 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -2,7 +2,7 @@ use super::operation::AddOperation; use crate::core::Segment; use crate::core::SerializableSegment; use crate::fastfield::FastFieldsWriter; -use crate::fieldnorm::FieldNormsWriter; +use crate::fieldnorm::{FieldNormsWriter, FieldNormReaders}; use crate::indexer::segment_serializer::SegmentSerializer; use crate::postings::compute_table_size; use crate::postings::MultiFieldPostingsWriter; @@ -14,7 +14,7 @@ use crate::schema::{Field, FieldEntry}; use crate::tokenizer::{BoxTokenStream, PreTokenizedStream}; use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{TokenStreamChain, Tokenizer}; -use crate::DocId; +use crate::{DocId, SegmentComponent}; use crate::Opstamp; use std::io; use std::str; @@ -62,11 +62,12 @@ impl SegmentWriter { /// - schema pub fn for_segment( memory_budget: usize, - mut segment: Segment, + segment: Segment, schema: &Schema, ) -> crate::Result { + let tokenizer_manager = segment.index().tokenizers().clone(); let table_num_bits = initial_table_size(memory_budget)?; - let segment_serializer = SegmentSerializer::for_segment(&mut segment)?; + let segment_serializer = SegmentSerializer::for_segment(segment)?; let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits); let tokenizers = schema .fields() @@ -76,7 +77,7 @@ impl SegmentWriter { .get_indexing_options() .and_then(|text_index_option| { let tokenizer_name = &text_index_option.tokenizer(); - segment.index().tokenizers().get(tokenizer_name) + tokenizer_manager.get(tokenizer_name) }), _ => None, }, @@ -280,9 +281,13 @@ fn write( fieldnorms_writer: &FieldNormsWriter, mut serializer: SegmentSerializer, ) -> crate::Result<()> { - let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?; + if let Some(fieldnorms_serializer) = serializer.get_fieldnorms_serializer() { + fieldnorms_writer.serialize(fieldnorms_serializer)?; + } + let fieldnorm_data = serializer.segment().open_read(SegmentComponent::FIELDNORMS)?; + let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?; + let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer(), fieldnorm_readers)?; fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?; - fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?; serializer.close()?; Ok(()) } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 20ca73d44..463d6ce0d 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -73,7 +73,7 @@ pub mod tests { let mut segment = index.new_segment(); let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap(); { - let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap(); + let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4, None).unwrap(); field_serializer.new_term("abc".as_bytes()).unwrap(); for doc_id in 0u32..120u32 { let delta_positions = vec![1, 2, 3, 2]; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 776c55cc5..10433c0f7 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -16,6 +16,7 @@ use std::collections::HashMap; use std::io; use std::marker::PhantomData; use std::ops::DerefMut; +use crate::fieldnorm::FieldNormReaders; fn posting_from_field_entry(field_entry: &FieldEntry) -> Box { match *field_entry.field_type() { @@ -128,6 +129,7 @@ impl MultiFieldPostingsWriter { pub fn serialize( &self, serializer: &mut InvertedIndexSerializer, + fieldnorm_readers: FieldNormReaders ) -> crate::Result>> { let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index.iter().collect(); @@ -161,8 +163,9 @@ impl MultiFieldPostingsWriter { } let postings_writer = &self.per_field_postings_writers[field.field_id() as usize]; + let fieldnorm_reader = fieldnorm_readers.get_field(field); let mut field_serializer = - serializer.new_field(field, postings_writer.total_num_tokens())?; + serializer.new_field(field, postings_writer.total_num_tokens(), fieldnorm_reader)?; postings_writer.serialize( &term_offsets[start..stop], &mut field_serializer, diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 7c2a266cf..a7268fe44 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -53,7 +53,7 @@ impl SegmentPostings { pub fn create_from_docs(docs: &[u32]) -> SegmentPostings { let mut buffer = Vec::new(); { - let mut postings_serializer = PostingsSerializer::new(&mut buffer, false, false); + let mut postings_serializer = PostingsSerializer::new(&mut buffer, false, false, None); for &doc in docs { postings_serializer.write_doc(doc, 1u32); } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index c1245eb1a..21fdf9038 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -11,6 +11,8 @@ use crate::schema::{Field, FieldEntry, FieldType}; use crate::termdict::{TermDictionaryBuilder, TermOrdinal}; use crate::DocId; use std::io::{self, Write}; +use crate::fieldnorm::FieldNormReader; +use std::cmp::Ordering; /// `InvertedIndexSerializer` is in charge of serializing /// postings on disk, in the @@ -89,6 +91,7 @@ impl InvertedIndexSerializer { &mut self, field: Field, total_num_tokens: u64, + fieldnorm_reader: Option ) -> io::Result> { let field_entry: &FieldEntry = self.schema.get_field_entry(field); let term_dictionary_write = self.terms_write.for_field(field); @@ -103,6 +106,7 @@ impl InvertedIndexSerializer { postings_write, positions_write, positionsidx_write, + fieldnorm_reader ) } @@ -134,6 +138,7 @@ impl<'a> FieldSerializer<'a> { postings_write: &'a mut CountingWriter, positions_write: &'a mut CountingWriter, positionsidx_write: &'a mut CountingWriter, + fieldnorm_reader: Option ) -> io::Result> { let (term_freq_enabled, position_enabled): (bool, bool) = match field_type { FieldType::Str(ref text_options) => { @@ -148,7 +153,7 @@ impl<'a> FieldSerializer<'a> { }; let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?; let postings_serializer = - PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled); + PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled, fieldnorm_reader); let positions_serializer_opt = if position_enabled { Some(PositionSerializer::new(positions_write, positionsidx_write)) } else { @@ -161,7 +166,7 @@ impl<'a> FieldSerializer<'a> { positions_serializer_opt, current_term_info: TermInfo::default(), term_open: false, - num_terms: TermOrdinal::default(), + num_terms: TermOrdinal::default() }) } @@ -306,6 +311,10 @@ pub struct PostingsSerializer { termfreq_enabled: bool, termfreq_sum_enabled: bool, + + fieldnorm_reader: Option, + + tf_fn_output: Vec<(u8, u32)> } impl PostingsSerializer { @@ -313,6 +322,7 @@ impl PostingsSerializer { write: W, termfreq_enabled: bool, termfreq_sum_enabled: bool, + fieldnorm_reader: Option ) -> PostingsSerializer { PostingsSerializer { output_write: CountingWriter::wrap(write), @@ -326,6 +336,10 @@ impl PostingsSerializer { last_doc_id_encoded: 0u32, termfreq_enabled, termfreq_sum_enabled, + + fieldnorm_reader, + + tf_fn_output: Vec::new() } } @@ -352,6 +366,19 @@ impl PostingsSerializer { let sum_freq = self.block.term_freqs().iter().cloned().sum(); self.skip_write.write_total_term_freq(sum_freq); } + if let Some(fieldnorm_reader) = &self.fieldnorm_reader { + let docs = self.block.doc_ids; + let tfs = self.block.term_freqs; + let fn_id_tf_pairs = (0..COMPRESSION_BLOCK_SIZE) + .map(|i| { + let doc = docs[i]; + let tf = tfs[i]; + let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); + (fieldnorm_id, tf) + }); + find_maximal_pairs(fn_id_tf_pairs, &mut self.tf_fn_output); + self.skip_write.write_blockwand_info(&self.tf_fn_output[..]); + } } self.block.clear(); } @@ -412,3 +439,80 @@ impl PostingsSerializer { self.last_doc_id_encoded = 0; } } + + +fn cmp(left: (u8, u32), right: (u8, u32)) -> Option { + let fieldnorm_cmp = left.0.cmp(&right.0).reverse(); + let term_freq_cmp= left.1.cmp(&right.1); + match (fieldnorm_cmp, term_freq_cmp) { + (Ordering::Equal, Ordering::Equal) => Some(Ordering::Equal), + (Ordering::Less, Ordering::Greater) | (Ordering::Greater, Ordering::Less) => None, + (Ordering::Less, _) | (_, Ordering::Less) => Some(Ordering::Less), + (Ordering::Greater, _) | (_, Ordering::Greater) => Some(Ordering::Greater), + } +} + +fn remove_lower(output: &mut Vec<(u8, u32)>, new_el: (u8, u32)) { + let mut i = 0; + while i < output.len() { + match cmp(output[i], new_el) { + Some(Ordering::Equal) | Some(Ordering::Greater) => { + return; + } + Some(Ordering::Less) => { + output.swap_remove(i); + } + None => { + i += 1; + } + } + } + output.push(new_el); +} + +fn find_maximal_pairs>(mut fn_tf_it: Iter, output: &mut Vec<(u8, u32)>) { + output.clear(); + if let Some((u32, u8)) = fn_tf_it.next() { + output.push((u32, u8)); + } else { + return; + } + for (fieldnorm_id, term_freq) in fn_tf_it { + remove_lower(output, (fieldnorm_id, term_freq)); + } +} + +#[cfg(test)] +mod tests { + use super::find_maximal_pairs; + + #[test] + fn test_tf_fn_id_empty() { + let mut output: Vec<(u8, u32)> = Vec::new(); + find_maximal_pairs(vec![].into_iter(), &mut output); + assert_eq!(&output[..], &[]); + } + + #[test] + fn test_tf_fn_id_output_should_be_cleared() { + let mut output: Vec<(u8, u32)> = vec![(1u8, 1u32)]; + find_maximal_pairs(vec![].into_iter(), &mut output); + assert_eq!(&output[..], &[]); + } + + #[test] + fn test_tf_fn_id_no_reduction() { + let mut output: Vec<(u8, u32)> = Vec::new(); + find_maximal_pairs(vec![(1u8, 3u32), (2u8, 4u32)].into_iter(), &mut output); + assert_eq!(&output[..], &[(1u8, 3u32), (2u8, 4u32)]); + } + + #[test] + fn test_tf_fn_id_reduction() { + let mut output: Vec<(u8, u32)> = Vec::new(); + find_maximal_pairs(vec![(1u8, 3u32), (2u8, 2u32)].into_iter(), &mut output); + assert_eq!(&output[..], &[(1u8, 3u32)]); + find_maximal_pairs(vec![(2u8, 2u32), (1u8, 3u32)].into_iter(), &mut output); + assert_eq!(&output[..], &[(1u8, 3u32)]); + } +} \ No newline at end of file diff --git a/src/postings/skip.rs b/src/postings/skip.rs index 46097777c..69e6239c9 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -40,6 +40,10 @@ impl SkipSerializer { .expect("Should never fail"); } + pub fn write_blockwand_info(&mut self, fn_tf_pairs: &[(u8, u32)]) { + + } + pub fn data(&self) -> &[u8] { &self.buffer[..] } diff --git a/src/query/bm25.rs b/src/query/bm25.rs index 48f84fece..05f4f46a5 100644 --- a/src/query/bm25.rs +++ b/src/query/bm25.rs @@ -143,6 +143,6 @@ mod tests { #[test] fn test_idf() { - assert_nearly_equals(idf(1, 2), 0.6931472); + assert_nearly_equals(idf(1, 2), std::f32::consts::LN_2); } }