mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-28 14:10:42 +00:00
ll
This commit is contained in:
@@ -8,7 +8,7 @@ use crate::directory::ReadOnlySource;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::fastfield::FastFieldReaders;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||
use crate::schema::Field;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
@@ -48,7 +48,7 @@ pub struct SegmentReader {
|
||||
positions_composite: CompositeFile,
|
||||
positions_idx_composite: CompositeFile,
|
||||
fast_fields_readers: Arc<FastFieldReaders>,
|
||||
fieldnorms_composite: CompositeFile,
|
||||
fieldnorm_readers: FieldNormReaders,
|
||||
|
||||
store_source: ReadOnlySource,
|
||||
delete_bitset_opt: Option<DeleteBitSet>,
|
||||
@@ -126,8 +126,8 @@ impl SegmentReader {
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
|
||||
if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
|
||||
FieldNormReader::open(fieldnorm_source)
|
||||
if let Some(fieldnorm_source) = self.fieldnorm_readers.get_field(field) {
|
||||
fieldnorm_source
|
||||
} else {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg = format!(
|
||||
@@ -178,8 +178,8 @@ impl SegmentReader {
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
|
||||
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?;
|
||||
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
@@ -195,7 +195,7 @@ impl SegmentReader {
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorms_composite,
|
||||
fieldnorm_readers,
|
||||
segment_id: segment.id(),
|
||||
store_source,
|
||||
delete_bitset_opt,
|
||||
@@ -308,7 +308,7 @@ impl SegmentReader {
|
||||
self.positions_composite.space_usage(),
|
||||
self.positions_idx_composite.space_usage(),
|
||||
self.fast_fields_readers.space_usage(),
|
||||
self.fieldnorms_composite.space_usage(),
|
||||
self.fieldnorm_readers.space_usage(),
|
||||
self.get_store_reader().space_usage(),
|
||||
self.delete_bitset_opt
|
||||
.as_ref()
|
||||
|
||||
@@ -21,7 +21,7 @@ mod reader;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
pub use self::reader::FieldNormReader;
|
||||
pub use self::reader::{FieldNormReader, FieldNormReaders};
|
||||
pub use self::serializer::FieldNormsSerializer;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
|
||||
|
||||
@@ -1,6 +1,34 @@
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::DocId;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::schema::Field;
|
||||
use std::sync::Arc;
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FieldNormReaders {
|
||||
data: Arc<CompositeFile>,
|
||||
}
|
||||
|
||||
impl FieldNormReaders {
|
||||
pub fn new(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
|
||||
let data = CompositeFile::open(&source)?;
|
||||
Ok(FieldNormReaders {
|
||||
data: Arc::new(data)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
|
||||
self.data
|
||||
.open_read(field)
|
||||
.map(FieldNormReader::open)
|
||||
}
|
||||
|
||||
pub fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||
self.data.space_usage()
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads the fieldnorm associated to a document.
|
||||
/// The fieldnorm represents the length associated to
|
||||
|
||||
@@ -78,11 +78,12 @@ impl FieldNormsWriter {
|
||||
}
|
||||
|
||||
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
||||
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
|
||||
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
|
||||
for &field in self.fields.iter() {
|
||||
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
|
||||
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
|
||||
}
|
||||
fieldnorms_serializer.close()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::FastFieldSerializer;
|
||||
use crate::fastfield::MultiValueIntFastFieldReader;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
use crate::indexer::SegmentSerializer;
|
||||
@@ -20,7 +20,7 @@ use crate::schema::{Field, Schema};
|
||||
use crate::store::StoreWriter;
|
||||
use crate::termdict::TermMerger;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use crate::{DocId, SegmentComponent};
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -167,7 +167,7 @@ impl IndexMerger {
|
||||
|
||||
fn write_fieldnorms(
|
||||
&self,
|
||||
fieldnorms_serializer: &mut FieldNormsSerializer,
|
||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||
) -> crate::Result<()> {
|
||||
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
|
||||
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
|
||||
@@ -181,8 +181,9 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fieldnorms_serializer.close()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_fast_fields(
|
||||
@@ -492,6 +493,7 @@ impl IndexMerger {
|
||||
indexed_field: Field,
|
||||
field_type: &FieldType,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
fieldnorm_reader: Option<FieldNormReader>
|
||||
) -> crate::Result<Option<TermOrdinalMapping>> {
|
||||
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
|
||||
let mut delta_computer = DeltaComputer::new();
|
||||
@@ -550,7 +552,7 @@ impl IndexMerger {
|
||||
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
|
||||
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
|
||||
// ...
|
||||
let mut field_serializer = serializer.new_field(indexed_field, total_num_tokens)?;
|
||||
let mut field_serializer = serializer.new_field(indexed_field, total_num_tokens, fieldnorm_reader)?;
|
||||
|
||||
let field_entry = self.schema.get_field_entry(indexed_field);
|
||||
|
||||
@@ -615,8 +617,8 @@ impl IndexMerger {
|
||||
// there is at least one document.
|
||||
let term_freq = segment_postings.term_freq();
|
||||
segment_postings.positions(&mut positions_buffer);
|
||||
|
||||
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
||||
let delta_positions =
|
||||
delta_computer.compute_delta(&positions_buffer);
|
||||
field_serializer.write_doc(
|
||||
remapped_doc_id,
|
||||
term_freq,
|
||||
@@ -639,12 +641,14 @@ impl IndexMerger {
|
||||
fn write_postings(
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
fieldnorm_readers: FieldNormReaders
|
||||
) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
|
||||
let mut term_ordinal_mappings = HashMap::new();
|
||||
for (field, field_entry) in self.schema.fields() {
|
||||
let fieldnorm_reader = fieldnorm_readers.get_field(field);
|
||||
if field_entry.is_indexed() {
|
||||
if let Some(term_ordinal_mapping) =
|
||||
self.write_postings_for_field(field, field_entry.field_type(), serializer)?
|
||||
self.write_postings_for_field(field, field_entry.field_type(), serializer, fieldnorm_reader)?
|
||||
{
|
||||
term_ordinal_mappings.insert(field, term_ordinal_mapping);
|
||||
}
|
||||
@@ -671,8 +675,12 @@ impl IndexMerger {
|
||||
|
||||
impl SerializableSegment for IndexMerger {
|
||||
fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
|
||||
let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
|
||||
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
|
||||
if let Some(fieldnorms_serializer) = serializer.get_fieldnorms_serializer() {
|
||||
self.write_fieldnorms(fieldnorms_serializer)?;
|
||||
}
|
||||
let fieldnorm_data = serializer.segment().open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?;
|
||||
let term_ord_mappings = self.write_postings(serializer.get_postings_serializer(), fieldnorm_readers)?;
|
||||
self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
|
||||
self.write_storable_fields(serializer.get_store_writer())?;
|
||||
serializer.close()?;
|
||||
|
||||
@@ -8,15 +8,16 @@ use crate::store::StoreWriter;
|
||||
/// Segment serializer is in charge of laying out on disk
|
||||
/// the data accumulated and sorted by the `SegmentWriter`.
|
||||
pub struct SegmentSerializer {
|
||||
segment: Segment,
|
||||
store_writer: StoreWriter,
|
||||
fast_field_serializer: FastFieldSerializer,
|
||||
fieldnorms_serializer: FieldNormsSerializer,
|
||||
fieldnorms_serializer: Option<FieldNormsSerializer>,
|
||||
postings_serializer: InvertedIndexSerializer,
|
||||
}
|
||||
|
||||
impl SegmentSerializer {
|
||||
/// Creates a new `SegmentSerializer`.
|
||||
pub fn for_segment(segment: &mut Segment) -> crate::Result<SegmentSerializer> {
|
||||
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
|
||||
let store_write = segment.open_write(SegmentComponent::STORE)?;
|
||||
|
||||
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
|
||||
@@ -25,15 +26,20 @@ impl SegmentSerializer {
|
||||
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
|
||||
|
||||
let postings_serializer = InvertedIndexSerializer::open(segment)?;
|
||||
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
|
||||
Ok(SegmentSerializer {
|
||||
segment,
|
||||
store_writer: StoreWriter::new(store_write),
|
||||
fast_field_serializer,
|
||||
fieldnorms_serializer,
|
||||
fieldnorms_serializer: Some(fieldnorms_serializer),
|
||||
postings_serializer,
|
||||
})
|
||||
}
|
||||
|
||||
/// Accessor to the `Segment` held by this serializer.
pub fn segment(&self) -> &Segment {
|
||||
&self.segment
|
||||
}
|
||||
|
||||
/// Accessor to the `PostingsSerializer`.
|
||||
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
|
||||
&mut self.postings_serializer
|
||||
@@ -45,8 +51,8 @@ impl SegmentSerializer {
|
||||
}
|
||||
|
||||
/// Accessor to the field norm serializer.
|
||||
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
|
||||
&mut self.fieldnorms_serializer
|
||||
pub fn get_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
|
||||
self.fieldnorms_serializer.take()
|
||||
}
|
||||
|
||||
/// Accessor to the `StoreWriter`.
|
||||
@@ -55,11 +61,13 @@ impl SegmentSerializer {
|
||||
}
|
||||
|
||||
/// Finalize the segment serialization.
|
||||
pub fn close(self) -> crate::Result<()> {
|
||||
pub fn close(mut self) -> crate::Result<()> {
|
||||
if let Some(fieldnorms_serializer) = self.get_fieldnorms_serializer() {
|
||||
fieldnorms_serializer.close()?;
|
||||
}
|
||||
self.fast_field_serializer.close()?;
|
||||
self.postings_serializer.close()?;
|
||||
self.store_writer.close()?;
|
||||
self.fieldnorms_serializer.close()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,7 +112,7 @@ fn merge(
|
||||
target_opstamp: Opstamp,
|
||||
) -> crate::Result<SegmentEntry> {
|
||||
// first we need to apply deletes to our segment.
|
||||
let mut merged_segment = index.new_segment();
|
||||
let merged_segment = index.new_segment();
|
||||
|
||||
// First we apply all of the deletes to the merged segment, up to the target opstamp.
|
||||
for segment_entry in &mut segment_entries {
|
||||
@@ -130,12 +130,14 @@ fn merge(
|
||||
// An IndexMerger is like a "view" of our merged segments.
|
||||
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
|
||||
|
||||
let merged_segment_id = merged_segment.id();
|
||||
|
||||
// ... we just serialize this index merger in our new segment to merge the two segments.
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(merged_segment)?;
|
||||
|
||||
let num_docs = merger.write(segment_serializer)?;
|
||||
|
||||
let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
|
||||
let segment_meta = index.new_segment_meta(merged_segment_id, num_docs);
|
||||
|
||||
Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use super::operation::AddOperation;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SerializableSegment;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
use crate::fieldnorm::{FieldNormsWriter, FieldNormReaders};
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::postings::compute_table_size;
|
||||
use crate::postings::MultiFieldPostingsWriter;
|
||||
@@ -14,7 +14,7 @@ use crate::schema::{Field, FieldEntry};
|
||||
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
|
||||
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
|
||||
use crate::tokenizer::{TokenStreamChain, Tokenizer};
|
||||
use crate::DocId;
|
||||
use crate::{DocId, SegmentComponent};
|
||||
use crate::Opstamp;
|
||||
use std::io;
|
||||
use std::str;
|
||||
@@ -62,11 +62,12 @@ impl SegmentWriter {
|
||||
/// - schema
|
||||
pub fn for_segment(
|
||||
memory_budget: usize,
|
||||
mut segment: Segment,
|
||||
segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> crate::Result<SegmentWriter> {
|
||||
let tokenizer_manager = segment.index().tokenizers().clone();
|
||||
let table_num_bits = initial_table_size(memory_budget)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
@@ -76,7 +77,7 @@ impl SegmentWriter {
|
||||
.get_indexing_options()
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
segment.index().tokenizers().get(tokenizer_name)
|
||||
tokenizer_manager.get(tokenizer_name)
|
||||
}),
|
||||
_ => None,
|
||||
},
|
||||
@@ -280,9 +281,13 @@ fn write(
|
||||
fieldnorms_writer: &FieldNormsWriter,
|
||||
mut serializer: SegmentSerializer,
|
||||
) -> crate::Result<()> {
|
||||
let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
|
||||
if let Some(fieldnorms_serializer) = serializer.get_fieldnorms_serializer() {
|
||||
fieldnorms_writer.serialize(fieldnorms_serializer)?;
|
||||
}
|
||||
let fieldnorm_data = serializer.segment().open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?;
|
||||
let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer(), fieldnorm_readers)?;
|
||||
fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
|
||||
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
|
||||
serializer.close()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -73,7 +73,7 @@ pub mod tests {
|
||||
let mut segment = index.new_segment();
|
||||
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
|
||||
{
|
||||
let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap();
|
||||
let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4, None).unwrap();
|
||||
field_serializer.new_term("abc".as_bytes()).unwrap();
|
||||
for doc_id in 0u32..120u32 {
|
||||
let delta_positions = vec![1, 2, 3, 2];
|
||||
|
||||
@@ -16,6 +16,7 @@ use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
|
||||
fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
|
||||
match *field_entry.field_type() {
|
||||
@@ -128,6 +129,7 @@ impl MultiFieldPostingsWriter {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
fieldnorm_readers: FieldNormReaders
|
||||
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
|
||||
self.term_index.iter().collect();
|
||||
@@ -161,8 +163,9 @@ impl MultiFieldPostingsWriter {
|
||||
}
|
||||
|
||||
let postings_writer = &self.per_field_postings_writers[field.field_id() as usize];
|
||||
let fieldnorm_reader = fieldnorm_readers.get_field(field);
|
||||
let mut field_serializer =
|
||||
serializer.new_field(field, postings_writer.total_num_tokens())?;
|
||||
serializer.new_field(field, postings_writer.total_num_tokens(), fieldnorm_reader)?;
|
||||
postings_writer.serialize(
|
||||
&term_offsets[start..stop],
|
||||
&mut field_serializer,
|
||||
|
||||
@@ -53,7 +53,7 @@ impl SegmentPostings {
|
||||
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut postings_serializer = PostingsSerializer::new(&mut buffer, false, false);
|
||||
let mut postings_serializer = PostingsSerializer::new(&mut buffer, false, false, None);
|
||||
for &doc in docs {
|
||||
postings_serializer.write_doc(doc, 1u32);
|
||||
}
|
||||
|
||||
@@ -11,6 +11,8 @@ use crate::schema::{Field, FieldEntry, FieldType};
|
||||
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
use crate::DocId;
|
||||
use std::io::{self, Write};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
/// `InvertedIndexSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
@@ -89,6 +91,7 @@ impl InvertedIndexSerializer {
|
||||
&mut self,
|
||||
field: Field,
|
||||
total_num_tokens: u64,
|
||||
fieldnorm_reader: Option<FieldNormReader>
|
||||
) -> io::Result<FieldSerializer<'_>> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
@@ -103,6 +106,7 @@ impl InvertedIndexSerializer {
|
||||
postings_write,
|
||||
positions_write,
|
||||
positionsidx_write,
|
||||
fieldnorm_reader
|
||||
)
|
||||
}
|
||||
|
||||
@@ -134,6 +138,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
positionsidx_write: &'a mut CountingWriter<WritePtr>,
|
||||
fieldnorm_reader: Option<FieldNormReader>
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
@@ -148,7 +153,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
};
|
||||
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
|
||||
let postings_serializer =
|
||||
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
|
||||
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled, fieldnorm_reader);
|
||||
let positions_serializer_opt = if position_enabled {
|
||||
Some(PositionSerializer::new(positions_write, positionsidx_write))
|
||||
} else {
|
||||
@@ -161,7 +166,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
positions_serializer_opt,
|
||||
current_term_info: TermInfo::default(),
|
||||
term_open: false,
|
||||
num_terms: TermOrdinal::default(),
|
||||
num_terms: TermOrdinal::default()
|
||||
})
|
||||
}
|
||||
|
||||
@@ -306,6 +311,10 @@ pub struct PostingsSerializer<W: Write> {
|
||||
|
||||
termfreq_enabled: bool,
|
||||
termfreq_sum_enabled: bool,
|
||||
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
|
||||
tf_fn_output: Vec<(u8, u32)>
|
||||
}
|
||||
|
||||
impl<W: Write> PostingsSerializer<W> {
|
||||
@@ -313,6 +322,7 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
write: W,
|
||||
termfreq_enabled: bool,
|
||||
termfreq_sum_enabled: bool,
|
||||
fieldnorm_reader: Option<FieldNormReader>
|
||||
) -> PostingsSerializer<W> {
|
||||
PostingsSerializer {
|
||||
output_write: CountingWriter::wrap(write),
|
||||
@@ -326,6 +336,10 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
last_doc_id_encoded: 0u32,
|
||||
termfreq_enabled,
|
||||
termfreq_sum_enabled,
|
||||
|
||||
fieldnorm_reader,
|
||||
|
||||
tf_fn_output: Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -352,6 +366,19 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
let sum_freq = self.block.term_freqs().iter().cloned().sum();
|
||||
self.skip_write.write_total_term_freq(sum_freq);
|
||||
}
|
||||
if let Some(fieldnorm_reader) = &self.fieldnorm_reader {
|
||||
let docs = self.block.doc_ids;
|
||||
let tfs = self.block.term_freqs;
|
||||
let fn_id_tf_pairs = (0..COMPRESSION_BLOCK_SIZE)
|
||||
.map(|i| {
|
||||
let doc = docs[i];
|
||||
let tf = tfs[i];
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
|
||||
(fieldnorm_id, tf)
|
||||
});
|
||||
find_maximal_pairs(fn_id_tf_pairs, &mut self.tf_fn_output);
|
||||
self.skip_write.write_blockwand_info(&self.tf_fn_output[..]);
|
||||
}
|
||||
}
|
||||
self.block.clear();
|
||||
}
|
||||
@@ -412,3 +439,80 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
self.last_doc_id_encoded = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Partial order over `(fieldnorm_id, term_freq)` pairs.
///
/// A pair ranks higher when its fieldnorm id is *lower* (the fieldnorm
/// comparison is reversed) and/or its term frequency is *higher*.
/// NOTE(review): presumably this mirrors how both quantities influence the
/// score upper bound of a block — confirm against the scorer.
///
/// Returns:
/// - `Some(Ordering::Equal)` when both components are equal,
/// - `Some(Ordering::Less)` / `Some(Ordering::Greater)` when one pair is at
///   least as good as the other on both components,
/// - `None` when the two components disagree, i.e. the pairs are incomparable.
fn cmp(left: (u8, u32), right: (u8, u32)) -> Option<Ordering> {
    let fieldnorm_cmp = left.0.cmp(&right.0).reverse();
    let term_freq_cmp = left.1.cmp(&right.1);
    match (fieldnorm_cmp, term_freq_cmp) {
        (Ordering::Equal, Ordering::Equal) => Some(Ordering::Equal),
        // The two components point in opposite directions: incomparable.
        (Ordering::Less, Ordering::Greater) | (Ordering::Greater, Ordering::Less) => None,
        (Ordering::Less, _) | (_, Ordering::Less) => Some(Ordering::Less),
        (Ordering::Greater, _) | (_, Ordering::Greater) => Some(Ordering::Greater),
    }
}

/// Inserts `new_el` into `output` unless an element already present is
/// greater than or equal to it (according to [`cmp`]); every element of
/// `output` that is strictly lower than `new_el` is removed.
///
/// Removal uses `swap_remove`, so the relative order of the remaining
/// elements is not preserved.
fn remove_lower(output: &mut Vec<(u8, u32)>, new_el: (u8, u32)) {
    let mut i = 0;
    while i < output.len() {
        match cmp(output[i], new_el) {
            Some(Ordering::Equal) | Some(Ordering::Greater) => {
                // `new_el` brings nothing new: leave `output` as it is.
                return;
            }
            Some(Ordering::Less) => {
                // `swap_remove` moves the last element into slot `i`,
                // so `i` must not advance here.
                output.swap_remove(i);
            }
            None => {
                i += 1;
            }
        }
    }
    output.push(new_el);
}

/// Fills `output` with the maximal `(fieldnorm_id, term_freq)` pairs of
/// `fn_tf_it`, i.e. the pairs that no other pair of the input is greater
/// than according to [`cmp`].
///
/// `output` is cleared first, which lets callers reuse the same buffer
/// across calls. An empty iterator leaves `output` empty.
fn find_maximal_pairs<Iter: Iterator<Item = (u8, u32)>>(
    fn_tf_it: Iter,
    output: &mut Vec<(u8, u32)>,
) {
    output.clear();
    // `remove_lower` simply pushes into an empty vector, so the first
    // element needs no special treatment.
    for fn_tf_pair in fn_tf_it {
        remove_lower(output, fn_tf_pair);
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::find_maximal_pairs;

    /// An empty input produces an empty output.
    #[test]
    fn test_tf_fn_id_empty() {
        let mut maximal_pairs: Vec<(u8, u32)> = vec![];
        find_maximal_pairs(std::iter::empty(), &mut maximal_pairs);
        assert!(maximal_pairs.is_empty());
    }

    /// Whatever the output buffer held before the call is discarded.
    #[test]
    fn test_tf_fn_id_output_should_be_cleared() {
        let mut maximal_pairs: Vec<(u8, u32)> = Vec::new();
        maximal_pairs.push((1u8, 1u32));
        find_maximal_pairs(std::iter::empty(), &mut maximal_pairs);
        assert!(maximal_pairs.is_empty());
    }

    /// Two incomparable pairs are both kept, in input order.
    #[test]
    fn test_tf_fn_id_no_reduction() {
        let input = [(1u8, 3u32), (2u8, 4u32)];
        let mut maximal_pairs: Vec<(u8, u32)> = vec![];
        find_maximal_pairs(input.iter().cloned(), &mut maximal_pairs);
        assert_eq!(&maximal_pairs[..], &input[..]);
    }

    /// A dominated pair is dropped regardless of where it sits in the input.
    #[test]
    fn test_tf_fn_id_reduction() {
        let expected = [(1u8, 3u32)];
        let mut maximal_pairs: Vec<(u8, u32)> = vec![];

        let dominated_last = [(1u8, 3u32), (2u8, 2u32)];
        find_maximal_pairs(dominated_last.iter().cloned(), &mut maximal_pairs);
        assert_eq!(&maximal_pairs[..], &expected[..]);

        let dominated_first = [(2u8, 2u32), (1u8, 3u32)];
        find_maximal_pairs(dominated_first.iter().cloned(), &mut maximal_pairs);
        assert_eq!(&maximal_pairs[..], &expected[..]);
    }
}
|
||||
@@ -40,6 +40,10 @@ impl SkipSerializer {
|
||||
.expect("Should never fail");
|
||||
}
|
||||
|
||||
/// Receives the `(u8, u32)` pairs computed for a block.
///
/// This is currently a stub: the body is empty, so `fn_tf_pairs` is ignored
/// and nothing is written.
/// NOTE(review): presumably meant to serialize block-level metadata next to
/// the skip data — TODO confirm and implement.
pub fn write_blockwand_info(&mut self, fn_tf_pairs: &[(u8, u32)]) {
|
||||
|
||||
}
|
||||
|
||||
pub fn data(&self) -> &[u8] {
|
||||
&self.buffer[..]
|
||||
}
|
||||
|
||||
@@ -143,6 +143,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_idf() {
|
||||
assert_nearly_equals(idf(1, 2), 0.6931472);
|
||||
assert_nearly_equals(idf(1, 2), std::f32::consts::LN_2);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user