Compare commits

...

1 Commit

Author:  Paul Masurel
SHA1:    063ed30f66
Message: Added field norm readers
Date:    2020-07-20 11:59:43 +09:00
11 changed files with 96 additions and 39 deletions
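
Summary: `SegmentReader` swaps its raw fieldnorm `CompositeFile` for a new `FieldNormReaders` wrapper; the `FieldNormsSerializer` becomes a take-once `Option` that is consumed and closed explicitly by whoever writes the fieldnorms; `SegmentSerializer::for_segment` now takes its `Segment` by value; and the `Eq` derive is dropped from `FieldEntry`, `FieldType`, and the text option types.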

View File

@@ -8,7 +8,7 @@ use crate::directory::ReadOnlySource;
 use crate::fastfield::DeleteBitSet;
 use crate::fastfield::FacetReader;
 use crate::fastfield::FastFieldReaders;
-use crate::fieldnorm::FieldNormReader;
+use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
 use crate::schema::Field;
 use crate::schema::FieldType;
 use crate::schema::Schema;
@@ -48,7 +48,7 @@ pub struct SegmentReader {
     positions_composite: CompositeFile,
     positions_idx_composite: CompositeFile,
     fast_fields_readers: Arc<FastFieldReaders>,
-    fieldnorms_composite: CompositeFile,
+    fieldnorm_readers: FieldNormReaders,
     store_source: ReadOnlySource,
     delete_bitset_opt: Option<DeleteBitSet>,
@@ -126,8 +126,8 @@ impl SegmentReader {
     /// They are simply stored as a fast field, serialized in
     /// the `.fieldnorm` file of the segment.
     pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
-        if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
-            FieldNormReader::open(fieldnorm_source)
+        if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
+            fieldnorm_reader
         } else {
             let field_name = self.schema.get_field_name(field);
             let err_msg = format!(
@@ -178,8 +178,8 @@ impl SegmentReader {
         let fast_field_readers =
             Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
-        let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
-        let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
+        let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
+        let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?;
         let delete_bitset_opt = if segment.meta().has_deletes() {
             let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -195,7 +195,7 @@ impl SegmentReader {
             termdict_composite,
             postings_composite,
             fast_fields_readers: fast_field_readers,
-            fieldnorms_composite,
+            fieldnorm_readers,
             segment_id: segment.id(),
             store_source,
             delete_bitset_opt,
@@ -308,7 +308,7 @@ impl SegmentReader {
             self.positions_composite.space_usage(),
             self.positions_idx_composite.space_usage(),
             self.fast_fields_readers.space_usage(),
-            self.fieldnorms_composite.space_usage(),
+            self.fieldnorm_readers.space_usage(),
             self.get_store_reader().space_usage(),
             self.delete_bitset_opt
                 .as_ref()
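
The net effect on the read path: `SegmentReader` no longer opens a raw `ReadOnlySource` per call; it asks a `FieldNormReaders` for a ready-made, cloneable per-field reader. A minimal standalone sketch of that delegation, using simplified stand-in types rather than tantivy's real definitions:

    use std::collections::HashMap;
    use std::sync::Arc;

    // Stand-in for the per-field reader: one fieldnorm_id byte per document.
    #[derive(Clone)]
    struct FieldNormReader {
        data: Arc<Vec<u8>>,
    }

    impl FieldNormReader {
        fn num_docs(&self) -> u32 {
            self.data.len() as u32
        }
    }

    // Stand-in for FieldNormReaders: the composite data sits behind an Arc,
    // so the whole struct stays cheap to clone and share across readers.
    #[derive(Clone)]
    struct FieldNormReaders {
        per_field: Arc<HashMap<u32, Arc<Vec<u8>>>>,
    }

    impl FieldNormReaders {
        // Mirrors get_field: a missing field yields None instead of panicking.
        fn get_field(&self, field: u32) -> Option<FieldNormReader> {
            self.per_field
                .get(&field)
                .map(|data| FieldNormReader { data: Arc::clone(data) })
        }
    }

    fn main() {
        let mut per_field = HashMap::new();
        per_field.insert(0u32, Arc::new(vec![3u8, 7, 1]));
        let readers = FieldNormReaders { per_field: Arc::new(per_field) };
        assert_eq!(readers.get_field(0).unwrap().num_docs(), 3);
        assert!(readers.get_field(1).is_none());
    }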

View File

@@ -21,7 +21,7 @@ mod reader;
 mod serializer;
 mod writer;
 
-pub use self::reader::FieldNormReader;
+pub use self::reader::{FieldNormReader, FieldNormReaders};
 pub use self::serializer::FieldNormsSerializer;
 pub use self::writer::FieldNormsWriter;

View File

@@ -1,6 +1,41 @@
 use super::{fieldnorm_to_id, id_to_fieldnorm};
+use crate::common::CompositeFile;
 use crate::directory::ReadOnlySource;
+use crate::schema::Field;
+use crate::space_usage::PerFieldSpaceUsage;
 use crate::DocId;
+use std::sync::Arc;
+
+/// Reader for the fieldnorm (for each document, the number of tokens indexed in the
+/// field) of all indexed fields in the index.
+///
+/// Each fieldnorm is compressed, with some loss of precision, into a single byte.
+/// We refer to this byte as the `fieldnorm_id`.
+/// The mapping from `fieldnorm` to `fieldnorm_id` is monotonic.
+#[derive(Clone)]
+pub struct FieldNormReaders {
+    data: Arc<CompositeFile>,
+}
+
+impl FieldNormReaders {
+    /// Creates a field norm reader.
+    pub fn new(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
+        let data = CompositeFile::open(&source)?;
+        Ok(FieldNormReaders {
+            data: Arc::new(data),
+        })
+    }
+
+    /// Returns the `FieldNormReader` for a specific field.
+    pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
+        self.data.open_read(field).map(FieldNormReader::open)
+    }
+
+    /// Returns a breakdown of the space usage per field.
+    pub fn space_usage(&self) -> PerFieldSpaceUsage {
+        self.data.space_usage()
+    }
+}
+
 /// Reads the fieldnorm associated to a document.
 /// The fieldnorm represents the length associated to
@@ -19,6 +54,7 @@ use crate::DocId;
 /// Apart from compression, this scale also makes it possible to
 /// precompute computationally expensive functions of the fieldnorm
 /// in a very short array.
+#[derive(Clone)]
 pub struct FieldNormReader {
     data: ReadOnlySource,
 }
@@ -29,6 +65,11 @@ impl FieldNormReader {
         FieldNormReader { data }
     }
 
+    /// Returns the number of documents in this segment.
+    pub fn num_docs(&self) -> u32 {
+        self.data.len() as u32
+    }
+
     /// Returns the `fieldnorm` associated to a doc id.
     /// The fieldnorm is a value approximating the number
     /// of tokens in a given field of the `doc_id`.
@@ -65,10 +106,11 @@ impl FieldNormReader {
 }
 
 #[cfg(test)]
-impl From<Vec<u32>> for FieldNormReader {
-    fn from(field_norms: Vec<u32>) -> FieldNormReader {
+impl From<&[u32]> for FieldNormReader {
+    fn from(field_norms: &[u32]) -> FieldNormReader {
         let field_norms_id = field_norms
-            .into_iter()
+            .iter()
+            .cloned()
             .map(FieldNormReader::fieldnorm_to_id)
             .collect::<Vec<u8>>();
         let field_norms_data = ReadOnlySource::from(field_norms_id);
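
The new doc comment only gestures at the compression scheme. Below is an illustrative, self-contained monotonic one-byte encoding in the same spirit — an assumption for exposition, not tantivy's actual `fieldnorm_to_id` / `id_to_fieldnorm`, which use a precomputed table: small fieldnorms keep near-exact precision, large ones land in logarithmic buckets, and having only 256 possible ids is what makes the "very short array" of precomputed scores feasible.

    // Illustrative monotonic byte encoding (NOT tantivy's real table).
    // Monotonicity holds because log2 is increasing; precision degrades
    // gracefully as the fieldnorm grows.
    fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
        let log = (1.0 + f64::from(fieldnorm)).log2();
        (4.0 * log).min(255.0) as u8
    }

    // Approximate inverse: returns a representative fieldnorm for the bucket.
    fn id_to_fieldnorm(id: u8) -> u32 {
        (2f64.powf(f64::from(id) / 4.0) - 1.0).round() as u32
    }

    fn main() {
        for &n in &[0u32, 1, 10, 100, 10_000] {
            let id = fieldnorm_to_id(n);
            println!("fieldnorm {:>6} -> id {:>3} -> approx {:>6}", n, id, id_to_fieldnorm(id));
        }
    }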

View File

@@ -78,11 +78,12 @@ impl FieldNormsWriter {
     }
 
     /// Serialize the seen fieldnorm values to the serializer for all fields.
-    pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
+    pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
         for &field in self.fields.iter() {
             let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
             fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
         }
+        fieldnorms_serializer.close()?;
         Ok(())
     }
 }
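
Taking the `FieldNormsSerializer` by value instead of `&mut` means `serialize` now owns it and closes it itself: once this method returns, the `.fieldnorm` file is finalized, and no caller can accidentally write to a serializer that has already been closed.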

View File

@@ -167,7 +167,7 @@ impl IndexMerger {
     fn write_fieldnorms(
         &self,
-        fieldnorms_serializer: &mut FieldNormsSerializer,
+        mut fieldnorms_serializer: FieldNormsSerializer,
     ) -> crate::Result<()> {
         let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
         let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
@@ -182,6 +182,7 @@ impl IndexMerger {
             }
             fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
         }
+        fieldnorms_serializer.close()?;
         Ok(())
     }
@@ -668,8 +669,10 @@ impl IndexMerger {
 impl SerializableSegment for IndexMerger {
     fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
+        if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
+            self.write_fieldnorms(fieldnorms_serializer)?;
+        }
         let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
-        self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
         self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
         self.write_storable_fields(serializer.get_store_writer())?;
         serializer.close()?;
@@ -1504,12 +1507,9 @@ mod tests {
         for i in 0..100 {
             let mut doc = Document::new();
             doc.add_f64(field, 42.0);
             doc.add_f64(multi_field, 0.24);
             doc.add_f64(multi_field, 0.27);
             writer.add_document(doc);
             if i % 5 == 0 {
                 writer.commit()?;
             }
@@ -1521,7 +1521,6 @@ mod tests {
         // If a merging thread fails, we should end up with more
         // than one segment here
         assert_eq!(1, index.searchable_segments()?.len());
         Ok(())
     }
 }
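
Two things change in `write`: the fieldnorm pass now runs before postings are serialized, and `write_fieldnorms` receives the serializer by value (extracted out of the `SegmentSerializer`), so it can close the `.fieldnorm` file itself rather than leaving that to `SegmentSerializer::close`.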

View File

@@ -8,15 +8,16 @@ use crate::store::StoreWriter;
 /// Segment serializer is in charge of laying out on disk
 /// the data accumulated and sorted by the `SegmentWriter`.
 pub struct SegmentSerializer {
+    segment: Segment,
     store_writer: StoreWriter,
     fast_field_serializer: FastFieldSerializer,
-    fieldnorms_serializer: FieldNormsSerializer,
+    fieldnorms_serializer: Option<FieldNormsSerializer>,
     postings_serializer: InvertedIndexSerializer,
 }
 
 impl SegmentSerializer {
     /// Creates a new `SegmentSerializer`.
-    pub fn for_segment(segment: &mut Segment) -> crate::Result<SegmentSerializer> {
+    pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
         let store_write = segment.open_write(SegmentComponent::STORE)?;
         let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
@@ -25,15 +26,21 @@ impl SegmentSerializer {
         let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
         let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
-        let postings_serializer = InvertedIndexSerializer::open(segment)?;
+        let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
         Ok(SegmentSerializer {
+            segment,
             store_writer: StoreWriter::new(store_write),
             fast_field_serializer,
-            fieldnorms_serializer,
+            fieldnorms_serializer: Some(fieldnorms_serializer),
             postings_serializer,
         })
     }
 
+    #[allow(dead_code)]
+    pub fn segment(&self) -> &Segment {
+        &self.segment
+    }
+
     /// Accessor to the `PostingsSerializer`.
     pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
         &mut self.postings_serializer
@@ -44,9 +51,11 @@ impl SegmentSerializer {
         &mut self.fast_field_serializer
     }
 
-    /// Accessor to the field norm serializer.
-    pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
-        &mut self.fieldnorms_serializer
+    /// Extract the field norm serializer.
+    ///
+    /// Note the fieldnorm serializer can only be extracted once.
+    pub fn extract_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
+        self.fieldnorms_serializer.take()
     }
 
     /// Accessor to the `StoreWriter`.
@@ -55,11 +64,13 @@ impl SegmentSerializer {
     }
 
     /// Finalize the segment serialization.
-    pub fn close(self) -> crate::Result<()> {
+    pub fn close(mut self) -> crate::Result<()> {
+        if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
+            fieldnorms_serializer.close()?;
+        }
         self.fast_field_serializer.close()?;
         self.postings_serializer.close()?;
         self.store_writer.close()?;
-        self.fieldnorms_serializer.close()?;
         Ok(())
     }
 }
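
The `Option` + `take()` dance is the load-bearing trick in this file: whoever extracts the `FieldNormsSerializer` owns it and is responsible for closing it, and `close()` on the `SegmentSerializer` only finalizes the fieldnorm file if nobody did. A self-contained model of the pattern, with stub types standing in for the real serializers:

    // Stub standing in for the real FieldNormsSerializer.
    struct FieldNormsSerializer;

    impl FieldNormsSerializer {
        fn close(self) -> std::io::Result<()> {
            Ok(())
        }
    }

    struct SegmentSerializer {
        fieldnorms_serializer: Option<FieldNormsSerializer>,
    }

    impl SegmentSerializer {
        // First call returns Some; every later call returns None.
        fn extract_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
            self.fieldnorms_serializer.take()
        }

        fn close(mut self) -> std::io::Result<()> {
            // If the fieldnorm serializer was never extracted, close it here
            // so the fieldnorm file is still written out.
            if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
                fieldnorms_serializer.close()?;
            }
            Ok(())
        }
    }

    fn main() -> std::io::Result<()> {
        let mut serializer = SegmentSerializer {
            fieldnorms_serializer: Some(FieldNormsSerializer),
        };
        assert!(serializer.extract_fieldnorms_serializer().is_some());
        assert!(serializer.extract_fieldnorms_serializer().is_none());
        serializer.close()
    }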

View File

@@ -112,7 +112,7 @@ fn merge(
     target_opstamp: Opstamp,
 ) -> crate::Result<SegmentEntry> {
     // First we need to apply deletes to our segment.
-    let mut merged_segment = index.new_segment();
+    let merged_segment = index.new_segment();
 
     // We apply all of the deletes to the merged segment, up to the target opstamp.
     for segment_entry in &mut segment_entries {
@@ -131,12 +131,13 @@ fn merge(
     let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
 
     // ... we just serialize this index merger in our new segment to merge the two segments.
-    let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
+    let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
     let num_docs = merger.write(segment_serializer)?;
 
-    let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
+    let merged_segment_id = merged_segment.id();
+    let segment_meta = index.new_segment_meta(merged_segment_id, num_docs);
 
     Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
 }
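
Because `SegmentSerializer::for_segment` now consumes its `Segment`, the merge path clones the handle before passing it in and keeps the original just long enough to read its id afterwards; a `Segment` is presumably a cheap, shareable handle here, so the clone costs little.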

View File

@@ -62,11 +62,12 @@ impl SegmentWriter {
     /// - schema
     pub fn for_segment(
         memory_budget: usize,
-        mut segment: Segment,
+        segment: Segment,
         schema: &Schema,
     ) -> crate::Result<SegmentWriter> {
+        let tokenizer_manager = segment.index().tokenizers().clone();
         let table_num_bits = initial_table_size(memory_budget)?;
-        let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
+        let segment_serializer = SegmentSerializer::for_segment(segment)?;
         let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
         let tokenizers = schema
             .fields()
@@ -76,7 +77,7 @@ impl SegmentWriter {
                     .get_indexing_options()
                     .and_then(|text_index_option| {
                         let tokenizer_name = &text_index_option.tokenizer();
-                        segment.index().tokenizers().get(tokenizer_name)
+                        tokenizer_manager.get(tokenizer_name)
                     }),
                 _ => None,
             },
@@ -280,9 +281,11 @@ fn write(
     fieldnorms_writer: &FieldNormsWriter,
     mut serializer: SegmentSerializer,
 ) -> crate::Result<()> {
+    if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
+        fieldnorms_writer.serialize(fieldnorms_serializer)?;
+    }
     let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
     fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
-    fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
     serializer.close()?;
     Ok(())
 }
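
The tokenizer manager clone is hoisted above the serializer construction because `SegmentSerializer::for_segment(segment)` now moves the `Segment`: the closure further down can no longer reach `segment.index()`, so it captures the cloned `tokenizer_manager` instead.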

View File

@@ -14,7 +14,7 @@ use std::fmt;
 /// - a field name
 /// - a field type, itself wrapping up options describing
 ///   how the field should be indexed.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, PartialEq)]
 pub struct FieldEntry {
     name: String,
     field_type: FieldType,

View File

@@ -48,7 +48,7 @@ pub enum Type {
 /// A `FieldType` describes the type (text, u64) of a field as well as
 /// how it should be handled by tantivy.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, PartialEq)]
 pub enum FieldType {
     /// String field type configuration
     Str(TextOptions),

View File

@@ -6,7 +6,7 @@ use std::borrow::Cow;
 use std::ops::BitOr;
 
 /// Define how a text field should be handled by tantivy.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct TextOptions {
     indexing: Option<TextFieldIndexing>,
     stored: bool,
@@ -51,7 +51,7 @@ impl Default for TextOptions {
 /// - the amount of information that should be stored about the presence of a term in a document.
 ///   Essentially, should we store the term frequency and/or the positions (See [`IndexRecordOption`](./enum.IndexRecordOption.html)).
 /// - the name of the `Tokenizer` that should be used to process the field.
-#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
+#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
 pub struct TextFieldIndexing {
     record: IndexRecordOption,
     tokenizer: Cow<'static, str>,
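
All three schema hunks drop `Eq` while keeping `PartialEq`. The commit message doesn't say why, but a plausible motive is the `f64` field support exercised in the merger test above: a type that carries (or may later carry) an `f64` can derive `PartialEq` but not `Eq`, because `NaN != NaN` breaks the reflexivity `Eq` promises. A minimal illustration with a hypothetical options struct — the `boost` field is invented for the example, not taken from tantivy:

    // Hypothetical options struct; deriving Eq here would not compile,
    // since f64 implements PartialEq but not Eq (NaN != NaN).
    #[derive(Clone, Debug, PartialEq)]
    struct NumericOptions {
        stored: bool,
        boost: f64, // invented field for illustration
    }

    fn main() {
        let a = NumericOptions { stored: true, boost: 2.0 };
        assert_eq!(a, a.clone()); // PartialEq still works for non-NaN values
    }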