From 1c9450174e83c1385bebcdf7353002d81c13c208 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Wed, 21 Mar 2018 17:36:16 +0900
Subject: [PATCH] Fieldnorm reader working except merge

---
 src/core/segment_reader.rs          |  5 +--
 src/fastfield/writer.rs             | 23 -------------
 src/fieldnorm/mod.rs                | 10 +++---
 src/fieldnorm/reader.rs             | 50 +++++++++++++++++++++++++++++
 src/fieldnorm/serializer.rs         | 23 ++++++++++---
 src/fieldnorm/writer.rs             | 49 ++++++++++++++++++++++------
 src/indexer/merger.rs               | 34 +++++++++-----------
 src/indexer/segment_serializer.rs   |  9 +++---
 src/indexer/segment_writer.rs       | 29 +++++------------
 src/lib.rs                          |  6 ++--
 src/postings/mod.rs                 |  9 ++++--
 src/query/term_query/mod.rs         |  7 ++--
 src/query/term_query/term_scorer.rs |  5 +--
 13 files changed, 161 insertions(+), 98 deletions(-)
 create mode 100644 src/fieldnorm/reader.rs

diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
index 85ddbfea0..cc2ddee16 100644
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -26,6 +26,7 @@ use schema::Schema;
 use termdict::TermDictionary;
 use fastfield::{FastValue, MultiValueIntFastFieldReader};
 use schema::Cardinality;
+use fieldnorm::FieldNormReader;
 
 /// Entry point to access all of the datastructures of the `Segment`
 ///
@@ -158,10 +159,10 @@ impl SegmentReader {
     ///
     /// They are simply stored as a fast field, serialized in
     /// the `.fieldnorm` file of the segment.
-    pub fn get_fieldnorms_reader(&self, field: Field) -> Option<FastFieldReader<u64>> {
+    pub fn get_fieldnorms_reader(&self, field: Field) -> Option<FieldNormReader> {
         self.fieldnorms_composite
             .open_read(field)
-            .map(FastFieldReader::open)
+            .map(FieldNormReader::open)
     }
 
     /// Accessor to the segment's `StoreReader`.
diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs
index 19cd5cef2..57e887be0 100644
--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -116,16 +116,6 @@ impl FastFieldsWriter {
         }
         Ok(())
     }
-
-    /// Ensures all of the fast field writers have
-    /// reached `doc`. (included)
-    ///
-    /// The missing values will be filled with 0.
-    pub fn fill_val_up_to(&mut self, doc: DocId) {
-        for field_writer in &mut self.single_value_writers {
-            field_writer.fill_val_up_to(doc);
-        }
-    }
 }
 
 /// Fast field writer for ints.
@@ -178,19 +168,6 @@ impl IntFastFieldWriter {
         self.val_if_missing = val_if_missing;
     }
 
-    /// Ensures all of the fast field writer have
-    /// reached `doc`. (included)
-    ///
-    /// The missing values will be filled with 0.
-    fn fill_val_up_to(&mut self, doc: DocId) {
-        let target = doc as usize + 1;
-        debug_assert!(self.val_count <= target);
-        let val_if_missing = self.val_if_missing;
-        while self.val_count < target {
-            self.add_val(val_if_missing);
-        }
-    }
-
     /// Records a new value.
     ///
     /// The n-th value being recorded is implicitely
diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs
index 6bb7ca19e..c1a28e045 100644
--- a/src/fieldnorm/mod.rs
+++ b/src/fieldnorm/mod.rs
@@ -1,10 +1,10 @@
-
-
-
 mod code;
 mod serializer;
 mod writer;
+mod reader;
 
+pub use self::reader::FieldNormReader;
 pub use self::writer::FieldNormsWriter;
-pub use self::code::fieldnorm_to_id;
-pub use self::code::id_to_fieldnorm;
\ No newline at end of file
+pub use self::serializer::FieldNormsSerializer;
+
+use self::code::{fieldnorm_to_id, id_to_fieldnorm};
\ No newline at end of file
diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs
new file mode 100644
index 000000000..a097dd2de
--- /dev/null
+++ b/src/fieldnorm/reader.rs
@@ -0,0 +1,50 @@
+use super::{id_to_fieldnorm, fieldnorm_to_id};
+use directory::ReadOnlySource;
+use DocId;
+
+pub struct FieldNormReader {
+    data: ReadOnlySource
+}
+
+impl FieldNormReader {
+
+    pub fn open(data: ReadOnlySource) -> Self {
+        FieldNormReader {
+            data
+        }
+    }
+
+    pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
+        let fieldnorm_id = self.fieldnorm_id(doc_id);
+        id_to_fieldnorm(fieldnorm_id)
+    }
+
+    #[inline(always)]
+    pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
+        let fieldnorms_data = self.data.as_slice();
+        fieldnorms_data[doc_id as usize]
+    }
+
+    #[inline(always)]
+    pub fn id_to_fieldnorm(id: u8) -> u32 {
+        id_to_fieldnorm(id)
+    }
+
+    #[inline(always)]
+    pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
+        fieldnorm_to_id(fieldnorm)
+    }
+}
+
+#[cfg(test)]
+impl From<Vec<u32>> for FieldNormReader {
+    fn from(field_norms: Vec<u32>) -> FieldNormReader {
+        let field_norms_id = field_norms.into_iter()
+            .map(FieldNormReader::fieldnorm_to_id)
+            .collect::<Vec<u8>>();
+        let field_norms_data = ReadOnlySource::from(field_norms_id);
+        FieldNormReader {
+            data: field_norms_data
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/fieldnorm/serializer.rs b/src/fieldnorm/serializer.rs
index 4a0d75f55..236ec39a9 100644
--- a/src/fieldnorm/serializer.rs
+++ b/src/fieldnorm/serializer.rs
@@ -1,22 +1,37 @@
 use directory::WritePtr;
 use std::io;
 use common::CompositeWrite;
+use schema::Field;
+use std::io::Write;
 
-pub struct FieldNormSerializer {
+pub struct FieldNormsSerializer {
     composite_write: CompositeWrite,
 }
 
-impl FieldNormSerializer {
+impl FieldNormsSerializer {
     /// Constructor
-    pub fn from_write(write: WritePtr) -> io::Result<FieldNormSerializer> {
+    pub fn from_write(write: WritePtr) -> io::Result<FieldNormsSerializer> {
         // just making room for the pointer to header.
         let composite_write = CompositeWrite::wrap(write);
-        Ok(FieldNormSerializer {
+        Ok(FieldNormsSerializer {
             composite_write
         })
     }
+
+    pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> {
+        let write = self.composite_write.for_field(field);
+        write.write_all(fieldnorms_data)?;
+        write.flush()?;
+        Ok(())
+    }
+
+    pub fn close(mut self) -> io::Result<()> {
+        self.composite_write.close()?;
+        Ok(())
+    }
+
 }
diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs
index db9a71878..3816e3ed8 100644
--- a/src/fieldnorm/writer.rs
+++ b/src/fieldnorm/writer.rs
@@ -1,29 +1,60 @@
 use DocId;
-use super::fieldnorm_to_id;
+
 use schema::Field;
+use super::FieldNormsSerializer;
+use std::io;
+use schema::Schema;
+use super::fieldnorm_to_id;
 
 pub struct FieldNormsWriter {
+    fields: Vec<Field>,
     fieldnorms_buffer: Vec<Vec<u8>>
 }
 
 impl FieldNormsWriter {
-    pub fn new(num_fields: usize) -> FieldNormsWriter {
+    pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
+        let fields = schema
+            .fields()
+            .iter()
+            .enumerate()
+            .filter(|&(_, field_entry)| {
+                field_entry.is_indexed()
+            })
+            .map(|(field, _)| Field(field as u32))
+            .collect::<Vec<Field>>();
+        let max_field = fields
+            .iter()
+            .map(|field| field.0)
+            .max()
+            .map(|max_field_id| max_field_id as usize + 1)
+            .unwrap_or(0);
         FieldNormsWriter {
-            fieldnorms_buffer: (0..num_fields)
+            fields,
+            fieldnorms_buffer: (0..max_field)
                 .map(|_| Vec::new())
                 .collect::<Vec<_>>()
         }
     }
 
+    pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
+        for &field in self.fields.iter() {
+            self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
+        }
+    }
+
     pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
         let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
-        assert!(fieldnorm_buffer.len() < doc as usize, "Cannot register a given fieldnorm twice");
+        assert!(fieldnorm_buffer.len() <= doc as usize, "Cannot register a given fieldnorm twice");
         // we fill intermediary `DocId` as having a fieldnorm of 0.
-        fieldnorm_buffer.resize(doc as usize, 0u8);
+        fieldnorm_buffer.resize(doc as usize + 1, 0u8);
         fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
     }
-//
-//    pub fn serialize(self) {
-//
-//    }
+
+    pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
+        for &field in self.fields.iter() {
+            let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
+            fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
+        }
+        Ok(())
+    }
 }
\ No newline at end of file
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 99a101f63..0efb262df 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -17,6 +17,7 @@ use store::StoreWriter;
 use std::cmp::{max, min};
 use termdict::TermDictionary;
 use termdict::TermStreamer;
+use fieldnorm::FieldNormsSerializer;
 
 pub struct IndexMerger {
     schema: Schema,
@@ -46,12 +47,6 @@ fn compute_min_max_val(
     }
 }
 
-
-fn extract_fieldnorm_reader(
-    segment_reader: &SegmentReader,
-    field: Field,
-) -> Option<FastFieldReader<u64>> {
-    segment_reader.get_fieldnorms_reader(field)
-}
 
 fn extract_fast_field_reader(
     segment_reader: &SegmentReader,
@@ -102,19 +97,20 @@ impl IndexMerger {
         })
     }
 
-    fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
-        let fieldnorm_fastfields: Vec<Field> = self.schema
-            .fields()
-            .iter()
-            .enumerate()
-            .filter(|&(_, field_entry)| field_entry.is_indexed())
-            .map(|(field_id, _)| Field(field_id as u32))
-            .collect();
-        self.generic_write_fast_field(
-            fieldnorm_fastfields,
-            &extract_fieldnorm_reader,
-            fast_field_serializer,
-        )
+    fn write_fieldnorms(&self, fast_field_serializer: &mut FieldNormsSerializer) -> Result<()> {
+        unimplemented!("Not implemented yet");
+//        let fieldnorm_fastfields: Vec<Field> = self.schema
+//            .fields()
+//            .iter()
+//            .enumerate()
+//            .filter(|&(_, field_entry)| field_entry.is_indexed())
+//            .map(|(field_id, _)| Field(field_id as u32))
+//            .collect();
+//        self.generic_write_fast_field(
+//            fieldnorm_fastfields,
+//            &extract_fieldnorm_reader,
+//            fast_field_serializer,
+//        )
     }
 
     fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs
index dd836b835..036a998b8 100644
--- a/src/indexer/segment_serializer.rs
+++ b/src/indexer/segment_serializer.rs
@@ -4,6 +4,7 @@ use core::Segment;
 use core::SegmentComponent;
 use fastfield::FastFieldSerializer;
 use store::StoreWriter;
+use fieldnorm::FieldNormsSerializer;
 use postings::InvertedIndexSerializer;
 
 /// Segment serializer is in charge of laying out on disk
@@ -11,7 +12,7 @@ use postings::InvertedIndexSerializer;
 pub struct SegmentSerializer {
     store_writer: StoreWriter,
     fast_field_serializer: FastFieldSerializer,
-    fieldnorms_serializer: FastFieldSerializer,
+    fieldnorms_serializer: FieldNormsSerializer,
     postings_serializer: InvertedIndexSerializer,
 }
 
@@ -24,14 +25,14 @@ impl SegmentSerializer {
         let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;
 
         let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
-        let fieldnorms_serializer = FastFieldSerializer::from_write(fieldnorms_write)?;
+        let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
 
         let postings_serializer = InvertedIndexSerializer::open(segment)?;
         Ok(SegmentSerializer {
-            postings_serializer,
             store_writer: StoreWriter::new(store_write),
             fast_field_serializer,
            fieldnorms_serializer,
+            postings_serializer,
         })
     }
 
@@ -46,7 +47,7 @@ impl SegmentSerializer {
     }
 
     /// Accessor to the field norm serializer.
-    pub fn get_fieldnorms_serializer(&mut self) -> &mut FastFieldSerializer {
+    pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
         &mut self.fieldnorms_serializer
     }
 
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index b46466f47..4e6cfed73 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -19,6 +19,7 @@ use tokenizer::BoxedTokenizer;
 use tokenizer::FacetTokenizer;
 use tokenizer::{TokenStream, Tokenizer};
 use schema::Value;
+use fieldnorm::FieldNormsWriter;
 
 /// A `SegmentWriter` is in charge of creating segment index from a
 /// documents.
@@ -31,21 +32,11 @@ pub struct SegmentWriter<'a> {
     multifield_postings: MultiFieldPostingsWriter<'a>,
     segment_serializer: SegmentSerializer,
     fast_field_writers: FastFieldsWriter,
-    fieldnorms_writer: FastFieldsWriter,
+    fieldnorms_writer: FieldNormsWriter,
     doc_opstamps: Vec<u64>,
     tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
 }
 
-fn create_fieldnorms_writer(schema: &Schema) -> FastFieldsWriter {
-    let u64_fields: Vec<Field> = schema
-        .fields()
-        .iter()
-        .enumerate()
-        .filter(|&(_, field_entry)| field_entry.is_indexed())
-        .map(|(field_id, _)| Field(field_id as u32))
-        .collect();
-    FastFieldsWriter::new(u64_fields)
-}
 
 impl<'a> SegmentWriter<'a> {
     /// Creates a new `SegmentWriter`
@@ -83,7 +74,7 @@ impl<'a> SegmentWriter<'a> {
             heap,
             max_doc: 0,
             multifield_postings,
-            fieldnorms_writer: create_fieldnorms_writer(schema),
+            fieldnorms_writer: FieldNormsWriter::for_schema(schema),
             segment_serializer,
             fast_field_writers: FastFieldsWriter::from_schema(schema),
             doc_opstamps: Vec::with_capacity(1_000),
@@ -95,7 +86,8 @@ impl<'a> SegmentWriter<'a> {
     ///
     /// Finalize consumes the `SegmentWriter`, so that it cannot
     /// be used afterwards.
-    pub fn finalize(self) -> Result<Vec<u64>> {
+    pub fn finalize(mut self) -> Result<Vec<u64>> {
+        self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
         write(
             &self.multifield_postings,
             &self.fast_field_writers,
@@ -190,10 +182,7 @@ impl<'a> SegmentWriter<'a> {
                             0
                         };
                         self.fieldnorms_writer
-                            .get_field_writer(field)
-                            .map(|field_norms_writer| {
-                                field_norms_writer.add_val(u64::from(num_tokens))
-                            });
+                            .record(doc_id, field, num_tokens);
                     }
                     FieldType::U64(ref int_option) => {
                         if int_option.is_indexed() {
@@ -219,7 +208,6 @@ impl<'a> SegmentWriter<'a> {
                 }
             }
         }
-        self.fieldnorms_writer.fill_val_up_to(doc_id);
         doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
         let doc_writer = self.segment_serializer.get_store_writer();
         doc_writer.store(&doc)?;
@@ -252,14 +240,13 @@ impl<'a> SegmentWriter<'a> {
 fn write(
     multifield_postings: &MultiFieldPostingsWriter,
     fast_field_writers: &FastFieldsWriter,
-    fieldnorms_writer: &FastFieldsWriter,
+    fieldnorms_writer: &FieldNormsWriter,
     mut serializer: SegmentSerializer,
 ) -> Result<()> {
     let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
     fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
-    fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer(), &HashMap::new())?;
+    fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
     serializer.close()?;
-
     Ok(())
 }
diff --git a/src/lib.rs b/src/lib.rs
index e807d2814..3fe4d9005 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -418,9 +418,9 @@ mod tests {
         let searcher = index.searcher();
         let segment_reader: &SegmentReader = searcher.segment_reader(0);
         let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap();
-        assert_eq!(fieldnorms_reader.get(0), 3);
-        assert_eq!(fieldnorms_reader.get(1), 0);
-        assert_eq!(fieldnorms_reader.get(2), 2);
+        assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
+        assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
+        assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
     }
 }
 
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 4ed54bf59..dfbed6f9b 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -57,6 +57,7 @@ pub mod tests {
     use indexer::operation::AddOperation;
     use tests;
     use rand::{Rng, SeedableRng, XorShiftRng};
+    use fieldnorm::FieldNormReader;
 
     #[test]
     pub fn test_position_write() {
@@ -196,10 +197,12 @@ pub mod tests {
         let segment_reader = SegmentReader::open(&segment).unwrap();
         {
             let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap();
-            assert_eq!(fieldnorm_reader.get(0), 8 + 5);
-            assert_eq!(fieldnorm_reader.get(1), 2);
+            assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
+            assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
             for i in 2..1000 {
-                assert_eq!(fieldnorm_reader.get(i), (i + 1) as u64);
+                assert_eq!(
+                    fieldnorm_reader.fieldnorm_id(i),
+                    FieldNormReader::fieldnorm_to_id(i + 1));
             }
         }
         {
diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs
index 11e70aa8a..34ff31c19 100644
--- a/src/query/term_query/mod.rs
+++ b/src/query/term_query/mod.rs
@@ -16,6 +16,7 @@ mod tests {
     use query::TermQuery;
     use Index;
     use schema::*;
+    use fieldnorm::FieldNormReader;
     use schema::IndexRecordOption;
     use fastfield::FastFieldReader;
 
@@ -55,9 +56,9 @@ mod tests {
 
     #[test]
     pub fn test_term_scorer() {
-        let left_fieldnorms = FastFieldReader::from(vec![10, 4]);
-        assert_eq!(left_fieldnorms.get(0), 10);
-        assert_eq!(left_fieldnorms.get(1), 4);
+        let left_fieldnorms = FieldNormReader::from(vec![10, 4]);
+        assert_eq!(left_fieldnorms.fieldnorm(0), 10);
+        assert_eq!(left_fieldnorms.fieldnorm(1), 4);
         let left = SegmentPostings::create_from_docs(&[1]);
         let mut left_scorer = TermScorer {
             idf: 0.30685282,
diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs
index d8352780c..a4ffaa411 100644
--- a/src/query/term_query/term_scorer.rs
+++ b/src/query/term_query/term_scorer.rs
@@ -5,10 +5,11 @@ use postings::SegmentPostings;
 use query::Scorer;
 use postings::Postings;
 use fastfield::FastFieldReader;
+use fieldnorm::FieldNormReader;
 
 pub struct TermScorer {
     pub idf: Score,
-    pub fieldnorm_reader_opt: Option<FastFieldReader<u64>>,
+    pub fieldnorm_reader_opt: Option<FieldNormReader>,
     pub postings: SegmentPostings,
 }
 
@@ -41,7 +42,7 @@ impl Scorer for TermScorer {
         let doc = self.postings.doc();
         let tf = match self.fieldnorm_reader_opt {
             Some(ref fieldnorm_reader) => {
-                let field_norm = fieldnorm_reader.get(doc);
+                let field_norm = fieldnorm_reader.fieldnorm(doc);
                 (self.postings.term_freq() as f32 / field_norm as f32)
             }
             None => self.postings.term_freq() as f32,