From 9a706c296af3e1cdee815bd8036bf988bc6c649e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 19 Feb 2018 21:50:33 +0900 Subject: [PATCH 1/4] Larger union horizon --- src/query/union.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/union.rs b/src/query/union.rs index 88df78b2e..1f85b65e7 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -6,7 +6,7 @@ use DocId; use Score; use query::score_combiner::{DoNothingCombiner, ScoreCombiner}; -const HORIZON_NUM_TINYBITSETS: usize = 32; +const HORIZON_NUM_TINYBITSETS: usize = 64; const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32; /// Creates a `DocSet` that iterator through the intersection of two `DocSet`s. From 2a843d86cb71c86c0e48245a4896e03be77342c2 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 19 Feb 2018 21:51:39 +0900 Subject: [PATCH 2/4] Code cleaning --- src/fastfield/mod.rs | 52 ++++++++++++++++++-------------------------- src/macros.rs | 2 +- 2 files changed, 22 insertions(+), 32 deletions(-) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index ffca841b7..1ebe1899c 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -70,12 +70,6 @@ mod tests { }; } - fn add_single_field_doc(fast_field_writers: &mut FastFieldsWriter, field: Field, value: u64) { - let mut doc = Document::default(); - doc.add_u64(field, value); - fast_field_writers.add_document(&doc); - } - #[test] pub fn test_fastfield() { let test_fastfield = U64FastFieldReader::from(vec![100, 200, 300]); @@ -92,9 +86,9 @@ mod tests { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); - add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64); + fast_field_writers.add_document(&doc!(*FIELD=>13u64)); + fast_field_writers.add_document(&doc!(*FIELD=>14u64)); + fast_field_writers.add_document(&doc!(*FIELD=>2u64)); fast_field_writers .serialize(&mut serializer, &HashMap::new()) .unwrap(); @@ -122,15 +116,15 @@ mod tests { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); - add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 777u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64); - add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64); + fast_field_writers.add_document(&doc!(*FIELD=>4u64)); + fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64)); + fast_field_writers.add_document(&doc!(*FIELD=>3_052u64)); + fast_field_writers.add_document(&doc!(*FIELD=>9_002u64)); + fast_field_writers.add_document(&doc!(*FIELD=>15_001u64)); + fast_field_writers.add_document(&doc!(*FIELD=>777u64)); + fast_field_writers.add_document(&doc!(*FIELD=>1_002u64)); + fast_field_writers.add_document(&doc!(*FIELD=>1_501u64)); + fast_field_writers.add_document(&doc!(*FIELD=>215u64)); fast_field_writers .serialize(&mut serializer, &HashMap::new()) .unwrap(); @@ -166,7 +160,7 @@ mod tests { let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for _ in 0..10_000 { - add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64); + fast_field_writers.add_document(&doc!(*FIELD=>100_000u64)); } fast_field_writers .serialize(&mut serializer, &HashMap::new()) @@ -197,13 +191,9 @@ mod tests { let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); // forcing the amplitude to be high - add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64); + fast_field_writers.add_document(&doc!(*FIELD=>0u64)); for i in 0u64..10_000u64 { - add_single_field_doc( - &mut fast_field_writers, - *FIELD, - 5_000_000_000_000_000_000u64 + i, - ); + fast_field_writers.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + i)); } fast_field_writers .serialize(&mut serializer, &HashMap::new()) @@ -320,8 +310,8 @@ mod tests { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); - for x in &permutation { - add_single_field_doc(&mut fast_field_writers, *FIELD, *x); + for &x in &permutation { + fast_field_writers.add_document(&doc!(*FIELD=>x)); } fast_field_writers .serialize(&mut serializer, &HashMap::new()) @@ -377,8 +367,8 @@ mod tests { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); - for x in &permutation { - add_single_field_doc(&mut fast_field_writers, *FIELD, *x); + for &x in &permutation { + fast_field_writers.add_document(&doc!(*FIELD=>x)); } fast_field_writers .serialize(&mut serializer, &HashMap::new()) @@ -411,8 +401,8 @@ mod tests { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); - for x in &permutation { - add_single_field_doc(&mut fast_field_writers, *FIELD, *x); + for &x in &permutation { + fast_field_writers.add_document(&doc!(*FIELD=>x)); } fast_field_writers .serialize(&mut serializer, &HashMap::new()) diff --git a/src/macros.rs b/src/macros.rs index 1dcca3181..525108c58 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -54,7 +54,7 @@ macro_rules! doc( ($crate::Document::default()) } }; // avoids a warning due to the useless `mut`. - ($($field:ident => $value:expr),*) => { + ($($field:expr => $value:expr),*) => { { let mut document = $crate::Document::default(); $( From 43742a93ef5e0c35a4f1d3f65d29402c35a85a81 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 19 Feb 2018 23:41:03 +0900 Subject: [PATCH 3/4] Multivalue u64 field / i64 field. --- src/collector/mod.rs | 2 +- src/core/segment_reader.rs | 33 ++++----- src/datastruct/stacker/hashmap.rs | 4 +- src/fastfield/facet_reader.rs | 8 +-- src/fastfield/mod.rs | 12 ++++ src/fastfield/multivalued/mod.rs | 108 ++++++++++++++++++++++++++++ src/fastfield/multivalued/reader.rs | 45 +++++++++--- src/fastfield/multivalued/writer.rs | 62 +++++++++++++--- src/fastfield/serializer.rs | 2 +- src/fastfield/writer.rs | 20 ++---- src/indexer/merger.rs | 12 ++-- src/indexer/segment_writer.rs | 1 - src/lib.rs | 11 +-- src/postings/mod.rs | 2 +- src/postings/postings_writer.rs | 2 +- 15 files changed, 254 insertions(+), 70 deletions(-) diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 218ead2e9..f8867abc1 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -167,7 +167,7 @@ pub mod tests { impl Collector for FastFieldTestCollector { fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { - self.ff_reader = Some(reader.get_fast_field_reader(self.field)?); + self.ff_reader = Some(reader.fast_field_reader(self.field)?); Ok(()) } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 6f66c5f17..c97be0bb7 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -91,7 +91,7 @@ impl SegmentReader { /// /// # Panics /// May panic if the index is corrupted. - pub fn get_fast_field_reader( + pub fn fast_field_reader( &self, field: Field, ) -> fastfield::Result { @@ -116,7 +116,7 @@ impl SegmentReader { field_entry )).into()); } - let term_ords_reader = self.multi_value_reader(field)?; + let term_ords_reader = self.multi_fast_field_reader(field)?; let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| { ErrorKind::InvalidArgument(format!( "The field \"{}\" is a hierarchical \ @@ -130,20 +130,6 @@ impl SegmentReader { Ok(facet_reader) } - /// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`. - pub fn multi_value_reader(&self, field: Field) -> Result { - let field_entry = self.schema.get_field_entry(field); - let idx_reader = self.fast_fields_composite - .open_read_with_idx(field, 0) - .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(U64FastFieldReader::open)?; - let vals_reader = self.fast_fields_composite - .open_read_with_idx(field, 1) - .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(U64FastFieldReader::open)?; - Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader)) - } - /// Accessor to the segment's `Field norms`'s reader. /// /// Field norms are the length (in tokens) of the fields. @@ -158,6 +144,21 @@ impl SegmentReader { .map(U64FastFieldReader::open) } + /// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`. + /// + pub fn multi_fast_field_reader(&self, field: Field) -> Result> { + let field_entry = self.schema.get_field_entry(field); + let idx_reader = self.fast_fields_composite + .open_read_with_idx(field, 0) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(U64FastFieldReader::open)?; + let vals_reader = self.fast_fields_composite + .open_read_with_idx(field, 1) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(U64FastFieldReader::open)?; + Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader)) + } + /// Accessor to the segment's `StoreReader`. pub fn get_store_reader(&self) -> &StoreReader { &self.store_reader diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index 6e804889b..f1a61702c 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -189,11 +189,11 @@ impl<'a> TermHashMap<'a> { let (addr, val): (u32, &mut V) = self.heap.allocate_object(); assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32); self.set_bucket(hash, key_bytes_ref, bucket); - return (bucket, val); + return (bucket as UnorderedTermId, val); } else if kv.hash == hash { let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr); if stored_key == key_bytes { - return (bucket, self.heap.get_mut_ref(expull_addr)); + return (bucket as UnorderedTermId, self.heap.get_mut_ref(expull_addr)); } } } diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index ea5a9e25a..049311094 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -18,7 +18,7 @@ use termdict::{TermDictionary, TermDictionaryImpl}; /// list of facets. This ordinal is segment local and /// only makes sense for a given segment. pub struct FacetReader { - term_ords: MultiValueIntFastFieldReader, + term_ords: MultiValueIntFastFieldReader, term_dict: TermDictionaryImpl, } @@ -31,12 +31,12 @@ impl FacetReader { /// - a `TermDictionaryImpl` that helps associating a facet to /// an ordinal and vice versa. pub fn new( - term_ords: MultiValueIntFastFieldReader, + term_ords: MultiValueIntFastFieldReader, term_dict: TermDictionaryImpl, ) -> FacetReader { FacetReader { - term_ords: term_ords, - term_dict: term_dict, + term_ords, + term_dict } } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 1ebe1899c..5a3590b9a 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -41,6 +41,18 @@ pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; pub use self::multivalued::MultiValueIntFastFieldReader; +use common; +use schema::Value; + +fn value_to_u64(value: &Value) -> u64 { + match *value { + Value::U64(ref val) => *val, + Value::I64(ref val) => common::i64_to_u64(*val), + _ => panic!("Expected a u64/i64 field, got {:?} ", value), + } +} + + #[cfg(test)] mod tests { use super::*; diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 0043e7783..60ef673f8 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -3,3 +3,111 @@ mod reader; pub use self::writer::MultiValueIntFastFieldWriter; pub use self::reader::MultiValueIntFastFieldReader; + + + +#[cfg(test)] +mod tests { + + use schema::SchemaBuilder; + use schema::Cardinality; + use schema::IntOptions; + use Index; + + #[test] + fn test_multivalued_u64() { + let mut schema_builder = SchemaBuilder::default(); + let field = schema_builder.add_u64_field( + "multifield", + IntOptions::default().set_fast(Cardinality::MultiValues) + ); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(field=>1u64, field=>3u64)); + index_writer.add_document(doc!()); + index_writer.add_document(doc!(field=>4u64)); + index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64)); + assert!(index_writer.commit().is_ok()); + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let reader = searcher.segment_reader(0); + let mut vals = Vec::new(); + let multi_value_reader = reader.multi_fast_field_reader::(field).unwrap(); + { + multi_value_reader.get_vals(2, &mut vals); + assert_eq!(&vals, &[4u64]); + } + { + multi_value_reader.get_vals(0, &mut vals); + assert_eq!(&vals, &[1u64, 3u64]); + } + { + multi_value_reader.get_vals(1, &mut vals); + assert!(vals.is_empty()); + } + } + + + #[test] + fn test_multivalued_i64() { + let mut schema_builder = SchemaBuilder::default(); + let field = schema_builder.add_i64_field( + "multifield", + IntOptions::default().set_fast(Cardinality::MultiValues) + ); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(field=> 1i64, field => 3i64)); + index_writer.add_document(doc!()); + index_writer.add_document(doc!(field=> -4i64)); + index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64)); + assert!(index_writer.commit().is_ok()); + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let reader = searcher.segment_reader(0); + let mut vals = Vec::new(); + let multi_value_reader = reader.multi_fast_field_reader::(field).unwrap(); + { + multi_value_reader.get_vals(2, &mut vals); + assert_eq!(&vals, &[-4i64]); + } + { + multi_value_reader.get_vals(0, &mut vals); + assert_eq!(&vals, &[1i64, 3i64]); + } + { + multi_value_reader.get_vals(1, &mut vals); + assert!(vals.is_empty()); + } + { + multi_value_reader.get_vals(3, &mut vals); + assert_eq!(&vals, &[-5i64, -20i64, 1i64]); + } + } + + #[test] + #[should_panic] + fn test_multivalued_unreachable() { + let mut schema_builder = SchemaBuilder::default(); + let field = schema_builder.add_i64_field( + "multifield", + IntOptions::default().set_fast(Cardinality::MultiValues) + ); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(field=> 1i64, field => 3i64)); + assert!(index_writer.commit().is_ok()); + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let reader = searcher.segment_reader(0); + let multi_value_reader = reader.multi_fast_field_reader::(field).unwrap(); + let mut vals = Vec::new(); + multi_value_reader.get_vals(0, &mut vals); + } +} \ No newline at end of file diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 147cee89a..cdcf51d56 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -1,7 +1,11 @@ use DocId; use fastfield::FastFieldReader; - use fastfield::U64FastFieldReader; +use std::marker::PhantomData; +use common; + + + /// Reader for a multivalued `u64` fast field. /// @@ -12,30 +16,55 @@ use fastfield::U64FastFieldReader; /// The `idx_reader` associated, for each document, the index of its first value. /// #[derive(Clone)] -pub struct MultiValueIntFastFieldReader { +pub struct MultiValueIntFastFieldReader { idx_reader: U64FastFieldReader, vals_reader: U64FastFieldReader, + __phantom__: PhantomData } -impl MultiValueIntFastFieldReader { +trait ConvertU64 { + fn from_u64(val: u64) -> Item; +} + +impl ConvertU64 for MultiValueIntFastFieldReader { + default fn from_u64(_: u64) -> Item { + unimplemented!("MultiValueIntFastField only exists for u64 and i64."); + } +} + +impl ConvertU64 for MultiValueIntFastFieldReader { + fn from_u64(val: u64) -> u64 { + val + } +} + +impl ConvertU64 for MultiValueIntFastFieldReader { + fn from_u64(val: u64) -> i64 { + common::u64_to_i64(val) + } +} + + +impl MultiValueIntFastFieldReader { pub(crate) fn open( idx_reader: U64FastFieldReader, vals_reader: U64FastFieldReader, - ) -> MultiValueIntFastFieldReader { + ) -> MultiValueIntFastFieldReader { MultiValueIntFastFieldReader { - idx_reader: idx_reader, - vals_reader: vals_reader, + idx_reader, + vals_reader, + __phantom__: PhantomData, } } /// Returns the array of values associated to the given `doc`. - pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { + pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { let start = self.idx_reader.get(doc) as u32; let stop = self.idx_reader.get(doc + 1) as u32; vals.clear(); for val_id in start..stop { let val = self.vals_reader.get(val_id); - vals.push(val); + vals.push(Self::from_u64(val)); } } } diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index d988656ce..15b3fa91e 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -1,22 +1,28 @@ use fastfield::FastFieldSerializer; +use fastfield::serializer::FastSingleFieldSerializer; +use fastfield::value_to_u64; use std::collections::HashMap; use postings::UnorderedTermId; -use schema::Field; +use schema::{Document, Field}; use std::io; +use itertools::Itertools; + pub struct MultiValueIntFastFieldWriter { field: Field, - vals: Vec, + vals: Vec, doc_index: Vec, + is_facet: bool } impl MultiValueIntFastFieldWriter { /// Creates a new `IntFastFieldWriter` - pub fn new(field: Field) -> Self { + pub fn new(field: Field, is_facet: bool) -> Self { MultiValueIntFastFieldWriter { - field: field, + field, vals: Vec::new(), doc_index: Vec::new(), + is_facet } } @@ -37,11 +43,32 @@ impl MultiValueIntFastFieldWriter { self.vals.push(val); } - /// Push the fast fields value to the `FastFieldWriter`. + pub fn add_document(&mut self, doc: &Document) { + if !self.is_facet { + for field_value in doc.field_values() { + if field_value.field() == self.field { + self.add_val(value_to_u64(field_value.value())); + } + } + } + + } + + /// Serializes fast field values by pushing them to the `FastFieldSerializer`. + /// + /// HashMap makes it possible to remap them before serializing. + /// Specifically, string terms are first stored in the writer as their + /// position in the `IndexWriter`'s `HashMap`. This value is called + /// an `UnorderedTermId`. + /// + /// During the serialization of the segment, terms gets sorted and + /// `tantivy` builds a mapping to convert this `UnorderedTermId` into + /// term ordinals. + /// pub fn serialize( &self, serializer: &mut FastFieldSerializer, - mapping: &HashMap, + mapping_opt: Option<&HashMap>, ) -> io::Result<()> { { // writing the offset index @@ -55,10 +82,25 @@ impl MultiValueIntFastFieldWriter { } { // writing the values themselves. - let mut value_serializer = - serializer.new_u64_fast_field_with_idx(self.field, 0u64, mapping.len() as u64, 1)?; - for val in &self.vals { - value_serializer.add_val(*mapping.get(val).expect("Missing term ordinal") as u64)?; + let mut value_serializer: FastSingleFieldSerializer<_>; + match mapping_opt { + Some(mapping) => { + value_serializer = + serializer.new_u64_fast_field_with_idx(self.field, 0u64, mapping.len() as u64, 1)?; + for val in &self.vals { + let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64; + value_serializer.add_val(remapped_val)?; + } + } + None => { + let val_min_max = self.vals.iter().cloned().minmax(); + let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0)); + value_serializer = + serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?; + for &val in &self.vals { + value_serializer.add_val(val)?; + } + } } value_serializer.close_field()?; } diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index 43b55daf0..208b9e2ea 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -37,7 +37,7 @@ impl FastFieldSerializer { // just making room for the pointer to header. let composite_write = CompositeWrite::wrap(write); Ok(FastFieldSerializer { - composite_write: composite_write, + composite_write }) } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 7248b93e1..ab9b3a6c9 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,7 +1,6 @@ use schema::{Cardinality, Document, Field, Schema}; use fastfield::FastFieldSerializer; use std::io; -use schema::Value; use DocId; use schema::FieldType; use common; @@ -39,14 +38,14 @@ impl FastFieldsWriter { single_value_writers.push(fast_field_writer); } Some(Cardinality::MultiValues) => { - let fast_field_writer = MultiValueIntFastFieldWriter::new(field); + let fast_field_writer = MultiValueIntFastFieldWriter::new(field, false); multi_values_writers.push(fast_field_writer); } None => {} } } FieldType::HierarchicalFacet => { - let fast_field_writer = MultiValueIntFastFieldWriter::new(field); + let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true); multi_values_writers.push(fast_field_writer); } _ => {} @@ -97,6 +96,7 @@ impl FastFieldsWriter { } for field_writer in &mut self.multi_values_writers { field_writer.next_doc(); + field_writer.add_document(doc); } } @@ -112,11 +112,7 @@ impl FastFieldsWriter { } for field_writer in &self.multi_values_writers { let field = field_writer.field(); - if let Some(mapping) = mapping.get(&field) { - field_writer.serialize(serializer, mapping)?; - } else { - panic!("Term ordinal mapping missing for {:?}", field); - } + field_writer.serialize(serializer, mapping.get(&field))?; } Ok(()) } @@ -160,7 +156,7 @@ impl IntFastFieldWriter { /// Creates a new `IntFastFieldWriter` pub fn new(field: Field) -> IntFastFieldWriter { IntFastFieldWriter { - field: field, + field, vals: Vec::new(), val_count: 0, val_if_missing: 0u64, @@ -227,11 +223,7 @@ impl IntFastFieldWriter { /// only the first one is taken in account. fn extract_val(&self, doc: &Document) -> u64 { match doc.get_first(self.field) { - Some(v) => match *v { - Value::U64(ref val) => *val, - Value::I64(ref val) => common::i64_to_u64(*val), - _ => panic!("Expected a u64field, got {:?} ", v), - }, + Some(v) => super::value_to_u64(v), None => self.val_if_missing, } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index ce2671167..a2ee1b00c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -58,7 +58,7 @@ fn extract_fast_field_reader( segment_reader: &SegmentReader, field: Field, ) -> Option { - segment_reader.get_fast_field_reader(field).ok() + segment_reader.fast_field_reader(field).ok() } struct DeltaComputer { @@ -630,14 +630,14 @@ mod tests { let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) - .get_fast_field_reader(score_field) + .fast_field_reader(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 1); assert_eq!(score_field_reader.max_value(), 3); let score_field_reader: U64FastFieldReader = searcher .segment_reader(1) - .get_fast_field_reader(score_field) + .fast_field_reader(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 4000); assert_eq!(score_field_reader.max_value(), 7000); @@ -687,7 +687,7 @@ mod tests { ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) - .get_fast_field_reader(score_field) + .fast_field_reader(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 3); assert_eq!(score_field_reader.max_value(), 7000); @@ -733,7 +733,7 @@ mod tests { ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) - .get_fast_field_reader(score_field) + .fast_field_reader(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 3); assert_eq!(score_field_reader.max_value(), 7000); @@ -784,7 +784,7 @@ mod tests { ); let score_field_reader: U64FastFieldReader = searcher .segment_reader(0) - .get_fast_field_reader(score_field) + .fast_field_reader(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 6000); assert_eq!(score_field_reader.max_value(), 7000); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index d3fcbd736..b46466f47 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -160,7 +160,6 @@ impl<'a> SegmentWriter<'a> { self.multifield_postings.subscribe(doc_id, &term); unordered_term_id_opt = Some(unordered_term_id); }); - if let Some(unordered_term_id) = unordered_term_id_opt { self.fast_field_writers .get_multivalue_writer(field) diff --git a/src/lib.rs b/src/lib.rs index bd8c31e80..bcf3cf451 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ #![cfg_attr(feature = "cargo-clippy", allow(inline_always))] #![feature(box_syntax)] #![feature(optin_builtin_traits)] +#![feature(specialization)] #![feature(conservative_impl_trait)] #![feature(collections_range)] #![feature(integer_atomics)] @@ -857,22 +858,22 @@ mod tests { let segment_reader: &SegmentReader = searcher.segment_reader(0); { let fast_field_reader_res = - segment_reader.get_fast_field_reader::(text_field); + segment_reader.fast_field_reader::(text_field); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = - segment_reader.get_fast_field_reader::(stored_int_field); + segment_reader.fast_field_reader::(stored_int_field); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = - segment_reader.get_fast_field_reader::(fast_field_signed); + segment_reader.fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = - segment_reader.get_fast_field_reader::(fast_field_signed); + segment_reader.fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) @@ -880,7 +881,7 @@ mod tests { { let fast_field_reader_res = - segment_reader.get_fast_field_reader::(fast_field_signed); + segment_reader.fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 112519b99..4427b94eb 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -25,7 +25,7 @@ pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings}; pub use common::HasLen; -pub(crate) type UnorderedTermId = usize; +pub(crate) type UnorderedTermId = u64; #[allow(enum_variant_names)] pub(crate) enum FreqReadingOption { diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 2b9e4d677..44d016088 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -221,7 +221,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' heap: &Heap, ) -> UnorderedTermId { debug_assert!(term.as_slice().len() >= 4); - let (term_ord, recorder): (usize, &mut Rec) = term_index.get_or_create(term); + let (term_ord, recorder): (UnorderedTermId, &mut Rec) = term_index.get_or_create(term); let current_doc = recorder.current_doc(); if current_doc != doc { if current_doc != u32::max_value() { From f16cc6367efc6ea8b4ea91c6fa947e3858a070ae Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 20 Feb 2018 11:11:56 +0900 Subject: [PATCH 4/4] Refactoring of fastfields --- src/collector/mod.rs | 3 +- src/common/bitpacker.rs | 12 +- src/core/segment_reader.rs | 55 +++--- src/fastfield/mod.rs | 151 ++++++++++++----- src/fastfield/multivalued/mod.rs | 24 --- src/fastfield/multivalued/reader.rs | 54 ++---- src/fastfield/reader.rs | 250 +++++++++------------------- src/fastfield/writer.rs | 4 +- src/indexer/merger.rs | 30 ++-- src/lib.rs | 12 +- src/postings/mod.rs | 1 - src/query/term_query/mod.rs | 3 +- src/query/term_query/term_scorer.rs | 3 +- 13 files changed, 261 insertions(+), 341 deletions(-) diff --git a/src/collector/mod.rs b/src/collector/mod.rs index f8867abc1..066803182 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -94,7 +94,6 @@ pub mod tests { use Score; use core::SegmentReader; use SegmentLocalId; - use fastfield::U64FastFieldReader; use fastfield::FastFieldReader; use schema::Field; @@ -148,7 +147,7 @@ pub mod tests { pub struct FastFieldTestCollector { vals: Vec, field: Field, - ff_reader: Option, + ff_reader: Option>, } impl FastFieldTestCollector { diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 1521fd2af..04524013a 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -89,7 +89,7 @@ where pub fn get(&self, idx: usize) -> u64 { if self.num_bits == 0 { - return 0; + return 0u64; } let data: &[u8] = &*self.data; let num_bits = self.num_bits; @@ -107,7 +107,7 @@ where ); let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; - (val_shifted & mask) + val_shifted & mask } else { let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() { unsafe { *(data[addr..].as_ptr() as *const u64) } @@ -119,14 +119,18 @@ where unsafe { *(buffer[..].as_ptr() as *const u64) } }; let val_shifted = val_unshifted_unmasked >> (bit_shift as u64); - (val_shifted & mask) + val_shifted & mask } } + /// Reads a range of values from the fast field. + /// + /// The range of values read is from + /// `[start..start + output.len()[` pub fn get_range(&self, start: u32, output: &mut [u64]) { if self.num_bits == 0 { for val in output.iter_mut() { - *val = 0; + *val = 0u64; } } else { let data: &[u8] = &*self.data; diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index c97be0bb7..85ddbfea0 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -21,10 +21,11 @@ use schema::FieldType; use error::ErrorKind; use termdict::TermDictionaryImpl; use fastfield::FacetReader; -use fastfield::{FastFieldReader, U64FastFieldReader}; +use fastfield::FastFieldReader; use schema::Schema; use termdict::TermDictionary; -use fastfield::MultiValueIntFastFieldReader; +use fastfield::{FastValue, MultiValueIntFastFieldReader}; +use schema::Cardinality; /// Entry point to access all of the datastructures of the `Segment` /// @@ -91,18 +92,37 @@ impl SegmentReader { /// /// # Panics /// May panic if the index is corrupted. - pub fn fast_field_reader( + pub fn fast_field_reader( &self, field: Field, - ) -> fastfield::Result { + ) -> fastfield::Result> { let field_entry = self.schema.get_field_entry(field); - if !TFastFieldReader::is_enabled(field_entry.field_type()) { - Err(FastFieldNotAvailableError::new(field_entry)) - } else { + if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue) { self.fast_fields_composite .open_read(field) .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(TFastFieldReader::open) + .map(FastFieldReader::open) + } else { + Err(FastFieldNotAvailableError::new(field_entry)) + } + } + + /// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`. + /// May panick if the field is not a multivalued fastfield of the type `Item`. + pub fn multi_fast_field_reader(&self, field: Field) -> fastfield::Result> { + let field_entry = self.schema.get_field_entry(field); + if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues) { + let idx_reader = self.fast_fields_composite + .open_read_with_idx(field, 0) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(FastFieldReader::open)?; + let vals_reader = self.fast_fields_composite + .open_read_with_idx(field, 1) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(FastFieldReader::open)?; + Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader)) + } else { + Err(FastFieldNotAvailableError::new(field_entry)) } } @@ -138,25 +158,10 @@ impl SegmentReader { /// /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. - pub fn get_fieldnorms_reader(&self, field: Field) -> Option { + pub fn get_fieldnorms_reader(&self, field: Field) -> Option> { self.fieldnorms_composite .open_read(field) - .map(U64FastFieldReader::open) - } - - /// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`. - /// - pub fn multi_fast_field_reader(&self, field: Field) -> Result> { - let field_entry = self.schema.get_field_entry(field); - let idx_reader = self.fast_fields_composite - .open_read_with_idx(field, 0) - .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(U64FastFieldReader::open)?; - let vals_reader = self.fast_fields_composite - .open_read_with_idx(field, 1) - .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(U64FastFieldReader::open)?; - Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader)) + .map(FastFieldReader::open) } /// Accessor to the segment's `StoreReader`. diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 5a3590b9a..14d54f2a8 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -23,6 +23,19 @@ values stored. Read access performance is comparable to that of an array lookup. */ +use common; +use schema::Cardinality; +use schema::FieldType; +use schema::Value; +pub use self::delete::DeleteBitSet; +pub use self::delete::write_delete_bitset; +pub use self::error::{FastFieldNotAvailableError, Result}; +pub use self::facet_reader::FacetReader; +pub use self::multivalued::MultiValueIntFastFieldReader; +pub use self::reader::FastFieldReader; +pub use self::serializer::FastFieldSerializer; +pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; + mod reader; mod writer; mod serializer; @@ -31,18 +44,76 @@ mod delete; mod facet_reader; mod multivalued; -pub use self::delete::write_delete_bitset; -pub use self::delete::DeleteBitSet; -pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; -pub use self::reader::{I64FastFieldReader, U64FastFieldReader}; -pub use self::reader::FastFieldReader; -pub use self::serializer::FastFieldSerializer; -pub use self::error::{FastFieldNotAvailableError, Result}; -pub use self::facet_reader::FacetReader; -pub use self::multivalued::MultiValueIntFastFieldReader; +/// Trait for types that are allowed for fast fields: (u64 or i64). +pub trait FastValue: Default + Clone + Copy { + /// Converts a value from u64 + /// + /// Internally all fast field values are encoded as u64. + fn from_u64(val: u64) -> Self; -use common; -use schema::Value; + /// Converts a value to u64. + /// + /// Internally all fast field values are encoded as u64. + fn to_u64(&self) -> u64; + + /// Returns the fast field cardinality that can be extracted from the given + /// `FieldType`. + /// + /// If the type is not a fast field, `None` is returned. + fn fast_field_cardinality(field_type: &FieldType) -> Option; + + /// Cast value to `u64`. + /// The value is just reinterpreted in memory. + fn as_u64(&self) -> u64; +} + + +impl FastValue for u64 { + fn from_u64(val: u64) -> Self { + val + } + + fn to_u64(&self) -> u64 { + *self + } + + fn as_u64(&self) -> u64 { + *self + } + + fn fast_field_cardinality(field_type: &FieldType) -> Option { + match *field_type { + FieldType::U64(ref integer_options) => + integer_options.get_fastfield_cardinality(), + FieldType::HierarchicalFacet => + Some(Cardinality::MultiValues), + _ => None, + } + } +} + +impl FastValue for i64 { + fn from_u64(val: u64) -> Self { + common::u64_to_i64(val) + } + + fn to_u64(&self) -> u64 { + common::i64_to_u64(*self) + } + + + fn fast_field_cardinality(field_type: &FieldType) -> Option { + match *field_type { + FieldType::I64(ref integer_options) => + integer_options.get_fastfield_cardinality(), + _ => None, + } + } + + fn as_u64(&self) -> u64 { + *self as u64 + } +} fn value_to_u64(value: &Value) -> u64 { match *value { @@ -55,21 +126,22 @@ fn value_to_u64(value: &Value) -> u64 { #[cfg(test)] mod tests { - use super::*; - use schema::Field; - use std::path::Path; + + use common::CompositeFile; use directory::{Directory, RAMDirectory, WritePtr}; - use schema::Document; - use schema::{Schema, SchemaBuilder}; - use schema::FAST; - use std::collections::HashMap; - use test::Bencher; - use test; use fastfield::FastFieldReader; use rand::Rng; use rand::SeedableRng; - use common::CompositeFile; use rand::XorShiftRng; + use schema::{Schema, SchemaBuilder}; + use schema::Document; + use schema::FAST; + use schema::Field; + use std::collections::HashMap; + use std::path::Path; + use super::*; + use test; + use test::Bencher; lazy_static! { static ref SCHEMA: Schema = { @@ -84,7 +156,7 @@ mod tests { #[test] pub fn test_fastfield() { - let test_fastfield = U64FastFieldReader::from(vec![100, 200, 300]); + let test_fastfield = FastFieldReader::::from(vec![100, 200, 300]); assert_eq!(test_fastfield.get(0), 100); assert_eq!(test_fastfield.get(1), 200); assert_eq!(test_fastfield.get(2), 300); @@ -113,7 +185,7 @@ mod tests { { let composite_file = CompositeFile::open(&source).unwrap(); let field_source = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source); + let fast_field_reader = FastFieldReader::::open(field_source); assert_eq!(fast_field_reader.get(0), 13u64); assert_eq!(fast_field_reader.get(1), 14u64); assert_eq!(fast_field_reader.get(2), 2u64); @@ -148,8 +220,8 @@ mod tests { } { let fast_fields_composite = CompositeFile::open(&source).unwrap(); - let fast_field_reader: U64FastFieldReader = - U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + let data = fast_fields_composite.open_read(*FIELD).unwrap(); + let fast_field_reader = FastFieldReader::::open(data); assert_eq!(fast_field_reader.get(0), 4u64); assert_eq!(fast_field_reader.get(1), 14_082_001u64); assert_eq!(fast_field_reader.get(2), 3_052u64); @@ -185,8 +257,8 @@ mod tests { } { let fast_fields_composite = CompositeFile::open(&source).unwrap(); - let fast_field_reader: U64FastFieldReader = - U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + let data = fast_fields_composite.open_read(*FIELD).unwrap(); + let fast_field_reader = FastFieldReader::::open(data); for doc in 0..10_000 { assert_eq!(fast_field_reader.get(doc), 100_000u64); } @@ -218,9 +290,8 @@ mod tests { } { let fast_fields_composite = CompositeFile::open(&source).unwrap(); - let fast_field_reader: U64FastFieldReader = - U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); - + let data = fast_fields_composite.open_read(*FIELD).unwrap(); + let fast_field_reader = FastFieldReader::::open(data); assert_eq!(fast_field_reader.get(0), 0u64); for doc in 1..10_001 { assert_eq!( @@ -259,8 +330,8 @@ mod tests { } { let fast_fields_composite = CompositeFile::open(&source).unwrap(); - let fast_field_reader: I64FastFieldReader = - I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap()); + let data = fast_fields_composite.open_read(i64_field).unwrap(); + let fast_field_reader = FastFieldReader::::open(data); assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.max_value(), 9_999i64); @@ -298,8 +369,8 @@ mod tests { let source = directory.open_read(&path).unwrap(); { let fast_fields_composite = CompositeFile::open(&source).unwrap(); - let fast_field_reader: I64FastFieldReader = - I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap()); + let data = fast_fields_composite.open_read(i64_field).unwrap(); + let fast_field_reader = FastFieldReader::::open(data); assert_eq!(fast_field_reader.get(0u32), 0i64); } } @@ -333,8 +404,8 @@ mod tests { let source = directory.open_read(&path).unwrap(); { let fast_fields_composite = CompositeFile::open(&source).unwrap(); - let fast_field_reader: U64FastFieldReader = - U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + let data = fast_fields_composite.open_read(*FIELD).unwrap(); + let fast_field_reader = FastFieldReader::::open(data); let mut a = 0u64; for _ in 0..n { @@ -390,8 +461,8 @@ mod tests { let source = directory.open_read(&path).unwrap(); { let fast_fields_composite = CompositeFile::open(&source).unwrap(); - let fast_field_reader: U64FastFieldReader = - U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + let data = fast_fields_composite.open_read(*FIELD).unwrap(); + let fast_field_reader = FastFieldReader::::open(data); b.iter(|| { let n = test::black_box(7000u32); @@ -424,8 +495,8 @@ mod tests { let source = directory.open_read(&path).unwrap(); { let fast_fields_composite = CompositeFile::open(&source).unwrap(); - let fast_field_reader: U64FastFieldReader = - U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap()); + let data = fast_fields_composite.open_read(*FIELD).unwrap(); + let fast_field_reader = FastFieldReader::::open(data); b.iter(|| { let n = test::black_box(1000u32); diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 60ef673f8..5c9f4dc9a 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -4,8 +4,6 @@ mod reader; pub use self::writer::MultiValueIntFastFieldWriter; pub use self::reader::MultiValueIntFastFieldReader; - - #[cfg(test)] mod tests { @@ -88,26 +86,4 @@ mod tests { assert_eq!(&vals, &[-5i64, -20i64, 1i64]); } } - - #[test] - #[should_panic] - fn test_multivalued_unreachable() { - let mut schema_builder = SchemaBuilder::default(); - let field = schema_builder.add_i64_field( - "multifield", - IntOptions::default().set_fast(Cardinality::MultiValues) - ); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); - index_writer.add_document(doc!(field=> 1i64, field => 3i64)); - assert!(index_writer.commit().is_ok()); - - index.load_searchers().unwrap(); - let searcher = index.searcher(); - let reader = searcher.segment_reader(0); - let multi_value_reader = reader.multi_fast_field_reader::(field).unwrap(); - let mut vals = Vec::new(); - multi_value_reader.get_vals(0, &mut vals); - } } \ No newline at end of file diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index cdcf51d56..4dbe49717 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -1,10 +1,5 @@ use DocId; -use fastfield::FastFieldReader; -use fastfield::U64FastFieldReader; -use std::marker::PhantomData; -use common; - - +use fastfield::{FastFieldReader, FastValue}; /// Reader for a multivalued `u64` fast field. @@ -16,44 +11,19 @@ use common; /// The `idx_reader` associated, for each document, the index of its first value. /// #[derive(Clone)] -pub struct MultiValueIntFastFieldReader { - idx_reader: U64FastFieldReader, - vals_reader: U64FastFieldReader, - __phantom__: PhantomData +pub struct MultiValueIntFastFieldReader { + idx_reader: FastFieldReader, + vals_reader: FastFieldReader } -trait ConvertU64 { - fn from_u64(val: u64) -> Item; -} - -impl ConvertU64 for MultiValueIntFastFieldReader { - default fn from_u64(_: u64) -> Item { - unimplemented!("MultiValueIntFastField only exists for u64 and i64."); - } -} - -impl ConvertU64 for MultiValueIntFastFieldReader { - fn from_u64(val: u64) -> u64 { - val - } -} - -impl ConvertU64 for MultiValueIntFastFieldReader { - fn from_u64(val: u64) -> i64 { - common::u64_to_i64(val) - } -} - - -impl MultiValueIntFastFieldReader { +impl MultiValueIntFastFieldReader { pub(crate) fn open( - idx_reader: U64FastFieldReader, - vals_reader: U64FastFieldReader, + idx_reader: FastFieldReader, + vals_reader: FastFieldReader, ) -> MultiValueIntFastFieldReader { MultiValueIntFastFieldReader { idx_reader, - vals_reader, - __phantom__: PhantomData, + vals_reader } } @@ -61,11 +31,9 @@ impl MultiValueIntFastFieldReader { pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { let start = self.idx_reader.get(doc) as u32; let stop = self.idx_reader.get(doc + 1) as u32; - vals.clear(); - for val_id in start..stop { - let val = self.vals_reader.get(val_id); - vals.push(Self::from_u64(val)); - } + let len = (stop - start) as usize; + vals.resize(len, Item::default()); + self.vals_reader.get_range(start, &mut vals[..]); } } diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 003a75a8e..cfe0c2b77 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,111 +1,36 @@ -use directory::ReadOnlySource; -use common::{self, BinarySerializable}; -use common::compute_num_bits; +use common::BinarySerializable; use common::bitpacker::BitUnpacker; -use DocId; -use schema::SchemaBuilder; -use std::path::Path; -use schema::FAST; -use directory::{Directory, RAMDirectory, WritePtr}; -use fastfield::{FastFieldSerializer, FastFieldsWriter}; -use schema::FieldType; -use std::mem; use common::CompositeFile; -use std::collections::HashMap; +use common::compute_num_bits; +use directory::{Directory, RAMDirectory, WritePtr}; +use directory::ReadOnlySource; +use DocId; +use fastfield::{FastFieldSerializer, FastFieldsWriter}; use owning_ref::OwningRef; +use schema::FAST; +use schema::SchemaBuilder; +use std::collections::HashMap; +use std::marker::PhantomData; +use std::mem; +use std::path::Path; +use super::FastValue; /// Trait for accessing a fastfield. /// /// Depending on the field type, a different /// fast field is required. -pub trait FastFieldReader: Sized { - /// Type of the value stored in the fastfield. - type ValueType; +#[derive(Clone)] +pub struct FastFieldReader { + bit_unpacker: BitUnpacker>, + min_value_u64: u64, + max_value_u64: u64, + _phantom: PhantomData +} - /// Return the value associated to the given document. - /// - /// This accessor should return as fast as possible. - /// - /// # Panics - /// - /// May panic if `doc` is greater than the segment - // `maxdoc`. - fn get(&self, doc: DocId) -> Self::ValueType; - - /// Fills an output buffer with the fast field values - /// associated with the `DocId` going from - /// `start` to `start + output.len()`. - /// - /// # Panics - /// - /// May panic if `start + output.len()` is greater than - /// the segment's `maxdoc`. - fn get_range(&self, start: u32, output: &mut [Self::ValueType]); +impl FastFieldReader { /// Opens a fast field given a source. - fn open(source: ReadOnlySource) -> Self; - - /// Returns true iff the given field_type makes - /// it possible to access the field values via a - /// fastfield. - fn is_enabled(field_type: &FieldType) -> bool; -} - -/// `FastFieldReader` for unsigned 64-bits integers. -#[derive(Clone)] -pub struct U64FastFieldReader { - bit_unpacker: BitUnpacker>, - min_value: u64, - max_value: u64, -} - -impl U64FastFieldReader { - /// Returns the minimum value for this fast field. - /// - /// The min value does not take in account of possible - /// deleted document, and should be considered as a lower bound - /// of the actual minimum value. - pub fn min_value(&self) -> u64 { - self.min_value - } - - /// Returns the maximum value for this fast field. - /// - /// The max value does not take in account of possible - /// deleted document, and should be considered as an upper bound - /// of the actual maximum value. - pub fn max_value(&self) -> u64 { - self.max_value - } -} - -impl FastFieldReader for U64FastFieldReader { - type ValueType = u64; - - fn get(&self, doc: DocId) -> u64 { - self.min_value + self.bit_unpacker.get(doc as usize) - } - - fn is_enabled(field_type: &FieldType) -> bool { - match *field_type { - FieldType::U64(ref integer_options) => integer_options.is_fast(), - FieldType::HierarchicalFacet => true, - _ => false, - } - } - - fn get_range(&self, start: u32, output: &mut [Self::ValueType]) { - self.bit_unpacker.get_range(start, output); - for out in output.iter_mut() { - *out += self.min_value; - } - } - - /// Opens a new fast field reader given a read only source. - /// - /// # Panics - /// Panics if the data is corrupted. - fn open(data: ReadOnlySource) -> U64FastFieldReader { + pub fn open(data: ReadOnlySource) -> Self { let min_value: u64; let amplitude: u64; { @@ -119,16 +44,64 @@ impl FastFieldReader for U64FastFieldReader { let num_bits = compute_num_bits(amplitude); let owning_ref = OwningRef::new(data).map(|data| &data[16..]); let bit_unpacker = BitUnpacker::new(owning_ref, num_bits); - U64FastFieldReader { - min_value, - max_value, + FastFieldReader { + min_value_u64: min_value, + max_value_u64: max_value, bit_unpacker, + _phantom: PhantomData } } + + + /// Return the value associated to the given document. + /// + /// This accessor should return as fast as possible. + /// + /// # Panics + /// + /// May panic if `doc` is greater than the segment + // `maxdoc`. + pub fn get(&self, doc: DocId) -> Item { + Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize)) + } + + /// Fills an output buffer with the fast field values + /// associated with the `DocId` going from + /// `start` to `start + output.len()`. + /// + /// # Panics + /// + /// May panic if `start + output.len()` is greater than + /// the segment's `maxdoc`. + pub fn get_range(&self, start: u32, output: &mut [Item]) { + let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; + self.bit_unpacker.get_range(start, output_u64); + for out in output_u64.iter_mut() { + *out = Item::from_u64(*out + self.min_value_u64).as_u64(); + } + } + + /// Returns the minimum value for this fast field. + /// + /// The max value does not take in account of possible + /// deleted document, and should be considered as an upper bound + /// of the actual maximum value. + pub fn min_value(&self) -> Item { + Item::from_u64(self.min_value_u64) + } + + /// Returns the maximum value for this fast field. + /// + /// The max value does not take in account of possible + /// deleted document, and should be considered as an upper bound + /// of the actual maximum value. + pub fn max_value(&self) -> Item { + Item::from_u64(self.max_value_u64) + } } -impl From> for U64FastFieldReader { - fn from(vals: Vec) -> U64FastFieldReader { +impl From> for FastFieldReader { + fn from(vals: Vec) -> FastFieldReader { let mut schema_builder = SchemaBuilder::default(); let field = schema_builder.add_u64_field("field", FAST); let schema = schema_builder.build(); @@ -146,7 +119,7 @@ impl From> for U64FastFieldReader { .get_field_writer(field) .expect("With a RAMDirectory, this should never fail."); for val in vals { - fast_field_writer.add_val(val); + fast_field_writer.add_val(val.to_u64()); } } fast_field_writers @@ -158,79 +131,10 @@ impl From> for U64FastFieldReader { let source = directory.open_read(path).expect("Failed to open the file"); let composite_file = CompositeFile::open(&source).expect("Failed to read the composite file"); - let field_source = composite_file .open_read(field) .expect("File component not found"); - U64FastFieldReader::open(field_source) + FastFieldReader::open(field_source) } } -/// `FastFieldReader` for signed 64-bits integers. -pub struct I64FastFieldReader { - underlying: U64FastFieldReader, -} - -impl I64FastFieldReader { - /// Returns the minimum value for this fast field. - /// - /// The min value does not take in account of possible - /// deleted document, and should be considered as a lower bound - /// of the actual minimum value. - pub fn min_value(&self) -> i64 { - common::u64_to_i64(self.underlying.min_value()) - } - - /// Returns the maximum value for this fast field. - /// - /// The max value does not take in account of possible - /// deleted document, and should be considered as an upper bound - /// of the actual maximum value. - pub fn max_value(&self) -> i64 { - common::u64_to_i64(self.underlying.max_value()) - } -} - -impl FastFieldReader for I64FastFieldReader { - type ValueType = i64; - - /// - /// - /// # Panics - /// - /// May panic or return wrong random result if `doc` - /// is greater or equal to the segment's `maxdoc`. - fn get(&self, doc: DocId) -> i64 { - common::u64_to_i64(self.underlying.get(doc)) - } - - /// - /// # Panics - /// - /// May panic or return wrong random result if `doc` - /// is greater or equal to the segment's `maxdoc`. - fn get_range(&self, start: u32, output: &mut [Self::ValueType]) { - let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; - self.underlying.get_range(start, output_u64); - for mut_val in output_u64.iter_mut() { - *mut_val = common::u64_to_i64(*mut_val as u64) as u64; - } - } - - /// Opens a new fast field reader given a read only source. - /// - /// # Panics - /// Panics if the data is corrupted. - fn open(data: ReadOnlySource) -> I64FastFieldReader { - I64FastFieldReader { - underlying: U64FastFieldReader::open(data), - } - } - - fn is_enabled(field_type: &FieldType) -> bool { - match *field_type { - FieldType::I64(ref integer_options) => integer_options.is_fast(), - _ => false, - } - } -} diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index ab9b3a6c9..19cd5cef2 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -52,8 +52,8 @@ impl FastFieldsWriter { } } FastFieldsWriter { - single_value_writers: single_value_writers, - multi_values_writers: multi_values_writers, + single_value_writers, + multi_values_writers } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index a2ee1b00c..7e1a0580b 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -5,7 +5,6 @@ use DocId; use core::SerializableSegment; use indexer::SegmentSerializer; use postings::InvertedIndexSerializer; -use fastfield::U64FastFieldReader; use itertools::Itertools; use postings::Postings; use docset::DocSet; @@ -26,7 +25,7 @@ pub struct IndexMerger { } fn compute_min_max_val( - u64_reader: &U64FastFieldReader, + u64_reader: &FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet, ) -> Option<(u64, u64)> { @@ -50,14 +49,14 @@ fn compute_min_max_val( fn extract_fieldnorm_reader( segment_reader: &SegmentReader, field: Field, -) -> Option { +) -> Option> { segment_reader.get_fieldnorms_reader(field) } fn extract_fast_field_reader( segment_reader: &SegmentReader, field: Field, -) -> Option { +) -> Option> { segment_reader.fast_field_reader(field).ok() } @@ -137,7 +136,7 @@ impl IndexMerger { fn generic_write_fast_field( &self, fields: Vec, - field_reader_extractor: &Fn(&SegmentReader, Field) -> Option, + field_reader_extractor: &Fn(&SegmentReader, Field) -> Option>, fast_field_serializer: &mut FastFieldSerializer, ) -> Result<()> { for field in fields { @@ -368,7 +367,6 @@ mod tests { use query::TermQuery; use schema::Field; use core::Index; - use fastfield::U64FastFieldReader; use Searcher; use DocAddress; use collector::tests::FastFieldTestCollector; @@ -628,16 +626,16 @@ mod tests { vec![6_000, 7_000] ); - let score_field_reader: U64FastFieldReader = searcher + let score_field_reader = searcher .segment_reader(0) - .fast_field_reader(score_field) + .fast_field_reader::(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 1); assert_eq!(score_field_reader.max_value(), 3); - let score_field_reader: U64FastFieldReader = searcher + let score_field_reader = searcher .segment_reader(1) - .fast_field_reader(score_field) + .fast_field_reader::(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 4000); assert_eq!(score_field_reader.max_value(), 7000); @@ -685,9 +683,9 @@ mod tests { search_term(&searcher, Term::from_field_text(text_field, "g")), vec![6_000, 7_000] ); - let score_field_reader: U64FastFieldReader = searcher + let score_field_reader = searcher .segment_reader(0) - .fast_field_reader(score_field) + .fast_field_reader::(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 3); assert_eq!(score_field_reader.max_value(), 7000); @@ -731,9 +729,9 @@ mod tests { search_term(&searcher, Term::from_field_text(text_field, "g")), vec![6_000, 7_000] ); - let score_field_reader: U64FastFieldReader = searcher + let score_field_reader = searcher .segment_reader(0) - .fast_field_reader(score_field) + .fast_field_reader::(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 3); assert_eq!(score_field_reader.max_value(), 7000); @@ -782,9 +780,9 @@ mod tests { search_term(&searcher, Term::from_field_text(text_field, "g")), vec![6_000, 7_000] ); - let score_field_reader: U64FastFieldReader = searcher + let score_field_reader = searcher .segment_reader(0) - .fast_field_reader(score_field) + .fast_field_reader::(score_field) .unwrap(); assert_eq!(score_field_reader.min_value(), 6000); assert_eq!(score_field_reader.max_value(), 7000); diff --git a/src/lib.rs b/src/lib.rs index bcf3cf451..792269ba3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,7 +3,6 @@ #![cfg_attr(feature = "cargo-clippy", allow(inline_always))] #![feature(box_syntax)] #![feature(optin_builtin_traits)] -#![feature(specialization)] #![feature(conservative_impl_trait)] #![feature(collections_range)] #![feature(integer_atomics)] @@ -287,7 +286,6 @@ mod tests { use schema::*; use docset::DocSet; use IndexWriter; - use fastfield::{FastFieldReader, I64FastFieldReader, U64FastFieldReader}; use Postings; use rand::{Rng, SeedableRng, XorShiftRng}; use rand::distributions::{IndependentSample, Range}; @@ -858,22 +856,22 @@ mod tests { let segment_reader: &SegmentReader = searcher.segment_reader(0); { let fast_field_reader_res = - segment_reader.fast_field_reader::(text_field); + segment_reader.fast_field_reader::(text_field); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = - segment_reader.fast_field_reader::(stored_int_field); + segment_reader.fast_field_reader::(stored_int_field); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = - segment_reader.fast_field_reader::(fast_field_signed); + segment_reader.fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = - segment_reader.fast_field_reader::(fast_field_signed); + segment_reader.fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) @@ -881,7 +879,7 @@ mod tests { { let fast_field_reader_res = - segment_reader.fast_field_reader::(fast_field_signed); + segment_reader.fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 4427b94eb..c67a9f855 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -51,7 +51,6 @@ pub mod tests { use schema::IndexRecordOption; use std::iter; use datastruct::stacker::Heap; - use fastfield::FastFieldReader; use query::TermQuery; use schema::Field; use test::{self, Bencher}; diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 04efc2f43..11e70aa8a 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -13,7 +13,6 @@ mod tests { use postings::SegmentPostings; use query::{Query, Scorer}; use query::term_query::TermScorer; - use fastfield::U64FastFieldReader; use query::TermQuery; use Index; use schema::*; @@ -56,7 +55,7 @@ mod tests { #[test] pub fn test_term_scorer() { - let left_fieldnorms = U64FastFieldReader::from(vec![10, 4]); + let left_fieldnorms = FastFieldReader::from(vec![10, 4]); assert_eq!(left_fieldnorms.get(0), 10); assert_eq!(left_fieldnorms.get(1), 4); let left = SegmentPostings::create_from_docs(&[1]); diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index e9faf1d67..d8352780c 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -1,7 +1,6 @@ use Score; use DocId; use docset::{DocSet, SkipResult}; -use fastfield::U64FastFieldReader; use postings::SegmentPostings; use query::Scorer; use postings::Postings; @@ -9,7 +8,7 @@ use fastfield::FastFieldReader; pub struct TermScorer { pub idf: Score, - pub fieldnorm_reader_opt: Option, + pub fieldnorm_reader_opt: Option>, pub postings: SegmentPostings, }