diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index 0f79b869d..a954a9643 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -10,7 +10,7 @@ // --- // Importing tantivy... use tantivy::collector::{Collector, SegmentCollector}; -use tantivy::fastfield::FastFieldReader; +use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader}; use tantivy::query::QueryParser; use tantivy::schema::Field; use tantivy::schema::{Schema, FAST, INDEXED, TEXT}; @@ -98,7 +98,7 @@ impl Collector for StatsCollector { } struct StatsSegmentCollector { - fast_field_reader: FastFieldReader, + fast_field_reader: DynamicFastFieldReader, stats: Stats, } diff --git a/src/collector/filter_collector_wrapper.rs b/src/collector/filter_collector_wrapper.rs index 454cdb160..f1246be9d 100644 --- a/src/collector/filter_collector_wrapper.rs +++ b/src/collector/filter_collector_wrapper.rs @@ -12,7 +12,7 @@ use std::marker::PhantomData; use crate::collector::{Collector, SegmentCollector}; -use crate::fastfield::{FastFieldReader, FastValue}; +use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue}; use crate::schema::Field; use crate::{Score, SegmentReader, TantivyError}; @@ -155,7 +155,7 @@ where TPredicate: 'static, TPredicateValue: FastValue, { - fast_field_reader: FastFieldReader, + fast_field_reader: DynamicFastFieldReader, segment_collector: TSegmentCollector, predicate: TPredicate, t_predicate_value: PhantomData, diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs index b7cd5b8cd..18345d821 100644 --- a/src/collector/histogram_collector.rs +++ b/src/collector/histogram_collector.rs @@ -1,5 +1,5 @@ use crate::collector::{Collector, SegmentCollector}; -use crate::fastfield::{FastFieldReader, FastValue}; +use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue}; use crate::schema::{Field, Type}; use crate::{DocId, Score}; use fastdivide::DividerU64; @@ -84,7 +84,7 @@ impl 
HistogramComputer { } pub struct SegmentHistogramCollector { histogram_computer: HistogramComputer, - ff_reader: FastFieldReader, + ff_reader: DynamicFastFieldReader, } impl SegmentCollector for SegmentHistogramCollector { diff --git a/src/collector/tests.rs b/src/collector/tests.rs index db9d4217a..18cd384c7 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -1,6 +1,7 @@ use super::*; use crate::core::SegmentReader; use crate::fastfield::BytesFastFieldReader; +use crate::fastfield::DynamicFastFieldReader; use crate::fastfield::FastFieldReader; use crate::schema::Field; use crate::DocId; @@ -162,7 +163,7 @@ pub struct FastFieldTestCollector { pub struct FastFieldSegmentCollector { vals: Vec, - reader: FastFieldReader, + reader: DynamicFastFieldReader, } impl FastFieldTestCollector { diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index c772b3ded..198751800 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -4,7 +4,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector; use crate::collector::{ CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector, }; -use crate::fastfield::FastFieldReader; +use crate::fastfield::{DynamicFastFieldReader, FastFieldReader}; use crate::query::Weight; use crate::schema::Field; use crate::DocAddress; @@ -129,7 +129,7 @@ impl fmt::Debug for TopDocs { } struct ScorerByFastFieldReader { - ff_reader: FastFieldReader, + ff_reader: DynamicFastFieldReader, } impl CustomSegmentScorer for ScorerByFastFieldReader { @@ -151,7 +151,7 @@ impl CustomScorer for ScorerByField { // mapping is monotonic, so it is sufficient to compute our top-K docs. // // The conversion will then happen only on the top-K docs. 
- let ff_reader: FastFieldReader = segment_reader + let ff_reader = segment_reader .fast_fields() .typed_fast_field_reader(self.field)?; Ok(ScorerByFastFieldReader { ff_reader }) @@ -401,6 +401,7 @@ impl TopDocs { /// # use tantivy::query::QueryParser; /// use tantivy::SegmentReader; /// use tantivy::collector::TopDocs; + /// use tantivy::fastfield::FastFieldReader; /// use tantivy::schema::Field; /// /// fn create_schema() -> Schema { @@ -508,6 +509,7 @@ impl TopDocs { /// use tantivy::SegmentReader; /// use tantivy::collector::TopDocs; /// use tantivy::schema::Field; + /// use tantivy::fastfield::FastFieldReader; /// /// # fn create_schema() -> Schema { /// # let mut schema_builder = Schema::builder(); diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs index c00d56ea1..0e938c912 100644 --- a/src/fastfield/bytes/reader.rs +++ b/src/fastfield/bytes/reader.rs @@ -1,7 +1,7 @@ +use crate::directory::FileSlice; use crate::directory::OwnedBytes; -use crate::fastfield::FastFieldReader; +use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, MultiValueLength}; use crate::DocId; -use crate::{directory::FileSlice, fastfield::MultiValueLength}; /// Reader for byte array fast fields /// @@ -15,13 +15,13 @@ use crate::{directory::FileSlice, fastfield::MultiValueLength}; /// and the start index for the next document, and keeping the bytes in between. 
#[derive(Clone)] pub struct BytesFastFieldReader { - idx_reader: FastFieldReader, + idx_reader: BitpackedFastFieldReader, values: OwnedBytes, } impl BytesFastFieldReader { pub(crate) fn open( - idx_reader: FastFieldReader, + idx_reader: BitpackedFastFieldReader, values_file: FileSlice, ) -> crate::Result { let values = values_file.read_bytes()?; diff --git a/src/fastfield/bytes/writer.rs b/src/fastfield/bytes/writer.rs index 8322ed4c5..f7b1b5d68 100644 --- a/src/fastfield/bytes/writer.rs +++ b/src/fastfield/bytes/writer.rs @@ -1,8 +1,11 @@ use std::io; +use crate::fastfield::serializer::FastFieldSerializer; use crate::schema::{Document, Field, Value}; use crate::DocId; -use crate::{fastfield::serializer::FastFieldSerializer, indexer::doc_id_mapping::DocIdMapping}; +use crate::{ + fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping, +}; /// Writer for byte array (as in, any number of bytes per document) fast fields /// @@ -104,7 +107,7 @@ impl BytesFastFieldWriter { /// Serializes the fast field values by pushing them to the `FastFieldSerializer`. 
pub fn serialize( &self, - serializer: &mut FastFieldSerializer, + serializer: &mut CompositeFastFieldSerializer, doc_id_map: Option<&DocIdMapping>, ) -> io::Result<()> { // writing the offset index diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index f73cd040d..e898c72ca 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -29,8 +29,11 @@ pub use self::delete::DeleteBitSet; pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; +pub use self::reader::BitpackedFastFieldReader; +pub use self::reader::DynamicFastFieldReader; pub use self::reader::FastFieldReader; pub use self::readers::FastFieldReaders; +pub use self::serializer::CompositeFastFieldSerializer; pub use self::serializer::FastFieldSerializer; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; use crate::schema::Cardinality; @@ -211,7 +214,7 @@ mod tests { use super::*; use crate::common::CompositeFile; use crate::directory::{Directory, RamDirectory, WritePtr}; - use crate::fastfield::FastFieldReader; + use crate::fastfield::BitpackedFastFieldReader; use crate::merge_policy::NoMergePolicy; use crate::schema::Field; use crate::schema::Schema; @@ -236,7 +239,7 @@ mod tests { #[test] pub fn test_fastfield() { - let test_fastfield = FastFieldReader::::from(vec![100, 200, 300]); + let test_fastfield = BitpackedFastFieldReader::::from(vec![100, 200, 300]); assert_eq!(test_fastfield.get(0), 100); assert_eq!(test_fastfield.get(1), 200); assert_eq!(test_fastfield.get(2), 300); @@ -254,7 +257,7 @@ mod tests { let directory: RamDirectory = RamDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::from_write(write).unwrap(); + let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); 
fast_field_writers.add_document(&doc!(*FIELD=>13u64)); fast_field_writers.add_document(&doc!(*FIELD=>14u64)); @@ -268,7 +271,7 @@ mod tests { assert_eq!(file.len(), 36 as usize); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(file)?; + let fast_field_reader = BitpackedFastFieldReader::::open(file)?; assert_eq!(fast_field_reader.get(0), 13u64); assert_eq!(fast_field_reader.get(1), 14u64); assert_eq!(fast_field_reader.get(2), 2u64); @@ -281,7 +284,7 @@ mod tests { let directory: RamDirectory = RamDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test"))?; - let mut serializer = FastFieldSerializer::from_write(write)?; + let mut serializer = CompositeFastFieldSerializer::from_write(write)?; let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); fast_field_writers.add_document(&doc!(*FIELD=>4u64)); fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64)); @@ -300,7 +303,7 @@ mod tests { { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(data)?; + let fast_field_reader = BitpackedFastFieldReader::::open(data)?; assert_eq!(fast_field_reader.get(0), 4u64); assert_eq!(fast_field_reader.get(1), 14_082_001u64); assert_eq!(fast_field_reader.get(2), 3_052u64); @@ -321,7 +324,7 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::from_write(write).unwrap(); + let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for _ in 0..10_000 { fast_field_writers.add_document(&doc!(*FIELD=>100_000u64)); @@ -336,7 +339,7 @@ mod tests { { let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = 
fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(data)?; + let fast_field_reader = BitpackedFastFieldReader::::open(data)?; for doc in 0..10_000 { assert_eq!(fast_field_reader.get(doc), 100_000u64); } @@ -351,7 +354,7 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::from_write(write).unwrap(); + let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); // forcing the amplitude to be high fast_field_writers.add_document(&doc!(*FIELD=>0u64)); @@ -368,7 +371,7 @@ mod tests { { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(data)?; + let fast_field_reader = BitpackedFastFieldReader::::open(data)?; assert_eq!(fast_field_reader.get(0), 0u64); for doc in 1..10_001 { assert_eq!( @@ -390,7 +393,7 @@ mod tests { let schema = schema_builder.build(); { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::from_write(write).unwrap(); + let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); for i in -100i64..10_000i64 { let mut doc = Document::default(); @@ -407,7 +410,7 @@ mod tests { { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(i64_field).unwrap(); - let fast_field_reader = FastFieldReader::::open(data)?; + let fast_field_reader = BitpackedFastFieldReader::::open(data)?; assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.max_value(), 9_999i64); @@ -433,7 +436,7 @@ mod tests { { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = 
FastFieldSerializer::from_write(write).unwrap(); + let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); let doc = Document::default(); fast_field_writers.add_document(&doc); @@ -447,7 +450,7 @@ mod tests { { let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = fast_fields_composite.open_read(i64_field).unwrap(); - let fast_field_reader = FastFieldReader::::open(data)?; + let fast_field_reader = BitpackedFastFieldReader::::open(data)?; assert_eq!(fast_field_reader.get(0u32), 0i64); } Ok(()) @@ -468,7 +471,7 @@ mod tests { let directory = RamDirectory::create(); { let write: WritePtr = directory.open_write(Path::new("test"))?; - let mut serializer = FastFieldSerializer::from_write(write)?; + let mut serializer = CompositeFastFieldSerializer::from_write(write)?; let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for &x in &permutation { fast_field_writers.add_document(&doc!(*FIELD=>x)); @@ -480,7 +483,7 @@ mod tests { { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(data)?; + let fast_field_reader = BitpackedFastFieldReader::::open(data)?; let mut a = 0u64; for _ in 0..n { diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 82993b042..8e917203d 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::fastfield::{FastFieldReader, FastValue, MultiValueLength}; +use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, FastValue, MultiValueLength}; use crate::DocId; /// Reader for a multivalued `u64` fast field. 
@@ -13,14 +13,14 @@ use crate::DocId; /// #[derive(Clone)] pub struct MultiValuedFastFieldReader { - idx_reader: FastFieldReader, - vals_reader: FastFieldReader, + idx_reader: BitpackedFastFieldReader, + vals_reader: BitpackedFastFieldReader, } impl MultiValuedFastFieldReader { pub(crate) fn open( - idx_reader: FastFieldReader, - vals_reader: FastFieldReader, + idx_reader: BitpackedFastFieldReader, + vals_reader: BitpackedFastFieldReader, ) -> MultiValuedFastFieldReader { MultiValuedFastFieldReader { idx_reader, diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 5bf0faaa4..1e1777c67 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -1,5 +1,6 @@ -use crate::fastfield::serializer::FastSingleFieldSerializer; -use crate::fastfield::FastFieldSerializer; +use crate::fastfield::serializer::DynamicFastFieldSerializer; +use crate::fastfield::serializer::FastFieldSerializer; +use crate::fastfield::CompositeFastFieldSerializer; use crate::postings::UnorderedTermId; use crate::schema::{Document, Field}; use crate::termdict::TermOrdinal; @@ -134,7 +135,7 @@ impl MultiValuedFastFieldWriter { /// pub fn serialize( &self, - serializer: &mut FastFieldSerializer, + serializer: &mut CompositeFastFieldSerializer, mapping_opt: Option<&FnvHashMap>, doc_id_map: Option<&DocIdMapping>, ) -> io::Result<()> { @@ -154,7 +155,7 @@ impl MultiValuedFastFieldWriter { } { // writing the values themselves. 
- let mut value_serializer: FastSingleFieldSerializer<'_, _>; + let mut value_serializer: DynamicFastFieldSerializer<'_, _>; match mapping_opt { Some(mapping) => { value_serializer = serializer.new_u64_fast_field_with_idx( diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index a861849ac..7050a1b9a 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -4,7 +4,7 @@ use crate::common::CompositeFile; use crate::directory::FileSlice; use crate::directory::OwnedBytes; use crate::directory::{Directory, RamDirectory, WritePtr}; -use crate::fastfield::{FastFieldSerializer, FastFieldsWriter}; +use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter}; use crate::schema::Schema; use crate::schema::FAST; use crate::DocId; @@ -14,12 +14,94 @@ use std::path::Path; use tantivy_bitpacker::compute_num_bits; use tantivy_bitpacker::BitUnpacker; +/// FastFieldReader is the trait to access fast field data. +pub trait FastFieldReader: Clone { + /// Return the value associated to the given document. + /// + /// This accessor should return as fast as possible. + /// + /// # Panics + /// + /// May panic if `doc` is greater than the segment's `maxdoc`. + fn get(&self, doc: DocId) -> Item; + + /// Fills an output buffer with the fast field values + /// associated with the `DocId` going from + /// `start` to `start + output.len()`. + /// + /// Regardless of the type of `Item`, this method works + /// - transmuting the output array + /// - extracting the `Item`s as if they were `u64` + /// - possibly converting the `u64` value to the right type. + /// + /// # Panics + /// + /// May panic if `start + output.len()` is greater than + /// the segment's `maxdoc`. + fn get_range(&self, start: DocId, output: &mut [Item]); + + /// Returns the minimum value for this fast field. + /// + /// The min value does not take into account possible + /// deleted documents, and should be considered as a lower bound + /// of the actual minimum value.
+ fn min_value(&self) -> Item; + + /// Returns the maximum value for this fast field. + /// + /// The max value does not take in account of possible + /// deleted document, and should be considered as an upper bound + /// of the actual maximum value. + fn max_value(&self) -> Item; +} + +#[derive(Clone)] +/// DynamicFastFieldReader wraps different readers to access +/// the various encoded fastfield data +/// +pub enum DynamicFastFieldReader { + /// Bitpacked compressed fastfield data. + Bitpacked(BitpackedFastFieldReader), +} + +impl DynamicFastFieldReader { + /// Returns the correct reader, wrapped in the `DynamicFastFieldReader` enum, for the data. + pub fn open(file: FileSlice) -> crate::Result> { + Ok(DynamicFastFieldReader::Bitpacked( + BitpackedFastFieldReader::open(file)?, + )) + } +} + +impl FastFieldReader for DynamicFastFieldReader { + fn get(&self, doc: DocId) -> Item { + match self { + Self::Bitpacked(reader) => reader.get(doc), + } + } + fn get_range(&self, start: DocId, output: &mut [Item]) { + match self { + Self::Bitpacked(reader) => reader.get_range(start, output), + } + } + fn min_value(&self) -> Item { + match self { + Self::Bitpacked(reader) => reader.min_value(), + } + } + fn max_value(&self) -> Item { + match self { + Self::Bitpacked(reader) => reader.max_value(), + } + } +} + /// Trait for accessing a fastfield. /// /// Depending on the field type, a different /// fast field is required. #[derive(Clone)] -pub struct FastFieldReader { +pub struct BitpackedFastFieldReader { bytes: OwnedBytes, bit_unpacker: BitUnpacker, min_value_u64: u64, @@ -27,7 +109,7 @@ pub struct FastFieldReader { _phantom: PhantomData, } -impl FastFieldReader { +impl BitpackedFastFieldReader { /// Opens a fast field given a file.
pub fn open(file: FileSlice) -> crate::Result { let mut bytes = file.read_bytes()?; @@ -36,7 +118,7 @@ impl FastFieldReader { let max_value = min_value + amplitude; let num_bits = compute_num_bits(amplitude); let bit_unpacker = BitUnpacker::new(num_bits); - Ok(FastFieldReader { + Ok(BitpackedFastFieldReader { bytes, min_value_u64: min_value, max_value_u64: max_value, @@ -44,19 +126,6 @@ impl FastFieldReader { _phantom: PhantomData, }) } - - /// Return the value associated to the given document. - /// - /// This accessor should return as fast as possible. - /// - /// # Panics - /// - /// May panic if `doc` is greater than the segment - // `maxdoc`. - pub fn get(&self, doc: DocId) -> Item { - self.get_u64(u64::from(doc)) - } - pub(crate) fn get_u64(&self, doc: u64) -> Item { Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes)) } @@ -78,6 +147,20 @@ impl FastFieldReader { *out = self.get_u64(start + (i as u64)); } } +} + +impl FastFieldReader for BitpackedFastFieldReader { + /// Return the value associated to the given document. + /// + /// This accessor should return as fast as possible. + /// + /// # Panics + /// + /// May panic if `doc` is greater than the segment + // `maxdoc`. + fn get(&self, doc: DocId) -> Item { + self.get_u64(u64::from(doc)) + } /// Fills an output buffer with the fast field values /// associated with the `DocId` going from @@ -92,7 +175,7 @@ impl FastFieldReader { /// /// May panic if `start + output.len()` is greater than /// the segment's `maxdoc`. - pub fn get_range(&self, start: DocId, output: &mut [Item]) { + fn get_range(&self, start: DocId, output: &mut [Item]) { self.get_range_u64(u64::from(start), output); } @@ -101,7 +184,7 @@ impl FastFieldReader { /// The max value does not take in account of possible /// deleted document, and should be considered as an upper bound /// of the actual maximum value. 
- pub fn min_value(&self) -> Item { + fn min_value(&self) -> Item { Item::from_u64(self.min_value_u64) } @@ -110,13 +193,13 @@ impl FastFieldReader { /// The max value does not take in account of possible /// deleted document, and should be considered as an upper bound /// of the actual maximum value. - pub fn max_value(&self) -> Item { + fn max_value(&self) -> Item { Item::from_u64(self.max_value_u64) } } -impl From> for FastFieldReader { - fn from(vals: Vec) -> FastFieldReader { +impl From> for BitpackedFastFieldReader { + fn from(vals: Vec) -> BitpackedFastFieldReader { let mut schema_builder = Schema::builder(); let field = schema_builder.add_u64_field("field", FAST); let schema = schema_builder.build(); @@ -126,7 +209,7 @@ impl From> for FastFieldReader { let write: WritePtr = directory .open_write(path) .expect("With a RamDirectory, this should never fail."); - let mut serializer = FastFieldSerializer::from_write(write) + let mut serializer = CompositeFastFieldSerializer::from_write(write) .expect("With a RamDirectory, this should never fail."); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); { @@ -148,6 +231,6 @@ impl From> for FastFieldReader { let field_file = composite_file .open_read(field) .expect("File component not found"); - FastFieldReader::open(field_file).unwrap() + BitpackedFastFieldReader::open(field_file).unwrap() } } diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 81024694c..af8bdf949 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -1,13 +1,16 @@ use crate::common::CompositeFile; use crate::directory::FileSlice; use crate::fastfield::MultiValuedFastFieldReader; +use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError}; use crate::fastfield::{BytesFastFieldReader, FastValue}; -use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader}; use crate::schema::{Cardinality, Field, FieldType, Schema}; use crate::space_usage::PerFieldSpaceUsage; use 
crate::TantivyError; -/// Provides access to all of the FastFieldReader. +use super::reader::DynamicFastFieldReader; +use super::FastFieldReader; + +/// Provides access to all of the BitpackedFastFieldReader. /// /// Internally, `FastFieldReaders` have preloaded fast field readers, /// and just wraps several `HashMap`. @@ -100,9 +103,9 @@ impl FastFieldReaders { pub(crate) fn typed_fast_field_reader( &self, field: Field, - ) -> crate::Result> { + ) -> crate::Result> { let fast_field_slice = self.fast_field_data(field, 0)?; - FastFieldReader::open(fast_field_slice) + DynamicFastFieldReader::open(fast_field_slice) } pub(crate) fn typed_fast_field_multi_reader( @@ -111,16 +114,16 @@ impl FastFieldReaders { ) -> crate::Result> { let fast_field_slice_idx = self.fast_field_data(field, 0)?; let fast_field_slice_vals = self.fast_field_data(field, 1)?; - let idx_reader = FastFieldReader::open(fast_field_slice_idx)?; - let vals_reader: FastFieldReader = - FastFieldReader::open(fast_field_slice_vals)?; + let idx_reader = BitpackedFastFieldReader::open(fast_field_slice_idx)?; + let vals_reader: BitpackedFastFieldReader = + BitpackedFastFieldReader::open(fast_field_slice_vals)?; Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader)) } /// Returns the `u64` fast field reader reader associated to `field`. /// /// If `field` is not a u64 fast field, this method returns an Error. - pub fn u64(&self, field: Field) -> crate::Result> { + pub fn u64(&self, field: Field) -> crate::Result> { self.check_type(field, FastType::U64, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -129,14 +132,14 @@ impl FastFieldReaders { /// field is effectively of type `u64` or not. /// /// If not, the fastfield reader will returns the u64-value associated to the original FastValue. 
- pub fn u64_lenient(&self, field: Field) -> crate::Result> { + pub fn u64_lenient(&self, field: Field) -> crate::Result> { self.typed_fast_field_reader(field) } /// Returns the `i64` fast field reader reader associated to `field`. /// /// If `field` is not a i64 fast field, this method returns an Error. - pub fn i64(&self, field: Field) -> crate::Result> { + pub fn i64(&self, field: Field) -> crate::Result> { self.check_type(field, FastType::I64, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -144,7 +147,7 @@ impl FastFieldReaders { /// Returns the `i64` fast field reader reader associated to `field`. /// /// If `field` is not a i64 fast field, this method returns an Error. - pub fn date(&self, field: Field) -> crate::Result> { + pub fn date(&self, field: Field) -> crate::Result> { self.check_type(field, FastType::Date, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -152,7 +155,7 @@ impl FastFieldReaders { /// Returns the `f64` fast field reader reader associated to `field`. /// /// If `field` is not a f64 fast field, this method returns an Error. 
+ pub fn f64(&self, field: Field) -> crate::Result> { self.check_type(field, FastType::F64, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -213,7 +216,7 @@ impl FastFieldReaders { ))); } let fast_field_idx_file = self.fast_field_data(field, 0)?; - let idx_reader = FastFieldReader::open(fast_field_idx_file)?; + let idx_reader = BitpackedFastFieldReader::open(fast_field_idx_file)?; let data = self.fast_field_data(field, 1)?; BytesFastFieldReader::open(idx_reader, data) } else { diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index a3841ff06..28199bfa7 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -7,10 +7,10 @@ use std::io::{self, Write}; use tantivy_bitpacker::compute_num_bits; use tantivy_bitpacker::BitPacker; -/// `FastFieldSerializer` is in charge of serializing +/// `CompositeFastFieldSerializer` is in charge of serializing /// fastfields on disk. /// -/// Fast fields are encoded using bit-packing. +/// Fast fields can use different encodings, such as bit-packing. /// /// `FastFieldWriter`s are in charge of pushing the data to /// the serializer. @@ -27,16 +27,16 @@ use tantivy_bitpacker::BitPacker; /// * ... /// * `close_field()` /// * `close()` -pub struct FastFieldSerializer { +pub struct CompositeFastFieldSerializer { composite_write: CompositeWrite, } -impl FastFieldSerializer { +impl CompositeFastFieldSerializer { /// Constructor - pub fn from_write(write: WritePtr) -> io::Result { + pub fn from_write(write: WritePtr) -> io::Result { // just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write); - Ok(FastFieldSerializer { composite_write }) + Ok(CompositeFastFieldSerializer { composite_write }) } /// Start serializing a new u64 fast field @@ -45,7 +45,7 @@ impl FastFieldSerializer { field: Field, min_value: u64, max_value: u64, - ) -> io::Result>> { + ) -> io::Result>> { self.new_u64_fast_field_with_idx(field, min_value, max_value, 0) } @@ -56,9 +56,9 @@ impl FastFieldSerializer { min_value: u64, max_value: u64, idx: usize, - ) -> io::Result>> { + ) -> io::Result>> { let field_write = self.composite_write.for_field_with_idx(field, idx); - FastSingleFieldSerializer::open(field_write, min_value, max_value) + DynamicFastFieldSerializer::open(field_write, min_value, max_value) } /// Start serializing a new [u8] fast field @@ -79,14 +79,111 @@ impl FastFieldSerializer { } } -pub struct FastSingleFieldSerializer<'a, W: Write> { +#[derive(Debug, Clone)] +pub struct EstimationStats { + min_value: u64, + max_value: u64, +} +/// The FastFieldSerializer trait is the common interface +/// implemented by every fastfield serializer variant. +/// +/// `DynamicFastFieldSerializer` is the enum wrapping all variants. +/// It is used to create a serializer instance. +pub trait FastFieldSerializer { + /// add value to serializer + fn add_val(&mut self, val: u64) -> io::Result<()>; + /// finish serializing a field. + fn close_field(self) -> io::Result<()>; +} + +/// The FastFieldSerializerEstimate trait is required on all variants +/// of fast field compressions, to decide which one to choose. +pub trait FastFieldSerializerEstimate { + /// returns an estimate of the compression ratio.
+ fn estimate( + /*fastfield_accessor: impl FastFieldReader,*/ stats: EstimationStats, + ) -> (f32, &'static str); + /// the unique name of the compressor + fn name() -> &'static str; +} + +pub enum DynamicFastFieldSerializer<'a, W: Write> { + Bitpacked(BitpackedFastFieldSerializer<'a, W>), +} + +impl<'a, W: Write> DynamicFastFieldSerializer<'a, W> { + /// Creates a new fast field serializer. + /// + /// The serializer in fact encodes the values by bitpacking + /// `(val - min_value)`. + /// + /// It requires a `min_value` and a `max_value` to + /// compute the minimum number of bits required to encode + /// values. + pub fn open( + write: &'a mut W, + min_value: u64, + max_value: u64, + ) -> io::Result> { + let stats = EstimationStats { + min_value, + max_value, + }; + let (_ratio, name) = ( + BitpackedFastFieldSerializer::>::estimate(stats), + BitpackedFastFieldSerializer::>::name(), + ); + Self::open_from_name(write, min_value, max_value, name) + } + + /// Creates a new fast field serializer. + /// + /// The serializer in fact encodes the values by bitpacking + /// `(val - min_value)`. + /// + /// It requires a `min_value` and a `max_value` to + /// compute the minimum number of bits required to encode + /// values. + pub fn open_from_name( + write: &'a mut W, + min_value: u64, + max_value: u64, + name: &str, + ) -> io::Result> { + // Weirdly the W generic on BitpackedFastFieldSerializer needs to be set, + // although name() doesn't use it + let variant = if name == BitpackedFastFieldSerializer::>::name() { + DynamicFastFieldSerializer::Bitpacked(BitpackedFastFieldSerializer::open( + write, min_value, max_value, + )?)
+ } else { + panic!("unknown fastfield serializer {}", name); + }; + + Ok(variant) + } +} +impl<'a, W: Write> FastFieldSerializer for DynamicFastFieldSerializer<'a, W> { + fn add_val(&mut self, val: u64) -> io::Result<()> { + match self { + Self::Bitpacked(serializer) => serializer.add_val(val), + } + } + fn close_field(self) -> io::Result<()> { + match self { + Self::Bitpacked(serializer) => serializer.close_field(), + } + } +} + +pub struct BitpackedFastFieldSerializer<'a, W: Write> { bit_packer: BitPacker, write: &'a mut W, min_value: u64, num_bits: u8, } -impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { +impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> { /// Creates a new fast field serializer. /// /// The serializer in fact encode the values by bitpacking @@ -99,34 +196,51 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { write: &'a mut W, min_value: u64, max_value: u64, - ) -> io::Result> { + ) -> io::Result> { assert!(min_value <= max_value); min_value.serialize(write)?; let amplitude = max_value - min_value; amplitude.serialize(write)?; let num_bits = compute_num_bits(amplitude); let bit_packer = BitPacker::new(); - Ok(FastSingleFieldSerializer { + Ok(BitpackedFastFieldSerializer { bit_packer, write, min_value, num_bits, }) } +} +impl<'a, W: 'a + Write> FastFieldSerializer for BitpackedFastFieldSerializer<'a, W> { /// Pushes a new value to the currently open u64 fast field. 
- pub fn add_val(&mut self, val: u64) -> io::Result<()> { + fn add_val(&mut self, val: u64) -> io::Result<()> { let val_to_write: u64 = val - self.min_value; self.bit_packer .write(val_to_write, self.num_bits, &mut self.write)?; Ok(()) } - - pub fn close_field(mut self) -> io::Result<()> { + fn close_field(mut self) -> io::Result<()> { self.bit_packer.close(&mut self.write) } } +impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> { + fn estimate( + /*_fastfield_accessor: impl FastFieldReader, */ stats: EstimationStats, + ) -> (f32, &'static str) { + let amplitude = stats.max_value - stats.min_value; + let num_bits = compute_num_bits(amplitude); + let num_bits_uncompressed = 64; + let ratio = num_bits as f32 / num_bits_uncompressed as f32; + let name = Self::name(); + (ratio, name) + } + fn name() -> &'static str { + "Bitpacked" + } +} + pub struct FastBytesFieldSerializer<'a, W: Write> { write: &'a mut W, } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index c4b6c4d21..e99fc7148 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,6 +1,7 @@ use super::multivalued::MultiValuedFastFieldWriter; use crate::common; -use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer}; +use crate::fastfield::serializer::FastFieldSerializer; +use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer}; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::UnorderedTermId; use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema}; @@ -148,7 +149,7 @@ impl FastFieldsWriter { /// order to the fast field serializer. pub fn serialize( &self, - serializer: &mut FastFieldSerializer, + serializer: &mut CompositeFastFieldSerializer, mapping: &HashMap>, doc_id_map: Option<&DocIdMapping>, ) -> io::Result<()> { @@ -272,7 +273,7 @@ impl IntFastFieldWriter { /// Push the fast fields value to the `FastFieldWriter`. 
pub fn serialize( &self, - serializer: &mut FastFieldSerializer, + serializer: &mut CompositeFastFieldSerializer, doc_id_map: Option<&DocIdMapping>, ) -> io::Result<()> { let (min, max) = if self.val_min > self.val_max { diff --git a/src/indexer/doc_id_mapping.rs b/src/indexer/doc_id_mapping.rs index ba9b2cb10..dd8f9aa7d 100644 --- a/src/indexer/doc_id_mapping.rs +++ b/src/indexer/doc_id_mapping.rs @@ -8,7 +8,6 @@ use crate::{ DocId, IndexSortByField, Order, TantivyError, }; use std::cmp::Reverse; - /// Struct to provide mapping from old doc_id to new doc_id and vice versa pub struct DocIdMapping { new_doc_id_to_old: Vec, @@ -92,6 +91,7 @@ pub(crate) fn get_doc_id_mapping_from_field( #[cfg(test)] mod tests_indexsorting { + use crate::fastfield::FastFieldReader; use crate::{collector::TopDocs, query::QueryParser, schema::*}; use crate::{schema::Schema, DocAddress}; use crate::{Index, IndexSettings, IndexSortByField, Order}; diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 087288d8c..fdd63bbf4 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,6 +1,8 @@ use super::doc_id_mapping::DocIdMapping; use crate::error::DataCorruption; +use crate::fastfield::CompositeFastFieldSerializer; use crate::fastfield::DeleteBitSet; +use crate::fastfield::DynamicFastFieldReader; use crate::fastfield::FastFieldReader; use crate::fastfield::FastFieldSerializer; use crate::fastfield::MultiValuedFastFieldReader; @@ -87,7 +89,7 @@ pub struct IndexMerger { } fn compute_min_max_val( - u64_reader: &FastFieldReader, + u64_reader: &impl FastFieldReader, max_doc: DocId, delete_bitset_opt: Option<&DeleteBitSet>, ) -> Option<(u64, u64)> { @@ -265,7 +267,7 @@ impl IndexMerger { fn write_fast_fields( &self, - fast_field_serializer: &mut FastFieldSerializer, + fast_field_serializer: &mut CompositeFastFieldSerializer, mut term_ord_mappings: HashMap, doc_id_mapping: &Option>, ) -> crate::Result<()> { @@ -315,11 +317,11 @@ impl IndexMerger { fn write_single_fast_field( 
&self, field: Field, - fast_field_serializer: &mut FastFieldSerializer, + fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &Option>, ) -> crate::Result<()> { let (min_value, max_value) = self.readers.iter().map(|reader|{ - let u64_reader: FastFieldReader = reader + let u64_reader: DynamicFastFieldReader = reader .fast_fields() .typed_fast_field_reader(field) .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); @@ -334,7 +336,7 @@ impl IndexMerger { .readers .iter() .map(|reader| { - let u64_reader: FastFieldReader = reader + let u64_reader: DynamicFastFieldReader = reader .fast_fields() .typed_fast_field_reader(field) .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); @@ -362,7 +364,7 @@ impl IndexMerger { let u64_readers = self.readers.iter() .filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0)) .map(|reader|{ - let u64_reader: FastFieldReader = reader + let u64_reader: DynamicFastFieldReader = reader .fast_fields() .typed_fast_field_reader(field) .expect("Failed to find a reader for single fast field. 
This is a tantivy bug and it should never happen."); @@ -413,7 +415,7 @@ impl IndexMerger { pub(crate) fn get_sort_field_accessor<'b>( reader: &SegmentReader, sort_by_field: &'b IndexSortByField, - ) -> crate::Result> { + ) -> crate::Result> { let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required let value_accessor = reader.fast_fields().u64_lenient(field_id)?; Ok(value_accessor) @@ -422,7 +424,12 @@ impl IndexMerger { pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>( &'a self, sort_by_field: &'b IndexSortByField, - ) -> crate::Result, FastFieldReader)>> { + ) -> crate::Result< + Vec<( + SegmentReaderWithOrdinal<'a>, + impl FastFieldReader + Clone, + )>, + > { let reader_and_field_accessors = self .readers .iter() @@ -491,7 +498,7 @@ impl IndexMerger { // is used to index the reader_and_field_accessors vec. fn write_1_n_fast_field_idx_generic( field: Field, - fast_field_serializer: &mut FastFieldSerializer, + fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &Option>, reader_and_field_accessors: &[(&SegmentReader, impl MultiValueLength)], ) -> crate::Result<()> { @@ -546,7 +553,7 @@ impl IndexMerger { fn write_multi_value_fast_field_idx( &self, field: Field, - fast_field_serializer: &mut FastFieldSerializer, + fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &Option>, ) -> crate::Result<()> { let reader_and_field_accessors = self.readers.iter().map(|reader|{ @@ -568,7 +575,7 @@ impl IndexMerger { &self, field: Field, term_ordinal_mappings: &TermOrdinalMapping, - fast_field_serializer: &mut FastFieldSerializer, + fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &Option>, ) -> crate::Result<()> { // Multifastfield consists in 2 fastfields. 
@@ -631,7 +638,7 @@ impl IndexMerger { fn write_multi_fast_field( &self, field: Field, - fast_field_serializer: &mut FastFieldSerializer, + fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &Option>, ) -> crate::Result<()> { // Multifastfield consists in 2 fastfields. @@ -718,7 +725,7 @@ impl IndexMerger { fn write_bytes_fast_field( &self, field: Field, - fast_field_serializer: &mut FastFieldSerializer, + fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &Option>, ) -> crate::Result<()> { let reader_and_field_accessors = self @@ -1088,6 +1095,7 @@ mod tests { use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector}; use crate::collector::{Count, FacetCollector}; use crate::core::Index; + use crate::fastfield::FastFieldReader; use crate::query::AllQuery; use crate::query::BooleanQuery; use crate::query::Scorer; diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index 0544f6234..b0d98bb22 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -1,5 +1,6 @@ #[cfg(test)] mod tests { + use crate::fastfield::FastFieldReader; use crate::{ collector::TopDocs, schema::{Cardinality, TextFieldIndexing}, diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index d1a119b80..63ccb530c 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -1,6 +1,6 @@ use crate::core::Segment; use crate::core::SegmentComponent; -use crate::fastfield::FastFieldSerializer; +use crate::fastfield::CompositeFastFieldSerializer; use crate::fieldnorm::FieldNormsSerializer; use crate::postings::InvertedIndexSerializer; use crate::store::StoreWriter; @@ -10,7 +10,7 @@ use crate::store::StoreWriter; pub struct SegmentSerializer { segment: Segment, pub(crate) store_writer: StoreWriter, - fast_field_serializer: FastFieldSerializer, + fast_field_serializer: 
CompositeFastFieldSerializer, fieldnorms_serializer: Option, postings_serializer: InvertedIndexSerializer, } @@ -33,7 +33,7 @@ impl SegmentSerializer { let store_write = segment.open_write(store_component)?; let fast_field_write = segment.open_write(SegmentComponent::FastFields)?; - let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?; + let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?; let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?; let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; @@ -68,7 +68,7 @@ impl SegmentSerializer { } /// Accessor to the `FastFieldSerializer`. - pub fn get_fast_field_serializer(&mut self) -> &mut FastFieldSerializer { + pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer { &mut self.fast_field_serializer } diff --git a/src/lib.rs b/src/lib.rs index 2f696efd1..fbc875a2e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -291,6 +291,7 @@ mod tests { use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::core::SegmentReader; use crate::docset::{DocSet, TERMINATED}; + use crate::fastfield::FastFieldReader; use crate::query::BooleanQuery; use crate::schema::*; use crate::DocAddress;