diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index 01f3cc9d5..f6ac5dcfb 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -7,11 +7,12 @@ // Of course, you can have a look at the tantivy's built-in collectors // such as the `CountCollector` for more examples. +use std::sync::Arc; + use fastfield_codecs::Column; // --- // Importing tantivy... use tantivy::collector::{Collector, SegmentCollector}; -use tantivy::fastfield::DynamicFastFieldReader; use tantivy::query::QueryParser; use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT}; use tantivy::{doc, Index, Score, SegmentReader}; @@ -96,7 +97,7 @@ impl Collector for StatsCollector { } struct StatsSegmentCollector { - fast_field_reader: DynamicFastFieldReader, + fast_field_reader: Arc>, stats: Stats, } diff --git a/examples/warmer.rs b/examples/warmer.rs index 2f5ee56d2..6b8c2830f 100644 --- a/examples/warmer.rs +++ b/examples/warmer.rs @@ -2,7 +2,6 @@ use std::cmp::Reverse; use std::collections::{HashMap, HashSet}; use std::sync::{Arc, RwLock, Weak}; -use fastfield_codecs::Column; use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::{Field, Schema, FAST, TEXT}; diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index a62bee5b0..cc4893421 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -1,3 +1,5 @@ +use std::marker::PhantomData; + pub trait Column { /// Return the value associated to the given idx. /// @@ -42,8 +44,103 @@ pub trait Column { fn max_value(&self) -> T; fn num_vals(&self) -> u64; + /// Returns a iterator over the data fn iter<'a>(&'a self) -> Box + 'a> { Box::new((0..self.num_vals()).map(|idx| self.get_val(idx))) } } + +struct VecColumn<'a>(&'a [u64]); +impl<'a> Column for VecColumn<'a> { + fn get_val(&self, position: u64) -> u64 { + self.0[position as usize] + } + + fn iter<'b>(&'b self) -> Box + 'b> { + Box::new(self.0.iter().cloned()) + } + + fn min_value(&self) -> u64 { + self.0.iter().min().cloned().unwrap_or(0) + } + + fn max_value(&self) -> u64 { + self.0.iter().max().cloned().unwrap_or(0) + } + + fn num_vals(&self) -> u64 { + self.0.len() as u64 + } +} + +impl<'a> From<&'a [u64]> for VecColumn<'a> { + fn from(data: &'a [u64]) -> Self { + Self(data) + } +} + +struct MonotonicMappingColumn { + from_column: C, + monotonic_mapping: T, + _phantom: PhantomData, +} + +/// Creates a view of a column transformed by a monotonic mapping. +pub fn monotonic_map_column( + from_column: C, + monotonic_mapping: T, +) -> impl Column +where + C: Column, + T: Fn(Input) -> Output, +{ + MonotonicMappingColumn { + from_column, + monotonic_mapping, + _phantom: PhantomData, + } +} + +impl Column for MonotonicMappingColumn +where + C: Column, + T: Fn(Input) -> Output, +{ + fn get_val(&self, idx: u64) -> Output { + let from_val = self.from_column.get_val(idx); + (self.monotonic_mapping)(from_val) + } + + fn min_value(&self) -> Output { + let from_min_value = self.from_column.min_value(); + (self.monotonic_mapping)(from_min_value) + } + + fn max_value(&self) -> Output { + let from_max_value = self.from_column.max_value(); + (self.monotonic_mapping)(from_max_value) + } + + fn num_vals(&self) -> u64 { + self.from_column.num_vals() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_monotonic_mapping() { + let vals = &[1u64, 3u64][..]; + let col = VecColumn::from(vals); + let mapped = monotonic_map_column(col, |el| el + 4); + assert_eq!(mapped.min_value(), 5u64); + assert_eq!(mapped.max_value(), 7u64); + assert_eq!(mapped.num_vals(), 2); + assert_eq!(mapped.num_vals(), 2); + assert_eq!(mapped.get_val(0), 5); + assert_eq!(mapped.get_val(0), 7); + } +} diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index ccba50065..e5a713024 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -14,7 +14,7 @@ pub mod linear; mod column; -pub use self::column::Column; +pub use self::column::{monotonic_map_column, Column}; #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] #[repr(u8)] @@ -56,12 +56,12 @@ impl FastFieldCodecType { /// The FastFieldSerializerEstimate trait is required on all variants /// of fast field compressions, to decide which one to choose. -pub trait FastFieldCodec { +pub trait FastFieldCodec: 'static { /// A codex needs to provide a unique name and id, which is /// used for debugging and de/serialization. const CODEC_TYPE: FastFieldCodecType; - type Reader: Column; + type Reader: Column + 'static; /// Reads the metadata and returns the CodecReader fn open_from_bytes(bytes: OwnedBytes) -> io::Result; @@ -90,35 +90,6 @@ pub struct FastFieldStats { pub num_vals: u64, } -struct VecColum<'a>(&'a [u64]); -impl<'a> Column for VecColum<'a> { - fn get_val(&self, position: u64) -> u64 { - self.0[position as usize] - } - - fn iter<'b>(&'b self) -> Box + 'b> { - Box::new(self.0.iter().cloned()) - } - - fn min_value(&self) -> u64 { - self.0.iter().min().cloned().unwrap_or(0) - } - - fn max_value(&self) -> u64 { - self.0.iter().max().cloned().unwrap_or(0) - } - - fn num_vals(&self) -> u64 { - self.0.len() as u64 - } -} - -impl<'a> From<&'a [u64]> for VecColum<'a> { - fn from(data: &'a [u64]) -> Self { - Self(data) - } -} - #[cfg(test)] mod tests { use proptest::arbitrary::any; diff --git a/src/aggregation/agg_req_with_accessor.rs b/src/aggregation/agg_req_with_accessor.rs index 0c84f99e3..6e09749aa 100644 --- a/src/aggregation/agg_req_with_accessor.rs +++ b/src/aggregation/agg_req_with_accessor.rs @@ -4,14 +4,14 @@ use std::rc::Rc; use std::sync::atomic::AtomicU32; use std::sync::Arc; +use fastfield_codecs::Column; + use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation}; use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation}; use super::metric::{AverageAggregation, StatsAggregation}; use super::segment_agg_result::BucketCount; use super::VecWithNames; -use crate::fastfield::{ - type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader, -}; +use crate::fastfield::{type_and_cardinality, FastType, MultiValuedFastFieldReader}; use crate::schema::{Cardinality, Type}; use crate::{InvertedIndexReader, SegmentReader, TantivyError}; @@ -37,10 +37,16 @@ impl AggregationsWithAccessor { #[derive(Clone)] pub(crate) enum FastFieldAccessor { Multi(MultiValuedFastFieldReader), - Single(DynamicFastFieldReader), + Single(Arc>), } impl FastFieldAccessor { - pub fn as_single(&self) -> Option<&DynamicFastFieldReader> { + pub fn as_single(&self) -> Option<&dyn Column> { + match self { + FastFieldAccessor::Multi(_) => None, + FastFieldAccessor::Single(reader) => Some(&**reader), + } + } + pub fn into_single(self) -> Option>> { match self { FastFieldAccessor::Multi(_) => None, FastFieldAccessor::Single(reader) => Some(reader), @@ -118,7 +124,7 @@ impl BucketAggregationWithAccessor { pub struct MetricAggregationWithAccessor { pub metric: MetricAggregation, pub field_type: Type, - pub accessor: DynamicFastFieldReader, + pub accessor: Arc, } impl MetricAggregationWithAccessor { @@ -134,9 +140,8 @@ impl MetricAggregationWithAccessor { Ok(MetricAggregationWithAccessor { accessor: accessor - .as_single() - .expect("unexpected fast field cardinality") - .clone(), + .into_single() + .expect("unexpected fast field cardinality"), field_type, metric: metric.clone(), }) diff --git a/src/aggregation/bucket/histogram/histogram.rs b/src/aggregation/bucket/histogram/histogram.rs index 7f60efcfa..883a1c263 100644 --- a/src/aggregation/bucket/histogram/histogram.rs +++ b/src/aggregation/bucket/histogram/histogram.rs @@ -15,7 +15,6 @@ use crate::aggregation::intermediate_agg_result::{ IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry, }; use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector; -use crate::fastfield::DynamicFastFieldReader; use crate::schema::Type; use crate::{DocId, TantivyError}; @@ -264,7 +263,7 @@ impl SegmentHistogramCollector { req: &HistogramAggregation, sub_aggregation: &AggregationsWithAccessor, field_type: Type, - accessor: &DynamicFastFieldReader, + accessor: &dyn Column, ) -> crate::Result { req.validate()?; let min = f64_from_fastfield_u64(accessor.min_value(), &field_type); diff --git a/src/aggregation/bucket/range.rs b/src/aggregation/bucket/range.rs index 1019c4294..d3e7ec71a 100644 --- a/src/aggregation/bucket/range.rs +++ b/src/aggregation/bucket/range.rs @@ -1,7 +1,6 @@ use std::fmt::Debug; use std::ops::Range; -use fastfield_codecs::Column; use fnv::FnvHashMap; use serde::{Deserialize, Serialize}; diff --git a/src/aggregation/metric/average.rs b/src/aggregation/metric/average.rs index f10f713ee..206bb7607 100644 --- a/src/aggregation/metric/average.rs +++ b/src/aggregation/metric/average.rs @@ -4,7 +4,6 @@ use fastfield_codecs::Column; use serde::{Deserialize, Serialize}; use crate::aggregation::f64_from_fastfield_u64; -use crate::fastfield::DynamicFastFieldReader; use crate::schema::Type; use crate::DocId; @@ -58,7 +57,7 @@ impl SegmentAverageCollector { data: Default::default(), } } - pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader) { + pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column) { let mut iter = doc.chunks_exact(4); for docs in iter.by_ref() { let val1 = field.get_val(docs[0] as u64); diff --git a/src/aggregation/metric/stats.rs b/src/aggregation/metric/stats.rs index 702c09e51..924bc634c 100644 --- a/src/aggregation/metric/stats.rs +++ b/src/aggregation/metric/stats.rs @@ -2,7 +2,6 @@ use fastfield_codecs::Column; use serde::{Deserialize, Serialize}; use crate::aggregation::f64_from_fastfield_u64; -use crate::fastfield::DynamicFastFieldReader; use crate::schema::Type; use crate::{DocId, TantivyError}; @@ -164,7 +163,7 @@ impl SegmentStatsCollector { stats: IntermediateStats::default(), } } - pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader) { + pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column) { let mut iter = doc.chunks_exact(4); for docs in iter.by_ref() { let val1 = field.get_val(docs[0] as u64); diff --git a/src/aggregation/segment_agg_result.rs b/src/aggregation/segment_agg_result.rs index fe0740089..406791fc8 100644 --- a/src/aggregation/segment_agg_result.rs +++ b/src/aggregation/segment_agg_result.rs @@ -185,10 +185,10 @@ impl SegmentMetricResultCollector { pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) { match self { SegmentMetricResultCollector::Average(avg_collector) => { - avg_collector.collect_block(doc, &metric.accessor); + avg_collector.collect_block(doc, &*metric.accessor); } SegmentMetricResultCollector::Stats(stats_collector) => { - stats_collector.collect_block(doc, &metric.accessor); + stats_collector.collect_block(doc, &*metric.accessor); } } } diff --git a/src/collector/filter_collector_wrapper.rs b/src/collector/filter_collector_wrapper.rs index 487385b28..323bddc09 100644 --- a/src/collector/filter_collector_wrapper.rs +++ b/src/collector/filter_collector_wrapper.rs @@ -10,11 +10,12 @@ // --- // Importing tantivy... use std::marker::PhantomData; +use std::sync::Arc; use fastfield_codecs::Column; use crate::collector::{Collector, SegmentCollector}; -use crate::fastfield::{DynamicFastFieldReader, FastValue}; +use crate::fastfield::FastValue; use crate::schema::Field; use crate::{Score, SegmentReader, TantivyError}; @@ -160,7 +161,7 @@ where TPredicate: 'static, TPredicateValue: FastValue, { - fast_field_reader: DynamicFastFieldReader, + fast_field_reader: Arc>, segment_collector: TSegmentCollector, predicate: TPredicate, t_predicate_value: PhantomData, diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs index e6d7fd41b..117f4f8df 100644 --- a/src/collector/histogram_collector.rs +++ b/src/collector/histogram_collector.rs @@ -1,8 +1,10 @@ +use std::sync::Arc; + use fastdivide::DividerU64; use fastfield_codecs::Column; use crate::collector::{Collector, SegmentCollector}; -use crate::fastfield::{DynamicFastFieldReader, FastValue}; +use crate::fastfield::FastValue; use crate::schema::{Field, Type}; use crate::{DocId, Score}; @@ -85,7 +87,7 @@ impl HistogramComputer { } pub struct SegmentHistogramCollector { histogram_computer: HistogramComputer, - ff_reader: DynamicFastFieldReader, + ff_reader: Arc>, } impl SegmentCollector for SegmentHistogramCollector { diff --git a/src/collector/tests.rs b/src/collector/tests.rs index 102399716..690110840 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -1,9 +1,11 @@ +use std::sync::Arc; + use fastfield_codecs::Column; use super::*; use crate::collector::{Count, FilterCollector, TopDocs}; use crate::core::SegmentReader; -use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader}; +use crate::fastfield::BytesFastFieldReader; use crate::query::{AllQuery, QueryParser}; use crate::schema::{Field, Schema, FAST, TEXT}; use crate::time::format_description::well_known::Rfc3339; @@ -158,7 +160,7 @@ pub struct FastFieldTestCollector { pub struct FastFieldSegmentCollector { vals: Vec, - reader: DynamicFastFieldReader, + reader: Arc>, } impl FastFieldTestCollector { diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index eb4443e04..2ee31934a 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -1,6 +1,7 @@ use std::collections::BinaryHeap; use std::fmt; use std::marker::PhantomData; +use std::sync::Arc; use fastfield_codecs::Column; @@ -11,7 +12,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector; use crate::collector::{ CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector, }; -use crate::fastfield::{DynamicFastFieldReader, FastValue}; +use crate::fastfield::FastValue; use crate::query::Weight; use crate::schema::Field; use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError}; @@ -131,7 +132,7 @@ impl fmt::Debug for TopDocs { } struct ScorerByFastFieldReader { - ff_reader: DynamicFastFieldReader, + ff_reader: Arc>, } impl CustomSegmentScorer for ScorerByFastFieldReader { @@ -409,7 +410,6 @@ impl TopDocs { /// # use tantivy::query::QueryParser; /// use tantivy::SegmentReader; /// use tantivy::collector::TopDocs; - /// use tantivy::fastfield::Column; /// use tantivy::schema::Field; /// /// fn create_schema() -> Schema { @@ -517,7 +517,6 @@ impl TopDocs { /// use tantivy::SegmentReader; /// use tantivy::collector::TopDocs; /// use tantivy::schema::Field; - /// use fastfield_codecs::Column; /// /// # fn create_schema() -> Schema { /// # let mut schema_builder = Schema::builder(); diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs index 859184887..cf89fdc48 100644 --- a/src/fastfield/bytes/reader.rs +++ b/src/fastfield/bytes/reader.rs @@ -1,7 +1,9 @@ +use std::sync::Arc; + use fastfield_codecs::Column; use crate::directory::{FileSlice, OwnedBytes}; -use crate::fastfield::{DynamicFastFieldReader, MultiValueLength}; +use crate::fastfield::MultiValueLength; use crate::DocId; /// Reader for byte array fast fields @@ -16,13 +18,13 @@ use crate::DocId; /// and the start index for the next document, and keeping the bytes in between. #[derive(Clone)] pub struct BytesFastFieldReader { - idx_reader: DynamicFastFieldReader, + idx_reader: Arc>, values: OwnedBytes, } impl BytesFastFieldReader { pub(crate) fn open( - idx_reader: DynamicFastFieldReader, + idx_reader: Arc>, values_file: FileSlice, ) -> crate::Result { let values = values_file.read_bytes()?; diff --git a/src/fastfield/gcd.rs b/src/fastfield/gcd.rs index 1e5fa1a19..abcf535f4 100644 --- a/src/fastfield/gcd.rs +++ b/src/fastfield/gcd.rs @@ -3,20 +3,11 @@ use std::num::NonZeroU64; use common::BinarySerializable; use fastdivide::DividerU64; -use fastfield_codecs::{Column, FastFieldCodec}; +use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec}; use ownedbytes::OwnedBytes; pub const GCD_DEFAULT: u64 = 1; -/// Wrapper for accessing a fastfield. -/// -/// Holds the data and the codec to the read the data. -#[derive(Clone)] -pub struct GCDReader { - gcd_params: GCDParams, - reader: CodecReader, -} - #[derive(Debug, Clone, Copy)] struct GCDParams { gcd: u64, @@ -24,12 +15,6 @@ struct GCDParams { num_vals: u64, } -impl GCDParams { - pub fn eval(&self, val: u64) -> u64 { - self.min_value + self.gcd * val - } -} - impl BinarySerializable for GCDParams { fn serialize(&self, writer: &mut W) -> io::Result<()> { self.gcd.serialize(writer)?; @@ -52,31 +37,13 @@ impl BinarySerializable for GCDParams { pub fn open_gcd_from_bytes( bytes: OwnedBytes, -) -> io::Result> { +) -> io::Result { let footer_offset = bytes.len() - 24; let (body, mut footer) = bytes.split(footer_offset); let gcd_params = GCDParams::deserialize(&mut footer)?; + let gcd_remap = move |val: u64| gcd_params.min_value + gcd_params.gcd * val; let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?; - Ok(GCDReader { gcd_params, reader }) -} - -impl Column for GCDReader { - #[inline] - fn get_val(&self, doc: u64) -> u64 { - let val = self.reader.get_val(doc); - self.gcd_params.eval(val) - } - - fn min_value(&self) -> u64 { - self.gcd_params.eval(self.reader.min_value()) - } - - fn max_value(&self) -> u64 { - self.gcd_params.eval(self.reader.max_value()) - } - fn num_vals(&self) -> u64 { - self.gcd_params.num_vals - } + Ok(monotonic_map_column(reader, gcd_remap)) } pub fn write_gcd_header( @@ -134,6 +101,7 @@ mod tests { use std::collections::HashMap; use std::num::NonZeroU64; use std::path::Path; + use std::sync::Arc; use std::time::{Duration, SystemTime}; use common::HasLen; @@ -141,11 +109,11 @@ mod tests { use crate::directory::{CompositeFile, RamDirectory, WritePtr}; use crate::fastfield::gcd::compute_gcd; + use crate::fastfield::reader::open_fast_field; use crate::fastfield::serializer::FastFieldCodecEnableCheck; - use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64}; + use crate::fastfield::tests::{encode_decode_fast_field, FIELD, FIELDI64, SCHEMA, SCHEMAI64}; use crate::fastfield::{ - find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecType, - FastFieldsWriter, ALL_CODECS, + find_gcd, CompositeFastFieldSerializer, FastFieldCodecType, FastFieldsWriter, ALL_CODECS, }; use crate::schema::{Cardinality, Schema}; use crate::{DateOptions, DatePrecision, DateTime, Directory}; @@ -187,8 +155,7 @@ mod tests { let file = directory.open_read(path).unwrap(); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(file)?; - + let fast_field_reader: Arc> = open_fast_field(file.read_bytes()?)?; assert_eq!(fast_field_reader.get_val(0), -4000i64); assert_eq!(fast_field_reader.get_val(1), -3000i64); assert_eq!(fast_field_reader.get_val(2), -2000i64); @@ -229,7 +196,7 @@ mod tests { let file = directory.open_read(path).unwrap(); let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(file)?; + let fast_field_reader = open_fast_field::(file.read_bytes()?)?; assert_eq!(fast_field_reader.get_val(0), 1000u64); assert_eq!(fast_field_reader.get_val(1), 2000u64); assert_eq!(fast_field_reader.get_val(2), 3000u64); @@ -258,7 +225,7 @@ mod tests { #[test] pub fn test_fastfield2() { - let test_fastfield = DynamicFastFieldReader::::from(vec![100, 200, 300]); + let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]); assert_eq!(test_fastfield.get_val(0), 100); assert_eq!(test_fastfield.get_val(1), 200); assert_eq!(test_fastfield.get_val(2), 300); @@ -324,7 +291,7 @@ mod tests { let composite_file = CompositeFile::open(&file)?; let file = composite_file.open_read(*FIELD).unwrap(); let len = file.len(); - let test_fastfield = DynamicFastFieldReader::::open(file)?; + let test_fastfield = open_fast_field::(file.read_bytes()?)?; assert_eq!(test_fastfield.get_val(0), time1.truncate(precision)); assert_eq!(test_fastfield.get_val(1), time2.truncate(precision)); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index f3984df6e..15f833939 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -26,9 +26,8 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; -pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT}; +pub(crate) use self::gcd::{find_gcd, GCD_DEFAULT}; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; -pub use self::reader::DynamicFastFieldReader; pub use self::readers::FastFieldReaders; pub(crate) use self::readers::{type_and_cardinality, FastType}; pub use self::serializer::{Column, CompositeFastFieldSerializer, FastFieldStats}; @@ -266,6 +265,7 @@ mod tests { use std::collections::HashMap; use std::ops::Range; use std::path::Path; + use std::sync::Arc; use common::HasLen; use once_cell::sync::Lazy; @@ -275,6 +275,7 @@ mod tests { use super::*; use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; + use crate::fastfield::reader::open_fast_field; use crate::merge_policy::NoMergePolicy; use crate::schema::{Document, Field, Schema, FAST, STRING, TEXT}; use crate::time::OffsetDateTime; @@ -295,9 +296,51 @@ mod tests { pub static FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("field").unwrap()); pub static FIELDI64: Lazy = Lazy::new(|| SCHEMAI64.get_field("field").unwrap()); + /// Encode values using the most appropriate codec and and then loads it + /// right away. + /// + /// This is useful in tests and bench. + pub(crate) fn encode_decode_fast_field( + vals: &[Item], + ) -> Arc> { + let mut schema_builder = Schema::builder(); + let field = schema_builder.add_u64_field("field", FAST); + let schema = schema_builder.build(); + let path = Path::new("__dummy__"); + let directory: RamDirectory = RamDirectory::create(); + { + let write: WritePtr = directory + .open_write(path) + .expect("With a RamDirectory, this should never fail."); + let mut serializer = CompositeFastFieldSerializer::from_write(write) + .expect("With a RamDirectory, this should never fail."); + let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); + { + let fast_field_writer = fast_field_writers + .get_field_writer_mut(field) + .expect("With a RamDirectory, this should never fail."); + for val in vals { + fast_field_writer.add_val(val.to_u64()); + } + } + fast_field_writers + .serialize(&mut serializer, &HashMap::new(), None) + .unwrap(); + serializer.close().unwrap(); + } + let file = directory.open_read(path).expect("Failed to open the file"); + let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file"); + let field_bytes = composite_file + .open_read(field) + .expect("File component not found") + .read_bytes() + .unwrap(); + open_fast_field(field_bytes).unwrap() + } + #[test] pub fn test_fastfield() { - let test_fastfield = DynamicFastFieldReader::::from(vec![100, 200, 300]); + let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]); assert_eq!(test_fastfield.get_val(0u64), 100); assert_eq!(test_fastfield.get_val(1u64), 200); assert_eq!(test_fastfield.get_val(2u64), 300); @@ -328,8 +371,8 @@ mod tests { let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 45); let composite_file = CompositeFile::open(&file)?; - let file = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(file)?; + let fast_field_bytes = composite_file.open_read(*FIELD).unwrap().read_bytes()?; + let fast_field_reader = open_fast_field::(fast_field_bytes)?; assert_eq!(fast_field_reader.get_val(0), 13u64); assert_eq!(fast_field_reader.get_val(1), 14u64); assert_eq!(fast_field_reader.get_val(2), 2u64); @@ -360,8 +403,11 @@ mod tests { assert_eq!(file.len(), 70); { let fast_fields_composite = CompositeFile::open(&file)?; - let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data)?; + let data = fast_fields_composite + .open_read(*FIELD) + .unwrap() + .read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; assert_eq!(fast_field_reader.get_val(0), 4u64); assert_eq!(fast_field_reader.get_val(1), 14_082_001u64); assert_eq!(fast_field_reader.get_val(2), 3_052u64); @@ -396,8 +442,11 @@ mod tests { assert_eq!(file.len(), 43); { let fast_fields_composite = CompositeFile::open(&file).unwrap(); - let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data)?; + let data = fast_fields_composite + .open_read(*FIELD) + .unwrap() + .read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; for doc in 0..10_000 { assert_eq!(fast_field_reader.get_val(doc), 100_000u64); } @@ -428,8 +477,11 @@ mod tests { assert_eq!(file.len(), 80051); { let fast_fields_composite = CompositeFile::open(&file)?; - let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data)?; + let data = fast_fields_composite + .open_read(*FIELD) + .unwrap() + .read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; assert_eq!(fast_field_reader.get_val(0), 0u64); for doc in 1..10_001 { assert_eq!( @@ -469,8 +521,11 @@ mod tests { assert_eq!(file.len(), 75_usize); // linear interpol size after calc improvement { let fast_fields_composite = CompositeFile::open(&file)?; - let data = fast_fields_composite.open_read(i64_field).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data)?; + let data = fast_fields_composite + .open_read(i64_field) + .unwrap() + .read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.max_value(), 9_999i64); @@ -509,8 +564,11 @@ mod tests { let file = directory.open_read(path).unwrap(); { let fast_fields_composite = CompositeFile::open(&file).unwrap(); - let data = fast_fields_composite.open_read(i64_field).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data)?; + let data = fast_fields_composite + .open_read(i64_field) + .unwrap() + .read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; assert_eq!(fast_field_reader.get_val(0), 0i64); } Ok(()) @@ -547,8 +605,11 @@ mod tests { let file = directory.open_read(path)?; { let fast_fields_composite = CompositeFile::open(&file)?; - let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data)?; + let data = fast_fields_composite + .open_read(*FIELD) + .unwrap() + .read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; for a in 0..n { assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]); @@ -607,7 +668,7 @@ mod tests { let mut all = vec![]; for doc in docs { - let mut out = vec![]; + let mut out: Vec = vec![]; ff.get_vals(doc, &mut out); all.extend(out); } @@ -865,7 +926,7 @@ mod tests { #[test] pub fn test_fastfield_bool() { - let test_fastfield = DynamicFastFieldReader::::from(vec![true, false, true, false]); + let test_fastfield = encode_decode_fast_field::(&[true, false, true, false]); assert_eq!(test_fastfield.get_val(0), true); assert_eq!(test_fastfield.get_val(1), false); assert_eq!(test_fastfield.get_val(2), true); @@ -898,8 +959,8 @@ mod tests { let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 44); let composite_file = CompositeFile::open(&file)?; - let file = composite_file.open_read(field).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(file)?; + let data = composite_file.open_read(field).unwrap().read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; assert_eq!(fast_field_reader.get_val(0), true); assert_eq!(fast_field_reader.get_val(1), false); assert_eq!(fast_field_reader.get_val(2), true); @@ -934,8 +995,8 @@ mod tests { let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 56); let composite_file = CompositeFile::open(&file)?; - let file = composite_file.open_read(field).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(file)?; + let data = composite_file.open_read(field).unwrap().read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; for i in 0..25 { assert_eq!(fast_field_reader.get_val(i * 2), true); assert_eq!(fast_field_reader.get_val(i * 2 + 1), false); @@ -968,8 +1029,8 @@ mod tests { let file = directory.open_read(path).unwrap(); assert_eq!(file.len(), 43); let composite_file = CompositeFile::open(&file)?; - let file = composite_file.open_read(field).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(file)?; + let data = composite_file.open_read(field).unwrap().read_bytes()?; + let fast_field_reader = open_fast_field::(data)?; assert_eq!(fast_field_reader.get_val(0), false); Ok(()) @@ -978,32 +1039,17 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use std::collections::HashMap; - use std::path::Path; + use std::sync::Arc; use fastfield_codecs::Column; use test::{self, Bencher}; - use super::tests::{generate_permutation, FIELD, SCHEMA}; - use super::*; - use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; - use crate::fastfield::tests::generate_permutation_gcd; + use crate::fastfield::tests::{ + encode_decode_fast_field, generate_permutation, generate_permutation_gcd, + }; #[bench] - fn bench_intfastfield_linear_veclookup(b: &mut Bencher) { - let permutation = generate_permutation(); - b.iter(|| { - let n = test::black_box(7000u32); - let mut a = 0u64; - for i in (0u32..n / 7).map(|v| v * 7) { - a ^= permutation[i as usize]; - } - a - }); - } - - #[bench] - fn bench_intfastfield_veclookup(b: &mut Bencher) { + fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) { let permutation = generate_permutation(); b.iter(|| { let n = test::black_box(1000u32); @@ -1016,102 +1062,81 @@ mod bench { } #[bench] - fn bench_intfastfield_linear_fflookup(b: &mut Bencher) { - let path = Path::new("test"); + fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) { let permutation = generate_permutation(); - let directory: RamDirectory = RamDirectory::create(); - { - let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); - for &x in &permutation { - fast_field_writers.add_document(&doc!(*FIELD=>x)); + let column: Arc> = encode_decode_fast_field(&permutation); + b.iter(|| { + let n = test::black_box(1000u32); + let mut a = 0u64; + for _ in 0u32..n { + a = column.get_val(a as u64); } - fast_field_writers - .serialize(&mut serializer, &HashMap::new(), None) - .unwrap(); - serializer.close().unwrap(); - } - let file = directory.open_read(&path).unwrap(); - { - let fast_fields_composite = CompositeFile::open(&file).unwrap(); - let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data).unwrap(); + a + }); + } - b.iter(|| { - let n = test::black_box(7000u32); - let mut a = 0u64; - for i in (0u32..n / 7).map(|val| val * 7) { - a ^= fast_field_reader.get_val(i as u64); - } - a - }); - } + #[bench] + fn bench_intfastfield_linear_fflookup(b: &mut Bencher) { + let permutation = generate_permutation(); + let column: Arc> = encode_decode_fast_field(&permutation); + b.iter(|| { + let n = test::black_box(7000u32); + let mut a = 0u64; + for i in (0..n / 7).map(|val| val * 7) { + a += column.get_val(i as u64); + } + a + }); + } + + #[bench] + fn bench_intfastfield_linear_vec(b: &mut Bencher) { + let permutation = generate_permutation(); + b.iter(|| { + let n = test::black_box(7000); + let mut a = 0u64; + for i in (0..n / 7).map(|val| val * 7) { + a += permutation[i]; + } + a + }); } #[bench] fn bench_intfastfield_fflookup(b: &mut Bencher) { - let path = Path::new("test"); let permutation = generate_permutation(); - let directory: RamDirectory = RamDirectory::create(); - { - let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); - for &x in &permutation { - fast_field_writers.add_document(&doc!(*FIELD=>x)); + let column: Arc> = encode_decode_fast_field(&permutation); + b.iter(|| { + let mut a = 0u64; + for i in 0u64..permutation.len() as u64 { + a = column.get_val(i); } - fast_field_writers - .serialize(&mut serializer, &HashMap::new(), None) - .unwrap(); - serializer.close().unwrap(); - } - let file = directory.open_read(&path).unwrap(); - { - let fast_fields_composite = CompositeFile::open(&file).unwrap(); - let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data).unwrap(); - - b.iter(|| { - let mut a = 0u32; - for i in 0u64..permutation.len() as u64 { - a = fast_field_reader.get_val(i) as u32; - } - a - }); - } + a + }); } #[bench] fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) { - let path = Path::new("test"); let permutation = generate_permutation_gcd(); - let directory: RamDirectory = RamDirectory::create(); - { - let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); - for &x in &permutation { - fast_field_writers.add_document(&doc!(*FIELD=>x)); + let column: Arc> = encode_decode_fast_field(&permutation); + b.iter(|| { + let mut a = 0u64; + for i in 0..permutation.len() as u64 { + a += column.get_val(i); } - fast_field_writers - .serialize(&mut serializer, &HashMap::new(), None) - .unwrap(); - serializer.close().unwrap(); - } - let file = directory.open_read(&path).unwrap(); - { - let fast_fields_composite = CompositeFile::open(&file).unwrap(); - let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(data).unwrap(); + a + }); + } - b.iter(|| { - let mut a = 0u32; - for i in 0u32..permutation.len() as u32 { - a = fast_field_reader.get_val(i as u64) as u32; - } - a - }); - } + #[bench] + fn bench_intfastfield_vec(b: &mut Bencher) { + let permutation = generate_permutation_gcd(); + b.iter(|| { + let mut a = 0u64; + for i in 0..permutation.len() { + a += permutation[i as usize] as u64; + } + a + }); } } diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index c7ba3313a..1a203d6ca 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -346,6 +346,7 @@ mod tests { assert!(test_multivalued_no_panic(&ops[..]).is_ok()); } } + #[test] fn test_multivalued_proptest_gcd() { use IndexingOp::*; diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 77fa9e73a..2af69dab1 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -1,8 +1,9 @@ use std::ops::Range; +use std::sync::Arc; use fastfield_codecs::Column; -use crate::fastfield::{DynamicFastFieldReader, FastValue, MultiValueLength}; +use crate::fastfield::{FastValue, MultiValueLength}; use crate::DocId; /// Reader for a multivalued `u64` fast field. @@ -14,14 +15,14 @@ use crate::DocId; /// The `idx_reader` associated, for each document, the index of its first value. #[derive(Clone)] pub struct MultiValuedFastFieldReader { - idx_reader: DynamicFastFieldReader, - vals_reader: DynamicFastFieldReader, + idx_reader: Arc>, + vals_reader: Arc>, } impl MultiValuedFastFieldReader { pub(crate) fn open( - idx_reader: DynamicFastFieldReader, - vals_reader: DynamicFastFieldReader, + idx_reader: Arc>, + vals_reader: Arc>, ) -> MultiValuedFastFieldReader { MultiValuedFastFieldReader { idx_reader, diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 9f12d60a3..1a932b5fc 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,144 +1,65 @@ -use std::collections::HashMap; use std::marker::PhantomData; -use std::path::Path; +use std::sync::Arc; use common::BinarySerializable; -use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader}; -use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader}; -use fastfield_codecs::linear::{LinearCodec, LinearReader}; -use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType}; +use fastfield_codecs::bitpacked::BitpackedCodec; +use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; +use fastfield_codecs::linear::LinearCodec; +use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType}; use super::gcd::open_gcd_from_bytes; use super::FastValue; -use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr}; +use crate::directory::OwnedBytes; use crate::error::DataCorruption; -use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader}; -use crate::schema::{Schema, FAST}; -#[derive(Clone)] -/// DynamicFastFieldReader wraps different readers to access -/// the various encoded fastfield data -pub enum DynamicFastFieldReader { - /// Bitpacked compressed fastfield data. - Bitpacked(FastFieldReaderCodecWrapper), - /// Linear interpolated values + bitpacked - Linear(FastFieldReaderCodecWrapper), - /// Blockwise linear interpolated values + bitpacked - BlockwiseLinear(FastFieldReaderCodecWrapper), - - /// GCD and Bitpacked compressed fastfield data. - BitpackedGCD(FastFieldReaderCodecWrapper>), - /// GCD and Linear interpolated values + bitpacked - LinearGCD(FastFieldReaderCodecWrapper>), - /// GCD and Blockwise linear interpolated values + bitpacked - BlockwiseLinearGCD(FastFieldReaderCodecWrapper>), +fn open_codec_from_bytes( + bytes: OwnedBytes, +) -> crate::Result>> { + let reader = C::open_from_bytes(bytes)?; + Ok(Arc::new(monotonic_map_column(reader, Item::from_u64))) } -impl DynamicFastFieldReader { - /// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data. - pub fn open_from_id( - mut bytes: OwnedBytes, - codec_type: FastFieldCodecType, - ) -> crate::Result> { - let reader = match codec_type { - FastFieldCodecType::Bitpacked => { - DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into()) - } - FastFieldCodecType::Linear => { - DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into()) - } - FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear( - BlockwiseLinearCodec::open_from_bytes(bytes)?.into(), - ), - FastFieldCodecType::Gcd => { - let codec_type = FastFieldCodecType::deserialize(&mut bytes)?; - match codec_type { - FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD( - open_gcd_from_bytes::(bytes)?.into(), - ), - FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD( - open_gcd_from_bytes::(bytes)?.into(), - ), - FastFieldCodecType::BlockwiseLinear => { - DynamicFastFieldReader::BlockwiseLinearGCD( - open_gcd_from_bytes::(bytes)?.into(), - ) - } - FastFieldCodecType::Gcd => { - return Err(DataCorruption::comment_only( - "Gcd codec wrapped into another gcd codec. This combination is not \ - allowed.", - ) - .into()) - } +fn open_codec_with_gcd( + bytes: OwnedBytes, +) -> crate::Result>> { + let reader = open_gcd_from_bytes::(bytes)?; + Ok(Arc::new(monotonic_map_column(reader, Item::from_u64))) +} + +/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data. +fn open_from_id( + mut bytes: OwnedBytes, + codec_type: FastFieldCodecType, +) -> crate::Result>> { + match codec_type { + FastFieldCodecType::Bitpacked => open_codec_from_bytes::(bytes), + FastFieldCodecType::Linear => open_codec_from_bytes::(bytes), + FastFieldCodecType::BlockwiseLinear => { + open_codec_from_bytes::(bytes) + } + FastFieldCodecType::Gcd => { + let codec_type = FastFieldCodecType::deserialize(&mut bytes)?; + match codec_type { + FastFieldCodecType::Bitpacked => open_codec_with_gcd::(bytes), + FastFieldCodecType::Linear => open_codec_with_gcd::(bytes), + FastFieldCodecType::BlockwiseLinear => { + open_codec_with_gcd::(bytes) } + FastFieldCodecType::Gcd => Err(DataCorruption::comment_only( + "Gcd codec wrapped into another gcd codec. This combination is not allowed.", + ) + .into()), } - }; - Ok(reader) - } - - /// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data. - pub fn open(file: FileSlice) -> crate::Result> { - let mut bytes = file.read_bytes()?; - let codec_type = FastFieldCodecType::deserialize(&mut bytes)?; - Self::open_from_id(bytes, codec_type) + } } } -impl Column for DynamicFastFieldReader { - #[inline] - fn get_val(&self, idx: u64) -> Item { - match self { - Self::Bitpacked(reader) => reader.get_val(idx), - Self::Linear(reader) => reader.get_val(idx), - Self::BlockwiseLinear(reader) => reader.get_val(idx), - Self::BitpackedGCD(reader) => reader.get_val(idx), - Self::LinearGCD(reader) => reader.get_val(idx), - Self::BlockwiseLinearGCD(reader) => reader.get_val(idx), - } - } - #[inline] - fn get_range(&self, start: u64, output: &mut [Item]) { - match self { - Self::Bitpacked(reader) => reader.get_range(start, output), - Self::Linear(reader) => reader.get_range(start, output), - Self::BlockwiseLinear(reader) => reader.get_range(start, output), - Self::BitpackedGCD(reader) => reader.get_range(start, output), - Self::LinearGCD(reader) => reader.get_range(start, output), - Self::BlockwiseLinearGCD(reader) => reader.get_range(start, output), - } - } - fn min_value(&self) -> Item { - match self { - Self::Bitpacked(reader) => reader.min_value(), - Self::Linear(reader) => reader.min_value(), - Self::BlockwiseLinear(reader) => reader.min_value(), - Self::BitpackedGCD(reader) => reader.min_value(), - Self::LinearGCD(reader) => reader.min_value(), - Self::BlockwiseLinearGCD(reader) => reader.min_value(), - } - } - fn max_value(&self) -> Item { - match self { - Self::Bitpacked(reader) => reader.max_value(), - Self::Linear(reader) => reader.max_value(), - Self::BlockwiseLinear(reader) => reader.max_value(), - Self::BitpackedGCD(reader) => reader.max_value(), - Self::LinearGCD(reader) => reader.max_value(), - Self::BlockwiseLinearGCD(reader) => reader.max_value(), - } - } - - fn num_vals(&self) -> u64 { - match self { - Self::Bitpacked(reader) => reader.num_vals(), - Self::Linear(reader) => reader.num_vals(), - Self::BlockwiseLinear(reader) => reader.num_vals(), - Self::BitpackedGCD(reader) => reader.num_vals(), - Self::LinearGCD(reader) => reader.num_vals(), - Self::BlockwiseLinearGCD(reader) => reader.num_vals(), - } - } +/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data. +pub fn open_fast_field( + mut bytes: OwnedBytes, +) -> crate::Result>> { + let codec_type = FastFieldCodecType::deserialize(&mut bytes)?; + open_from_id(bytes, codec_type) } /// Wrapper for accessing a fastfield. @@ -239,40 +160,3 @@ impl Column for FastFieldReaderCodecWr self.reader.num_vals() } } - -impl From> for DynamicFastFieldReader { - fn from(vals: Vec) -> DynamicFastFieldReader { - let mut schema_builder = Schema::builder(); - let field = schema_builder.add_u64_field("field", FAST); - let schema = schema_builder.build(); - let path = Path::new("__dummy__"); - let directory: RamDirectory = RamDirectory::create(); - { - let write: WritePtr = directory - .open_write(path) - .expect("With a RamDirectory, this should never fail."); - let mut serializer = CompositeFastFieldSerializer::from_write(write) - .expect("With a RamDirectory, this should never fail."); - let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); - { - let fast_field_writer = fast_field_writers - .get_field_writer_mut(field) - .expect("With a RamDirectory, this should never fail."); - for val in vals { - fast_field_writer.add_val(val.to_u64()); - } - } - fast_field_writers - .serialize(&mut serializer, &HashMap::new(), None) - .unwrap(); - serializer.close().unwrap(); - } - - let file = directory.open_read(path).expect("Failed to open the file"); - let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file"); - let field_file = composite_file - .open_read(field) - .expect("File component not found"); - DynamicFastFieldReader::open(field_file).unwrap() - } -} diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 9f75baafd..e01b7fe3e 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -1,5 +1,9 @@ -use super::reader::DynamicFastFieldReader; +use std::sync::Arc; + +use fastfield_codecs::Column; + use crate::directory::{CompositeFile, FileSlice}; +use crate::fastfield::reader::open_fast_field; use crate::fastfield::{ BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader, }; @@ -109,14 +113,16 @@ impl FastFieldReaders { &self, field: Field, index: usize, - ) -> crate::Result> { + ) -> crate::Result>> { let fast_field_slice = self.fast_field_data(field, index)?; - DynamicFastFieldReader::open(fast_field_slice) + let bytes = fast_field_slice.read_bytes()?; + open_fast_field(bytes) } + pub(crate) fn typed_fast_field_reader( &self, field: Field, - ) -> crate::Result> { + ) -> crate::Result>> { self.typed_fast_field_reader_with_idx(field, 0) } @@ -132,7 +138,7 @@ impl FastFieldReaders { /// Returns the `u64` fast field reader reader associated to `field`. /// /// If `field` is not a u64 fast field, this method returns an Error. - pub fn u64(&self, field: Field) -> crate::Result> { + pub fn u64(&self, field: Field) -> crate::Result>> { self.check_type(field, FastType::U64, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -142,14 +148,14 @@ impl FastFieldReaders { /// /// If not, the fastfield reader will returns the u64-value associated to the original /// FastValue. - pub fn u64_lenient(&self, field: Field) -> crate::Result> { + pub fn u64_lenient(&self, field: Field) -> crate::Result>> { self.typed_fast_field_reader(field) } /// Returns the `i64` fast field reader reader associated to `field`. /// /// If `field` is not a i64 fast field, this method returns an Error. - pub fn i64(&self, field: Field) -> crate::Result> { + pub fn i64(&self, field: Field) -> crate::Result>> { self.check_type(field, FastType::I64, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -157,7 +163,7 @@ impl FastFieldReaders { /// Returns the `date` fast field reader reader associated to `field`. /// /// If `field` is not a date fast field, this method returns an Error. - pub fn date(&self, field: Field) -> crate::Result> { + pub fn date(&self, field: Field) -> crate::Result>> { self.check_type(field, FastType::Date, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -165,7 +171,7 @@ impl FastFieldReaders { /// Returns the `f64` fast field reader reader associated to `field`. /// /// If `field` is not a f64 fast field, this method returns an Error. - pub fn f64(&self, field: Field) -> crate::Result> { + pub fn f64(&self, field: Field) -> crate::Result>> { self.check_type(field, FastType::F64, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -173,7 +179,7 @@ impl FastFieldReaders { /// Returns the `bool` fast field reader reader associated to `field`. /// /// If `field` is not a bool fast field, this method returns an Error. - pub fn bool(&self, field: Field) -> crate::Result> { + pub fn bool(&self, field: Field) -> crate::Result>> { self.check_type(field, FastType::Bool, Cardinality::SingleValue)?; self.typed_fast_field_reader(field) } @@ -241,7 +247,8 @@ impl FastFieldReaders { ))); } let fast_field_idx_file = self.fast_field_data(field, 0)?; - let idx_reader = DynamicFastFieldReader::open(fast_field_idx_file)?; + let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?; + let idx_reader = open_fast_field(fast_field_idx_bytes)?; let data = self.fast_field_data(field, 1)?; BytesFastFieldReader::open(idx_reader, data) } else { diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 85f5e4e4b..63dc2dd5c 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -6,7 +6,7 @@ use fastdivide::DividerU64; pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy}; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; use fastfield_codecs::linear::LinearCodec; -use fastfield_codecs::FastFieldCodecType; +use fastfield_codecs::{monotonic_map_column, FastFieldCodecType}; pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats}; use super::{find_gcd, ALL_CODECS, GCD_DEFAULT}; @@ -136,56 +136,22 @@ impl CompositeFastFieldSerializer { } Self::write_header(field_write, FastFieldCodecType::Gcd)?; - struct GCDWrappedFFAccess { - fastfield_accessor: T, - base_value: u64, - max_value: u64, - num_vals: u64, - gcd: DividerU64, - } - impl Column for GCDWrappedFFAccess { - fn get_val(&self, position: u64) -> u64 { - self.gcd - .divide(self.fastfield_accessor.get_val(position) - self.base_value) - } - fn iter(&self) -> Box + '_> { - Box::new( - self.fastfield_accessor - .iter() - .map(|val| self.gcd.divide(val - self.base_value)), - ) - } - fn min_value(&self) -> u64 { - 0 - } - - fn max_value(&self) -> u64 { - self.max_value - } - - fn num_vals(&self) -> u64 { - self.num_vals - } - } - - let num_vals = fastfield_accessor.num_vals(); let base_value = fastfield_accessor.min_value(); - let max_value = (fastfield_accessor.max_value() - fastfield_accessor.min_value()) / gcd; - let fastfield_accessor = GCDWrappedFFAccess { - fastfield_accessor, - base_value, - max_value, - num_vals, - gcd: DividerU64::divide_by(gcd), - }; + let gcd_divider = DividerU64::divide_by(gcd); + + let divided_fastfield_accessor = monotonic_map_column(fastfield_accessor, |val: u64| { + gcd_divider.divide(val - base_value) + }); + + let num_vals = divided_fastfield_accessor.num_vals(); Self::create_auto_detect_u64_fast_field_with_idx_gcd( self.codec_enable_checker.clone(), field, field_write, - fastfield_accessor, + divided_fastfield_accessor, )?; write_gcd_header(field_write, base_value, gcd, num_vals)?; Ok(()) diff --git a/src/indexer/doc_id_mapping.rs b/src/indexer/doc_id_mapping.rs index 914a144a7..6cba76840 100644 --- a/src/indexer/doc_id_mapping.rs +++ b/src/indexer/doc_id_mapping.rs @@ -143,8 +143,6 @@ pub(crate) fn get_doc_id_mapping_from_field( #[cfg(test)] mod tests_indexsorting { - use fastfield_codecs::Column; - use crate::collector::TopDocs; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::query::QueryParser; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 0bbf2f5b5..f8bc47d7c 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -775,7 +775,6 @@ impl Drop for IndexWriter { mod tests { use std::collections::{HashMap, HashSet}; - use fastfield_codecs::Column; use proptest::prelude::*; use proptest::prop_oneof; use proptest::strategy::Strategy; diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index fa2bac324..412282a18 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -9,8 +9,8 @@ use crate::core::{Segment, SegmentReader}; use crate::docset::{DocSet, TERMINATED}; use crate::error::DataCorruption; use crate::fastfield::{ - AliveBitSet, Column, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldStats, - MultiValueLength, MultiValuedFastFieldReader, + AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldStats, MultiValueLength, + MultiValuedFastFieldReader, }; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; @@ -87,7 +87,7 @@ pub struct IndexMerger { } fn compute_min_max_val( - u64_reader: &impl Column, + u64_reader: &dyn Column, segment_reader: &SegmentReader, ) -> Option<(u64, u64)> { if segment_reader.max_doc() == 0 { @@ -341,12 +341,12 @@ impl IndexMerger { .readers .iter() .filter_map(|reader| { - let u64_reader: DynamicFastFieldReader = + let u64_reader: Arc> = reader.fast_fields().typed_fast_field_reader(field).expect( "Failed to find a reader for single fast field. This is a tantivy bug and \ it should never happen.", ); - compute_min_max_val(&u64_reader, reader) + compute_min_max_val(&*u64_reader, reader) }) .reduce(|a, b| (a.0.min(b.0), a.1.max(b.1))) .expect("Unexpected error, empty readers in IndexMerger"); @@ -355,7 +355,7 @@ impl IndexMerger { .readers .iter() .map(|reader| { - let u64_reader: DynamicFastFieldReader = + let u64_reader: Arc> = reader.fast_fields().typed_fast_field_reader(field).expect( "Failed to find a reader for single fast field. This is a tantivy bug and \ it should never happen.", @@ -372,7 +372,7 @@ impl IndexMerger { #[derive(Clone)] struct SortedDocIdFieldAccessProvider<'a> { doc_id_mapping: &'a SegmentDocIdMapping, - fast_field_readers: &'a Vec>, + fast_field_readers: &'a Vec>>, stats: FastFieldStats, } impl<'a> Column for SortedDocIdFieldAccessProvider<'a> { @@ -443,7 +443,7 @@ impl IndexMerger { pub(crate) fn get_sort_field_accessor( reader: &SegmentReader, sort_by_field: &IndexSortByField, - ) -> crate::Result { + ) -> crate::Result> { let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required let value_accessor = reader.fast_fields().u64_lenient(field_id)?; Ok(value_accessor) @@ -452,7 +452,7 @@ impl IndexMerger { pub(crate) fn get_reader_with_sort_field_accessor( &self, sort_by_field: &IndexSortByField, - ) -> crate::Result> { + ) -> crate::Result)>> { let reader_ordinal_and_field_accessors = self .readers .iter() @@ -618,7 +618,7 @@ impl IndexMerger { .map(|reader| { let u64s_reader: MultiValuedFastFieldReader = reader .fast_fields() - .typed_fast_field_multi_reader(field) + .typed_fast_field_multi_reader::(field) .expect( "Failed to find index for multivalued field. This is a bug in tantivy, \ please report.", @@ -668,7 +668,7 @@ impl IndexMerger { { let mut serialize_vals = fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?; - let mut vals = Vec::with_capacity(100); + let mut vals: Vec = Vec::with_capacity(100); for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() { let term_ordinal_mapping: &[TermOrdinal] = @@ -742,7 +742,7 @@ impl IndexMerger { for reader in &self.readers { let ff_reader: MultiValuedFastFieldReader = reader .fast_fields() - .typed_fast_field_multi_reader(field) + .typed_fast_field_multi_reader::(field) .expect( "Failed to find multivalued fast field reader. This is a bug in tantivy. \ Please report.", @@ -1199,7 +1199,6 @@ impl IndexMerger { #[cfg(test)] mod tests { use byteorder::{BigEndian, ReadBytesExt}; - use fastfield_codecs::Column; use schema::FAST; use crate::collector::tests::{ diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index 8cc1c05d6..127ad192c 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -1,7 +1,5 @@ #[cfg(test)] mod tests { - use fastfield_codecs::Column; - use crate::collector::TopDocs; use crate::core::Index; use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader}; @@ -480,11 +478,12 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench_sorted_index_merge { + use std::sync::Arc; + use fastfield_codecs::Column; use test::{self, Bencher}; use crate::core::Index; - use crate::fastfield::DynamicFastFieldReader; use crate::indexer::merger::IndexMerger; use crate::schema::{Cardinality, NumericOptions, Schema}; use crate::{IndexSettings, IndexSortByField, IndexWriter, Order}; @@ -536,7 +535,7 @@ mod bench_sorted_index_merge { b.iter(|| { let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { let reader = &merger.readers[doc_addr.segment_ord as usize]; - let u64_reader: DynamicFastFieldReader = + let u64_reader: Arc> = reader.fast_fields().typed_fast_field_reader(field).expect( "Failed to find a reader for single fast field. This is a tantivy bug and \ it should never happen.", diff --git a/src/lib.rs b/src/lib.rs index 40ff11028..4c699732e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -421,7 +421,6 @@ pub struct DocAddress { #[cfg(test)] pub mod tests { use common::{BinarySerializable, FixedSize}; - use fastfield_codecs::Column; use rand::distributions::{Bernoulli, Uniform}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; diff --git a/src/store/reader.rs b/src/store/reader.rs index 62afd4c04..2f8a26bc3 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -339,7 +339,7 @@ impl StoreReader { async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult { let cache_key = checkpoint.byte_range.start; if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) { - return Ok(block.clone()); + return Ok(block); } let compressed_block = self diff --git a/src/termdict/sstable_termdict/sstable/delta.rs b/src/termdict/sstable_termdict/sstable/delta.rs index 3551891cd..775ebe09b 100644 --- a/src/termdict/sstable_termdict/sstable/delta.rs +++ b/src/termdict/sstable_termdict/sstable/delta.rs @@ -172,8 +172,7 @@ where TValueReader: value::ValueReader } pub fn suffix(&self) -> &[u8] { - &self - .block_reader + self.block_reader .buffer_from_to(self.suffix_start, self.suffix_end) } diff --git a/src/termdict/sstable_termdict/sstable/sstable_index.rs b/src/termdict/sstable_termdict/sstable/sstable_index.rs index 8258cbe7d..dfcf7a1a4 100644 --- a/src/termdict/sstable_termdict/sstable/sstable_index.rs +++ b/src/termdict/sstable_termdict/sstable/sstable_index.rs @@ -50,7 +50,7 @@ pub struct SSTableIndexBuilder { /// matches `left <= left' < right`. fn find_shorter_str_in_between(left: &mut Vec, right: &[u8]) { assert!(&left[..] < right); - let common_len = common_prefix_len(&left, right); + let common_len = common_prefix_len(left, right); if left.len() == common_len { return; }