Compare commits

..

3 Commits

Author SHA1 Message Date
Paul Masurel 1553901f51 Introducing a column trait 2022-08-27 22:48:05 +02:00
Paul Masurel e8a6e123ae Small refactoring estimate. 2022-08-27 21:53:46 +02:00
Paul Masurel 43a4c8287c Removing Deserializer trait
And renaming the `Serializer` trait `FastFieldCodec`.
2022-08-27 21:11:54 +02:00
36 changed files with 556 additions and 498 deletions

View File

@@ -7,12 +7,11 @@
// Of course, you can have a look at tantivy's built-in collectors
// such as the `CountCollector` for more examples.
use std::sync::Arc;
use fastfield_codecs::Column;
// ---
// Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::DynamicFastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Score, SegmentReader};
@@ -97,7 +96,7 @@ impl Collector for StatsCollector {
}
struct StatsSegmentCollector {
fast_field_reader: Arc<dyn Column<u64>>, fast_field_reader: DynamicFastFieldReader<u64>,
stats: Stats,
}
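The hunk above swaps the collector's stored reader between the concrete `DynamicFastFieldReader<u64>` and a shared `Arc<dyn Column<u64>>` trait object. A minimal sketch of why either works for the segment collector, assuming only the `Column::get_val` accessor shown in this changeset (the `Stats` fields here are illustrative stand-ins, not the example's actual struct):

```rust
use std::sync::Arc;
use fastfield_codecs::Column;

// Illustrative running-stats accumulator (field names assumed).
#[derive(Default)]
struct Stats {
    count: u64,
    sum: u64,
}

// Taking a trait object means the collector no longer cares which
// codec-specific reader backs the fast field.
fn collect(stats: &mut Stats, reader: &Arc<dyn Column<u64>>, doc: u32) {
    stats.count += 1;
    stats.sum += reader.get_val(doc as u64);
}
```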

View File

@@ -2,6 +2,7 @@ use std::cmp::Reverse;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak};
use fastfield_codecs::Column;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, TEXT};

View File

@@ -1,5 +1,3 @@
use std::marker::PhantomData;
pub trait Column<T = u64> {
/// Return the value associated to the given idx.
///
@@ -44,103 +42,8 @@ pub trait Column<T = u64> {
fn max_value(&self) -> T;
fn num_vals(&self) -> u64;
/// Returns an iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}
struct VecColumn<'a>(&'a [u64]);
impl<'a> Column for VecColumn<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
}
fn min_value(&self) -> u64 {
self.0.iter().min().cloned().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.0.iter().max().cloned().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
}
impl<'a> From<&'a [u64]> for VecColumn<'a> {
fn from(data: &'a [u64]) -> Self {
Self(data)
}
}
struct MonotonicMappingColumn<C, T, Input> {
from_column: C,
monotonic_mapping: T,
_phantom: PhantomData<Input>,
}
/// Creates a view of a column transformed by a monotonic mapping.
pub fn monotonic_map_column<C, T, Input, Output>(
from_column: C,
monotonic_mapping: T,
) -> impl Column<Output>
where
C: Column<Input>,
T: Fn(Input) -> Output,
{
MonotonicMappingColumn {
from_column,
monotonic_mapping,
_phantom: PhantomData,
}
}
impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
where
C: Column<Input>,
T: Fn(Input) -> Output,
{
fn get_val(&self, idx: u64) -> Output {
let from_val = self.from_column.get_val(idx);
(self.monotonic_mapping)(from_val)
}
fn min_value(&self) -> Output {
let from_min_value = self.from_column.min_value();
(self.monotonic_mapping)(from_min_value)
}
fn max_value(&self) -> Output {
let from_max_value = self.from_column.max_value();
(self.monotonic_mapping)(from_max_value)
}
fn num_vals(&self) -> u64 {
self.from_column.num_vals()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_monotonic_mapping() {
let vals = &[1u64, 3u64][..];
let col = VecColumn::from(vals);
let mapped = monotonic_map_column(col, |el| el + 4);
assert_eq!(mapped.min_value(), 5u64);
assert_eq!(mapped.max_value(), 7u64);
assert_eq!(mapped.num_vals(), 2);
assert_eq!(mapped.get_val(0), 5);
assert_eq!(mapped.get_val(1), 7);
}
}
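For readers skimming the new `column.rs`: `monotonic_map_column` builds a lazy view, so `min_value`/`max_value` can be remapped directly instead of rescanning the data, which is why the mapping must be monotonic. A small sketch, reusing `VecColumn` and `monotonic_map_column` from the hunk above with a GCD-style affine decode (the constants are made up for illustration):

```rust
#[test]
fn gcd_style_remap() {
    // Stored, compressed representation: small integers.
    let stored = &[0u64, 1, 2, 4][..];
    let col = VecColumn::from(stored);
    // Affine decode `val -> min + gcd * val`, the same shape as the GCD
    // remap used by the fastfield GCD codec later in this changeset.
    let decoded = monotonic_map_column(col, |val| 1_000 + 100 * val);
    assert_eq!(decoded.get_val(3), 1_400);
    // min/max are obtained by mapping the inner column's min/max once.
    assert_eq!(decoded.min_value(), 1_000);
    assert_eq!(decoded.max_value(), 1_400);
}
```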

View File

@@ -14,7 +14,8 @@ pub mod linear;
mod column;
pub use self::column::{monotonic_map_column, Column}; pub use self::column::Column;
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
@@ -56,12 +57,12 @@ impl FastFieldCodecType {
/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldCodec: 'static { pub trait FastFieldCodec {
/// A codec needs to provide a unique name and id, which is
/// used for debugging and de/serialization.
const CODEC_TYPE: FastFieldCodecType;
type Reader: Column<u64> + 'static; type Reader: Column<u64>;
/// Reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
@@ -90,6 +91,48 @@ pub struct FastFieldStats {
pub num_vals: u64,
}
impl<'a> Column for &'a [u64] {
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new((self as &[u64]).iter().cloned())
}
fn min_value(&self) -> u64 {
self.iter().min().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.iter().max().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.len() as u64
}
}
impl Column for Vec<u64> {
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new((self as &[u64]).iter().cloned())
}
fn min_value(&self) -> u64 {
self.iter().min().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.iter().max().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.len() as u64
}
}
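With the blanket implementations above, a plain slice or `Vec<u64>` can be handed to anything expecting a `Column`, which is what lets the tests below drop the `VecColum` wrapper. A hedged usage sketch:

```rust
use fastfield_codecs::Column;

// Generic over any column; works for codec readers and plain vectors alike.
fn summarize(col: &impl Column<u64>) -> (u64, u64, u64) {
    (col.min_value(), col.max_value(), col.num_vals())
}

fn main() {
    let data: Vec<u64> = vec![10, 20, 30];
    // `Vec<u64>` implements `Column` directly in this version of the crate.
    assert_eq!(summarize(&data), (10, 30, 3));
}
```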
#[cfg(test)]
mod tests {
use proptest::arbitrary::any;
@@ -103,10 +146,10 @@ mod tests {
data: &[u64],
name: &str,
) -> Option<(f32, f32)> {
let estimation = Codec::estimate(&VecColum::from(data))?; let estimation = Codec::estimate(&data)?;
let mut out: Vec<u8> = Vec::new();
Codec::serialize(&mut out, &VecColum::from(data)).unwrap(); Codec::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
@@ -153,11 +196,6 @@ mod tests {
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value"));
data_and_names.push((
vec![1572656989877777, 1170935903116329, 720575940379279, 0],
"overflow error",
));
data_and_names
}
@@ -192,7 +230,6 @@ mod tests {
#[test]
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();
let data: VecColum = data.as_slice().into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.01);
@@ -206,9 +243,8 @@ mod tests {
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20]; let data = vec![200, 10, 10, 10, 10, 1000, 20];
let data: VecColum = data.into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.32);
@@ -219,7 +255,6 @@ mod tests {
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
let mut data: Vec<u64> = (200..=20000_u64).collect();
data.push(1_000_000);
let data: VecColum = data.as_slice().into();
// in this case the linear interpolation can't in fact be worse than bitpacking,
// but the estimator adds some threshold, which leads to estimated worse behavior
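The estimate/serialize pair from the trait is exercised the same way throughout these tests: ask the codec for a predicted compression ratio, then serialize and compare. A condensed round-trip sketch under the trait surface shown in this diff (`estimate`, `serialize`, `open_from_bytes`); the `OwnedBytes::new` wrapping is an assumption of this sketch:

```rust
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{Column, FastFieldCodec};
use ownedbytes::OwnedBytes;

fn roundtrip() -> std::io::Result<()> {
    let data: Vec<u64> = (10..=20_000).collect();
    // `None` means the codec refuses this data set outright.
    if let Some(estimated_ratio) = LinearCodec::estimate(&data) {
        let mut out: Vec<u8> = Vec::new();
        LinearCodec::serialize(&mut out, &data)?;
        // Ratio relative to storing raw 8-byte values.
        let actual_ratio = out.len() as f32 / (data.len() as f32 * 8.0);
        let reader = LinearCodec::open_from_bytes(OwnedBytes::new(out))?;
        assert_eq!(reader.get_val(0), 10);
        println!("estimated {estimated_ratio}, actual {actual_ratio}");
    }
    Ok(())
}
```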

View File

@@ -115,9 +115,9 @@ fn diff(val1: u64, val2: u64) -> f64 {
#[inline]
pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
if slope < 0.0 {
first_val.saturating_sub((pos as f32 * -slope) as u64) first_val - (pos as f32 * -slope) as u64
} else {
first_val.saturating_add((pos as f32 * slope) as u64) first_val + (pos as f32 * slope) as u64
}
}
@@ -314,13 +314,6 @@ mod tests {
create_and_validate(&data, "large amplitude");
}
#[test]
fn overflow_error_test() {
let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
create_and_validate(&data, "overflow test");
}
#[test]
fn linear_interpol_fast_concave_data() {
let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
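One side of the `get_calculated_value` hunk uses plain `+`/`-`, the other the saturating variants; the removed "overflow error" test shows why the distinction matters: with large values and an imprecise `f32` slope, the predicted value can land outside `u64`, and plain arithmetic overflows (a panic in debug builds, a silent wrap in release). A standalone demonstration of the saturating variant:

```rust
// The saturating form of the hunk above.
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
    if slope < 0.0 {
        first_val.saturating_sub((pos as f32 * -slope) as u64)
    } else {
        first_val.saturating_add((pos as f32 * slope) as u64)
    }
}

fn main() {
    // A steep negative slope predicts a value far below zero; saturating
    // arithmetic clamps to 0 instead of underflowing.
    let v = get_calculated_value(720_575_940_379_279, 3, -1e15);
    assert_eq!(v, 0);
    // Likewise, a huge positive slope clamps to u64::MAX.
    assert_eq!(get_calculated_value(u64::MAX - 1, 2, 1e18), u64::MAX);
}
```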

View File

@@ -3,33 +3,9 @@ extern crate prettytable;
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats}; use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
use prettytable::{Cell, Row, Table};
struct Data<'a>(&'a [u64]);
impl<'a> Column for Data<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
}
fn min_value(&self) -> u64 {
*self.0.iter().min().unwrap_or(&0)
}
fn max_value(&self) -> u64 {
*self.0.iter().max().unwrap_or(&0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
}
fn main() {
let mut table = Table::new();
@@ -110,11 +86,10 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
pub fn serialize_with_codec<C: FastFieldCodec>(
data: &[u64],
) -> Option<(f32, f32, FastFieldCodecType)> {
let data = Data(data);
let estimation = C::estimate(&data)?;
let mut out = Vec::new();
C::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32; let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
Some((estimation, actual_compression, C::CODEC_TYPE))
}
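A hypothetical call site for `serialize_with_codec`, matching the signature defined just above in this benchmark binary:

```rust
use fastfield_codecs::linear::LinearCodec;

fn main() {
    let data: Vec<u64> = (100..10_000).collect();
    // `None` signals that the codec is not applicable to this data set.
    if let Some((estimate, actual, codec_type)) = serialize_with_codec::<LinearCodec>(&data) {
        println!("{codec_type:?}: estimated {estimate:.3}, actual {actual:.3}");
    }
}
```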

View File

@@ -4,14 +4,14 @@ use std::rc::Rc;
use std::sync::atomic::AtomicU32;
use std::sync::Arc;
use fastfield_codecs::Column;
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, FastType, MultiValuedFastFieldReader}; use crate::fastfield::{
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
};
use crate::schema::{Cardinality, Type};
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
@@ -37,16 +37,10 @@ impl AggregationsWithAccessor {
#[derive(Clone)]
pub(crate) enum FastFieldAccessor {
Multi(MultiValuedFastFieldReader<u64>),
Single(Arc<dyn Column<u64>>), Single(DynamicFastFieldReader<u64>),
}
impl FastFieldAccessor {
pub fn as_single(&self) -> Option<&dyn Column<u64>> { pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
match self {
FastFieldAccessor::Multi(_) => None,
FastFieldAccessor::Single(reader) => Some(&**reader),
}
}
pub fn into_single(self) -> Option<Arc<dyn Column<u64>>> {
match self {
FastFieldAccessor::Multi(_) => None,
FastFieldAccessor::Single(reader) => Some(reader),
@@ -124,7 +118,7 @@ impl BucketAggregationWithAccessor {
pub struct MetricAggregationWithAccessor {
pub metric: MetricAggregation,
pub field_type: Type,
pub accessor: Arc<dyn Column>, pub accessor: DynamicFastFieldReader<u64>,
}
impl MetricAggregationWithAccessor {
@@ -140,8 +134,9 @@ impl MetricAggregationWithAccessor {
Ok(MetricAggregationWithAccessor {
accessor: accessor
.into_single() .as_single()
.expect("unexpected fast field cardinality"), .expect("unexpected fast field cardinality")
.clone(),
field_type,
metric: metric.clone(),
})
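The last hunk trades a consuming `into_single()` for `as_single()` plus a `clone()`. A minimal sketch of that pattern with stand-in types (not tantivy's), showing why the reader handle must be cheaply cloneable, for example an `Arc` or a lightweight codec wrapper:

```rust
use std::sync::Arc;

#[derive(Clone)]
enum FastFieldAccessor<R> {
    Multi(R),
    Single(R),
}

impl<R> FastFieldAccessor<R> {
    // Borrowing getter: the accessor can still be used afterwards.
    fn as_single(&self) -> Option<&R> {
        match self {
            FastFieldAccessor::Multi(_) => None,
            FastFieldAccessor::Single(reader) => Some(reader),
        }
    }
}

fn main() {
    let accessor = FastFieldAccessor::Single(Arc::new(vec![1u64, 2, 3]));
    // Cloning an `Arc` only bumps a reference count.
    let reader = accessor.as_single().expect("single cardinality").clone();
    assert_eq!(reader.len(), 3);
}
```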

View File

@@ -15,6 +15,7 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::fastfield::DynamicFastFieldReader;
use crate::schema::Type;
use crate::{DocId, TantivyError};
@@ -263,7 +264,7 @@ impl SegmentHistogramCollector {
req: &HistogramAggregation,
sub_aggregation: &AggregationsWithAccessor,
field_type: Type,
accessor: &dyn Column<u64>, accessor: &DynamicFastFieldReader<u64>,
) -> crate::Result<Self> {
req.validate()?;
let min = f64_from_fastfield_u64(accessor.min_value(), &field_type);

View File

@@ -1,6 +1,7 @@
use std::fmt::Debug;
use std::ops::Range;
use fastfield_codecs::Column;
use fnv::FnvHashMap;
use serde::{Deserialize, Serialize};

View File

@@ -4,6 +4,7 @@ use fastfield_codecs::Column;
use serde::{Deserialize, Serialize};
use crate::aggregation::f64_from_fastfield_u64;
use crate::fastfield::DynamicFastFieldReader;
use crate::schema::Type;
use crate::DocId;
@@ -57,7 +58,7 @@ impl SegmentAverageCollector {
data: Default::default(),
}
}
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) { pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val1 = field.get_val(docs[0] as u64);
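`collect_block` walks documents in blocks of four so the compiler can keep several independent fast-field loads in flight. A generic sketch of the same pattern, including the remainder handling that follows the truncated hunk (the `sum` update stands in for the real metric accumulation):

```rust
use fastfield_codecs::Column;

fn collect_block(docs: &[u32], field: &dyn Column<u64>, sum: &mut u64) {
    let mut chunks = docs.chunks_exact(4);
    for quad in chunks.by_ref() {
        // Four independent loads per iteration.
        let val1 = field.get_val(quad[0] as u64);
        let val2 = field.get_val(quad[1] as u64);
        let val3 = field.get_val(quad[2] as u64);
        let val4 = field.get_val(quad[3] as u64);
        *sum += val1 + val2 + val3 + val4;
    }
    // Up to three trailing documents.
    for &doc in chunks.remainder() {
        *sum += field.get_val(doc as u64);
    }
}
```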

View File

@@ -2,6 +2,7 @@ use fastfield_codecs::Column;
use serde::{Deserialize, Serialize};
use crate::aggregation::f64_from_fastfield_u64;
use crate::fastfield::DynamicFastFieldReader;
use crate::schema::Type;
use crate::{DocId, TantivyError};
@@ -163,7 +164,7 @@ impl SegmentStatsCollector {
stats: IntermediateStats::default(),
}
}
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) { pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val1 = field.get_val(docs[0] as u64);

View File

@@ -185,10 +185,10 @@ impl SegmentMetricResultCollector {
pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) {
match self {
SegmentMetricResultCollector::Average(avg_collector) => {
avg_collector.collect_block(doc, &*metric.accessor); avg_collector.collect_block(doc, &metric.accessor);
}
SegmentMetricResultCollector::Stats(stats_collector) => {
stats_collector.collect_block(doc, &*metric.accessor); stats_collector.collect_block(doc, &metric.accessor);
}
}
}

View File

@@ -10,12 +10,11 @@
// ---
// Importing tantivy...
use std::marker::PhantomData;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastValue; use crate::fastfield::{DynamicFastFieldReader, FastValue};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -161,7 +160,7 @@ where
TPredicate: 'static,
TPredicateValue: FastValue,
{
fast_field_reader: Arc<dyn Column<TPredicateValue>>, fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,

View File

@@ -1,10 +1,8 @@
use std::sync::Arc;
use fastdivide::DividerU64;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastValue; use crate::fastfield::{DynamicFastFieldReader, FastValue};
use crate::schema::{Field, Type};
use crate::{DocId, Score};
@@ -87,7 +85,7 @@ impl HistogramComputer {
}
pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer,
ff_reader: Arc<dyn Column<u64>>, ff_reader: DynamicFastFieldReader<u64>,
}
impl SegmentCollector for SegmentHistogramCollector {

View File

@@ -1,11 +1,9 @@
use std::sync::Arc;
use fastfield_codecs::Column;
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
use crate::core::SegmentReader;
use crate::fastfield::BytesFastFieldReader; use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader};
use crate::query::{AllQuery, QueryParser};
use crate::schema::{Field, Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
@@ -160,7 +158,7 @@ pub struct FastFieldTestCollector {
pub struct FastFieldSegmentCollector {
vals: Vec<u64>,
reader: Arc<dyn Column<u64>>, reader: DynamicFastFieldReader<u64>,
}
impl FastFieldTestCollector {

View File

@@ -1,7 +1,6 @@
use std::collections::BinaryHeap;
use std::fmt;
use std::marker::PhantomData;
use std::sync::Arc;
use fastfield_codecs::Column;
@@ -12,7 +11,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::fastfield::FastValue; use crate::fastfield::{DynamicFastFieldReader, FastValue};
use crate::query::Weight;
use crate::schema::Field;
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
@@ -132,7 +131,7 @@ impl fmt::Debug for TopDocs {
}
struct ScorerByFastFieldReader {
ff_reader: Arc<dyn Column<u64>>, ff_reader: DynamicFastFieldReader<u64>,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
@@ -410,6 +409,7 @@ impl TopDocs {
/// # use tantivy::query::QueryParser;
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::fastfield::FastFieldReader;
/// use tantivy::schema::Field;
///
/// fn create_schema() -> Schema {
@@ -458,7 +458,7 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get_val(doc as u64); /// let popularity: u64 = popularity_reader.get(doc);
/// // Well.. For the sake of the example we use a simple logarithm
/// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -517,6 +517,7 @@ impl TopDocs {
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::schema::Field;
/// use tantivy::fastfield::FastFieldReader;
///
/// # fn create_schema() -> Schema {
/// # let mut schema_builder = Schema::builder();
@@ -568,8 +569,8 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get_val(doc as u64); /// let popularity: u64 = popularity_reader.get(doc);
/// let boosted: u64 = boosted_reader.get_val(doc as u64); /// let boosted: u64 = boosted_reader.get(doc);
/// // Scores do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order
/// // for free.

View File

@@ -16,7 +16,7 @@ use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas; use crate::indexer::segment_updater::save_new_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -47,34 +47,6 @@ fn load_metas(
.map_err(From::from)
}
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// IndexBuilder can be used to create an index.
///
/// Use in conjunction with `SchemaBuilder`. Global index settings

View File

@@ -1,9 +1,7 @@
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::directory::{FileSlice, OwnedBytes};
use crate::fastfield::MultiValueLength; use crate::fastfield::{DynamicFastFieldReader, MultiValueLength};
use crate::DocId;
/// Reader for byte array fast fields
@@ -18,13 +16,13 @@ use crate::DocId;
/// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)]
pub struct BytesFastFieldReader {
idx_reader: Arc<dyn Column<u64>>, idx_reader: DynamicFastFieldReader<u64>,
values: OwnedBytes,
}
impl BytesFastFieldReader {
pub(crate) fn open(
idx_reader: Arc<dyn Column<u64>>, idx_reader: DynamicFastFieldReader<u64>,
values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?;
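As the doc comment above says, a document's bytes are recovered by reading its start index and the next document's start index and keeping the bytes in between. A standalone sketch of that lookup, using plain slices instead of the real reader types:

```rust
// `offsets[doc]` is the start of the document's data; `offsets[doc + 1]`
// is the start of the next document, i.e. this document's end.
fn bytes_for_doc<'a>(offsets: &[u64], values: &'a [u8], doc: usize) -> &'a [u8] {
    let start = offsets[doc] as usize;
    let end = offsets[doc + 1] as usize;
    &values[start..end]
}

fn main() {
    let values = b"foobarbaz";
    let offsets = [0u64, 3, 6, 9]; // 3 docs plus a trailing sentinel
    assert_eq!(bytes_for_doc(&offsets, values, 1), b"bar");
}
```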

View File

@@ -3,11 +3,20 @@ use std::num::NonZeroU64;
use common::BinarySerializable;
use fastdivide::DividerU64;
use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec}; use fastfield_codecs::{Column, FastFieldCodec};
use ownedbytes::OwnedBytes;
pub const GCD_DEFAULT: u64 = 1;
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec to read the data.
#[derive(Clone)]
pub struct GCDReader<CodecReader: Column> {
gcd_params: GCDParams,
reader: CodecReader,
}
#[derive(Debug, Clone, Copy)]
struct GCDParams {
gcd: u64,
@@ -15,6 +24,12 @@ struct GCDParams {
num_vals: u64,
}
impl GCDParams {
pub fn eval(&self, val: u64) -> u64 {
self.min_value + self.gcd * val
}
}
impl BinarySerializable for GCDParams {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.gcd.serialize(writer)?;
@@ -37,13 +52,31 @@ impl BinarySerializable for GCDParams {
pub fn open_gcd_from_bytes<WrappedCodec: FastFieldCodec>(
bytes: OwnedBytes,
) -> io::Result<impl Column> { ) -> io::Result<GCDReader<WrappedCodec::Reader>> {
let footer_offset = bytes.len() - 24;
let (body, mut footer) = bytes.split(footer_offset);
let gcd_params = GCDParams::deserialize(&mut footer)?;
let gcd_remap = move |val: u64| gcd_params.min_value + gcd_params.gcd * val;
let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?;
Ok(monotonic_map_column(reader, gcd_remap)) Ok(GCDReader { gcd_params, reader })
}
impl<C: Column + Clone> Column for GCDReader<C> {
#[inline]
fn get_val(&self, doc: u64) -> u64 {
let val = self.reader.get_val(doc);
self.gcd_params.eval(val)
}
fn min_value(&self) -> u64 {
self.gcd_params.eval(self.reader.min_value())
}
fn max_value(&self) -> u64 {
self.gcd_params.eval(self.reader.max_value())
}
fn num_vals(&self) -> u64 {
self.gcd_params.num_vals
}
} }
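Both variants in this hunk decode the same way: the codec stores `(val - min_value) / gcd` and `eval` (or the `gcd_remap` closure) restores `min_value + gcd * stored`. A self-contained round-trip sketch; the `encode` side is an assumption mirroring what the serializer presumably does, since only the decode formula appears in this diff:

```rust
// Assumed encode step: divide out the common divisor.
fn encode(vals: &[u64], min: u64, gcd: u64) -> Vec<u64> {
    vals.iter().map(|v| (v - min) / gcd).collect()
}

// The decode formula from `GCDParams::eval` above.
fn eval(min: u64, gcd: u64, stored: u64) -> u64 {
    min + gcd * stored
}

fn main() {
    let vals = [1_000u64, 2_000, 3_000];
    let (min, gcd) = (1_000u64, 1_000u64);
    let stored = encode(&vals, min, gcd); // [0, 1, 2]: far cheaper to bitpack
    for (v, s) in vals.iter().zip(&stored) {
        assert_eq!(eval(min, gcd, *s), *v);
    }
}
```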
pub fn write_gcd_header<W: Write>(
@@ -101,7 +134,6 @@ mod tests {
use std::collections::HashMap;
use std::num::NonZeroU64;
use std::path::Path;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use common::HasLen;
@@ -109,11 +141,11 @@ mod tests {
use crate::directory::{CompositeFile, RamDirectory, WritePtr};
use crate::fastfield::gcd::compute_gcd;
use crate::fastfield::reader::open_fast_field;
use crate::fastfield::serializer::FastFieldCodecEnableCheck;
use crate::fastfield::tests::{encode_decode_fast_field, FIELD, FIELDI64, SCHEMA, SCHEMAI64}; use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64};
use crate::fastfield::{
find_gcd, CompositeFastFieldSerializer, FastFieldCodecType, FastFieldsWriter, ALL_CODECS, find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecType,
FastFieldsWriter, ALL_CODECS,
};
use crate::schema::{Cardinality, Schema};
use crate::{DateOptions, DatePrecision, DateTime, Directory};
@@ -155,7 +187,8 @@ mod tests {
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader: Arc<dyn Column<i64>> = open_fast_field(file.read_bytes()?)?; let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), -4000i64);
assert_eq!(fast_field_reader.get_val(1), -3000i64);
assert_eq!(fast_field_reader.get_val(2), -2000i64);
@@ -177,7 +210,7 @@ mod tests {
#[test]
fn test_fastfield_gcd_i64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_i64_with_codec(code_type, 5500)?; test_fastfield_gcd_i64_with_codec(code_type, 5005)?;
}
Ok(())
}
@@ -196,7 +229,7 @@ mod tests {
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = open_fast_field::<u64>(file.read_bytes()?)?; let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), 1000u64);
assert_eq!(fast_field_reader.get_val(1), 2000u64);
assert_eq!(fast_field_reader.get_val(2), 3000u64);
@@ -218,14 +251,14 @@ mod tests {
#[test]
fn test_fastfield_gcd_u64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_u64_with_codec(code_type, 5500)?; test_fastfield_gcd_u64_with_codec(code_type, 5005)?;
}
Ok(())
}
#[test]
pub fn test_fastfield2() {
let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]); let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get_val(0), 100);
assert_eq!(test_fastfield.get_val(1), 200);
assert_eq!(test_fastfield.get_val(2), 300);
@@ -291,7 +324,7 @@ mod tests {
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let len = file.len();
let test_fastfield = open_fast_field::<DateTime>(file.read_bytes()?)?; let test_fastfield = DynamicFastFieldReader::<DateTime>::open(file)?;
assert_eq!(test_fastfield.get_val(0), time1.truncate(precision));
assert_eq!(test_fastfield.get_val(1), time2.truncate(precision));

View File

@@ -26,8 +26,9 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub(crate) use self::gcd::{find_gcd, GCD_DEFAULT}; pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT};
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::DynamicFastFieldReader;
pub use self::readers::FastFieldReaders;
pub(crate) use self::readers::{type_and_cardinality, FastType};
pub use self::serializer::{Column, CompositeFastFieldSerializer, FastFieldStats};
@@ -265,7 +266,6 @@ mod tests {
use std::collections::HashMap;
use std::ops::Range;
use std::path::Path;
use std::sync::Arc;
use common::HasLen;
use once_cell::sync::Lazy;
@@ -275,7 +275,6 @@ mod tests {
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fastfield::reader::open_fast_field;
use crate::merge_policy::NoMergePolicy;
use crate::schema::{Document, Field, Schema, FAST, STRING, TEXT};
use crate::time::OffsetDateTime;
@@ -296,51 +295,9 @@ mod tests {
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
pub static FIELDI64: Lazy<Field> = Lazy::new(|| SCHEMAI64.get_field("field").unwrap());
/// Encodes values using the most appropriate codec and then loads them
/// right away.
///
/// This is useful in tests and bench.
pub(crate) fn encode_decode_fast_field<Item: FastValue>(
vals: &[Item],
) -> Arc<dyn Column<Item>> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("__dummy__");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer_mut(field)
.expect("With a RamDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(path).expect("Failed to open the file");
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
let field_bytes = composite_file
.open_read(field)
.expect("File component not found")
.read_bytes()
.unwrap();
open_fast_field(field_bytes).unwrap()
}
#[test]
pub fn test_fastfield() {
let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]); let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get_val(0u64), 100);
assert_eq!(test_fastfield.get_val(1u64), 200);
assert_eq!(test_fastfield.get_val(2u64), 300);
@@ -371,8 +328,8 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 45);
let composite_file = CompositeFile::open(&file)?;
let fast_field_bytes = composite_file.open_read(*FIELD).unwrap().read_bytes()?; let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = open_fast_field::<u64>(fast_field_bytes)?; let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), 13u64);
assert_eq!(fast_field_reader.get_val(1), 14u64);
assert_eq!(fast_field_reader.get_val(2), 2u64);
@@ -403,11 +360,8 @@ mod tests {
assert_eq!(file.len(), 70);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite let data = fast_fields_composite.open_read(*FIELD).unwrap();
.open_read(*FIELD) let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
assert_eq!(fast_field_reader.get_val(0), 4u64);
assert_eq!(fast_field_reader.get_val(1), 14_082_001u64);
assert_eq!(fast_field_reader.get_val(2), 3_052u64);
@@ -442,11 +396,8 @@ mod tests {
assert_eq!(file.len(), 43);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite let data = fast_fields_composite.open_read(*FIELD).unwrap();
.open_read(*FIELD) let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get_val(doc), 100_000u64);
}
@@ -477,11 +428,8 @@ mod tests {
assert_eq!(file.len(), 80051);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite let data = fast_fields_composite.open_read(*FIELD).unwrap();
.open_read(*FIELD) let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
assert_eq!(fast_field_reader.get_val(0), 0u64);
for doc in 1..10_001 {
assert_eq!(
@@ -521,11 +469,8 @@ mod tests {
assert_eq!(file.len(), 75_usize); // linear interpol size after calc improvement
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite let data = fast_fields_composite.open_read(i64_field).unwrap();
.open_read(i64_field) let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<i64>(data)?;
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
@@ -564,11 +509,8 @@ mod tests {
let file = directory.open_read(path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite let data = fast_fields_composite.open_read(i64_field).unwrap();
.open_read(i64_field) let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<i64>(data)?;
assert_eq!(fast_field_reader.get_val(0), 0i64);
}
Ok(())
@@ -605,11 +547,8 @@ mod tests {
let file = directory.open_read(path)?;
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite let data = fast_fields_composite.open_read(*FIELD).unwrap();
.open_read(*FIELD) let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
for a in 0..n {
assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]);
@@ -668,7 +607,7 @@ mod tests {
let mut all = vec![];
for doc in docs {
let mut out: Vec<u64> = vec![]; let mut out = vec![];
ff.get_vals(doc, &mut out);
all.extend(out);
}
@@ -926,7 +865,7 @@ mod tests {
#[test]
pub fn test_fastfield_bool() {
let test_fastfield = encode_decode_fast_field::<bool>(&[true, false, true, false]); let test_fastfield = DynamicFastFieldReader::<bool>::from(vec![true, false, true, false]);
assert_eq!(test_fastfield.get_val(0), true);
assert_eq!(test_fastfield.get_val(1), false);
assert_eq!(test_fastfield.get_val(2), true);
@@ -959,8 +898,8 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 44);
let composite_file = CompositeFile::open(&file)?;
let data = composite_file.open_read(field).unwrap().read_bytes()?; let file = composite_file.open_read(field).unwrap();
let fast_field_reader = open_fast_field::<bool>(data)?; let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), true);
assert_eq!(fast_field_reader.get_val(1), false);
assert_eq!(fast_field_reader.get_val(2), true);
@@ -995,8 +934,8 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 56);
let composite_file = CompositeFile::open(&file)?;
let data = composite_file.open_read(field).unwrap().read_bytes()?; let file = composite_file.open_read(field).unwrap();
let fast_field_reader = open_fast_field::<bool>(data)?; let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
for i in 0..25 {
assert_eq!(fast_field_reader.get_val(i * 2), true);
assert_eq!(fast_field_reader.get_val(i * 2 + 1), false);
@@ -1029,8 +968,8 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 43);
let composite_file = CompositeFile::open(&file)?;
let data = composite_file.open_read(field).unwrap().read_bytes()?; let file = composite_file.open_read(field).unwrap();
let fast_field_reader = open_fast_field::<bool>(data)?; let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
assert_eq!(fast_field_reader.get_val(0), false);
Ok(())
@@ -1039,17 +978,32 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use std::sync::Arc; use std::collections::HashMap;
use std::path::Path;
use fastfield_codecs::Column;
use test::{self, Bencher};
use crate::fastfield::tests::{ use super::tests::{generate_permutation, FIELD, SCHEMA};
encode_decode_fast_field, generate_permutation, generate_permutation_gcd, use super::*;
}; use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fastfield::tests::generate_permutation_gcd;
use crate::fastfield::FastFieldReader;
#[bench]
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) { fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|v| v * 7) {
a ^= permutation[i as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
@@ -1061,82 +1015,103 @@ mod bench {
});
}
#[bench]
fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = column.get_val(a as u64);
}
a
});
}
#[bench]
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation(); let permutation = generate_permutation();
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation); let directory: RamDirectory = RamDirectory::create();
b.iter(|| { {
let n = test::black_box(7000u32); let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut a = 0u64; let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
for i in (0..n / 7).map(|val| val * 7) { let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
a += column.get_val(i as u64); for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
} }
a fast_field_writers
}); .serialize(&mut serializer, &HashMap::new(), None)
} .unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
#[bench] b.iter(|| {
fn bench_intfastfield_linear_vec(b: &mut Bencher) { let n = test::black_box(7000u32);
let permutation = generate_permutation(); let mut a = 0u64;
b.iter(|| { for i in (0u32..n / 7).map(|val| val * 7) {
let n = test::black_box(7000); a ^= fast_field_reader.get(i);
let mut a = 0u64; }
for i in (0..n / 7).map(|val| val * 7) { a
a += permutation[i]; });
} }
a
});
} }
#[bench]
fn bench_intfastfield_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation(); let permutation = generate_permutation();
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation); let directory: RamDirectory = RamDirectory::create();
b.iter(|| { {
let mut a = 0u64; let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
for i in 0u64..permutation.len() as u64 { let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
a = column.get_val(i); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
} }
a fast_field_writers
}); .serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get(i) as u32;
}
a
});
}
} }
#[bench]
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation_gcd(); let permutation = generate_permutation_gcd();
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation); let directory: RamDirectory = RamDirectory::create();
b.iter(|| { {
let mut a = 0u64; let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
for i in 0..permutation.len() as u64 { let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
a += column.get_val(i); let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
} }
a fast_field_writers
}); .serialize(&mut serializer, &HashMap::new(), None)
} .unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
#[bench] b.iter(|| {
fn bench_intfastfield_vec(b: &mut Bencher) { let mut a = 0u32;
let permutation = generate_permutation_gcd(); for i in 0u32..permutation.len() as u32 {
b.iter(|| { a = fast_field_reader.get(i) as u32;
let mut a = 0u64; }
for i in 0..permutation.len() { a
a += permutation[i as usize] as u64; });
} }
a
});
} }
} }

View File

@@ -346,7 +346,6 @@ mod tests {
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
}
}
#[test]
fn test_multivalued_proptest_gcd() {
use IndexingOp::*;

View File

@@ -1,9 +1,8 @@
use std::ops::Range;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::fastfield::{FastValue, MultiValueLength}; use crate::fastfield::{DynamicFastFieldReader, FastValue, MultiValueLength};
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -15,14 +14,14 @@ use crate::DocId;
/// The `idx_reader` associates, for each document, the index of its first value.
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: Arc<dyn Column<u64>>, idx_reader: DynamicFastFieldReader<u64>,
vals_reader: Arc<dyn Column<Item>>, vals_reader: DynamicFastFieldReader<Item>,
}
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: Arc<dyn Column<u64>>, idx_reader: DynamicFastFieldReader<u64>,
vals_reader: Arc<dyn Column<Item>>, vals_reader: DynamicFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
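Given the two readers above, resolving one document's values is two index lookups plus a scan, whichever concrete reader type is plugged in. A sketch of that access pattern (only `Column::get_val` is taken from this changeset; the helper itself is illustrative):

```rust
use fastfield_codecs::Column;

fn get_vals(
    idx_reader: &dyn Column<u64>,
    vals_reader: &dyn Column<u64>,
    doc: u64,
    out: &mut Vec<u64>,
) {
    // Start of this document's values and start of the next document's.
    let start = idx_reader.get_val(doc);
    let end = idx_reader.get_val(doc + 1);
    for pos in start..end {
        out.push(vals_reader.get_val(pos));
    }
}
```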

View File

@@ -1,65 +1,144 @@
use std::collections::HashMap;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::sync::Arc; use std::path::Path;
use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedCodec; use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader};
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader};
use fastfield_codecs::linear::LinearCodec; use fastfield_codecs::linear::{LinearCodec, LinearReader};
use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType}; use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType};
use super::gcd::open_gcd_from_bytes;
use super::FastValue;
use crate::directory::OwnedBytes; use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader};
use crate::schema::{Schema, FAST};
fn open_codec_from_bytes<C: FastFieldCodec, Item: FastValue>( #[derive(Clone)]
bytes: OwnedBytes, /// DynamicFastFieldReader wraps different readers to access
) -> crate::Result<Arc<dyn Column<Item>>> { /// the various encoded fastfield data
let reader = C::open_from_bytes(bytes)?; pub enum DynamicFastFieldReader<Item: FastValue> {
Ok(Arc::new(monotonic_map_column(reader, Item::from_u64))) /// Bitpacked compressed fastfield data.
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
/// Linear interpolated values + bitpacked
Linear(FastFieldReaderCodecWrapper<Item, LinearReader>),
/// Blockwise linear interpolated values + bitpacked
BlockwiseLinear(FastFieldReaderCodecWrapper<Item, BlockwiseLinearReader>),
/// GCD and Bitpacked compressed fastfield data.
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BitpackedReader>>),
/// GCD and Linear interpolated values + bitpacked
LinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<LinearReader>>),
/// GCD and Blockwise linear interpolated values + bitpacked
BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BlockwiseLinearReader>>),
} }
fn open_codec_with_gcd<C: FastFieldCodec, Item: FastValue>( impl<Item: FastValue> DynamicFastFieldReader<Item> {
bytes: OwnedBytes, /// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
) -> crate::Result<Arc<dyn Column<Item>>> { pub fn open_from_id(
let reader = open_gcd_from_bytes::<C>(bytes)?; mut bytes: OwnedBytes,
Ok(Arc::new(monotonic_map_column(reader, Item::from_u64))) codec_type: FastFieldCodecType,
} ) -> crate::Result<DynamicFastFieldReader<Item>> {
let reader = match codec_type {
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data. FastFieldCodecType::Bitpacked => {
fn open_from_id<Item: FastValue>( DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into())
mut bytes: OwnedBytes,
codec_type: FastFieldCodecType,
) -> crate::Result<Arc<dyn Column<Item>>> {
match codec_type {
FastFieldCodecType::Bitpacked => open_codec_from_bytes::<BitpackedCodec, _>(bytes),
FastFieldCodecType::Linear => open_codec_from_bytes::<LinearCodec, _>(bytes),
FastFieldCodecType::BlockwiseLinear => {
open_codec_from_bytes::<BlockwiseLinearCodec, _>(bytes)
}
FastFieldCodecType::Gcd => {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
match codec_type {
FastFieldCodecType::Bitpacked => open_codec_with_gcd::<BitpackedCodec, _>(bytes),
FastFieldCodecType::Linear => open_codec_with_gcd::<LinearCodec, _>(bytes),
FastFieldCodecType::BlockwiseLinear => {
open_codec_with_gcd::<BlockwiseLinearCodec, _>(bytes)
}
FastFieldCodecType::Gcd => Err(DataCorruption::comment_only(
"Gcd codec wrapped into another gcd codec. This combination is not allowed.",
)
.into()),
} }
} FastFieldCodecType::Linear => {
DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into())
}
FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear(
BlockwiseLinearCodec::open_from_bytes(bytes)?.into(),
),
FastFieldCodecType::Gcd => {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
match codec_type {
FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD(
open_gcd_from_bytes::<BitpackedCodec>(bytes)?.into(),
),
FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD(
open_gcd_from_bytes::<LinearCodec>(bytes)?.into(),
),
FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinearGCD(
open_gcd_from_bytes::<BlockwiseLinearCodec>(bytes)?.into(),
)
}
FastFieldCodecType::Gcd => {
return Err(DataCorruption::comment_only(
"Gcd codec wrapped into another gcd codec. This combination is not \
allowed.",
)
.into())
}
}
}
};
Ok(reader)
}
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
let mut bytes = file.read_bytes()?;
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
Self::open_from_id(bytes, codec_type)
} }
} }
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data. impl<Item: FastValue> Column<Item> for DynamicFastFieldReader<Item> {
pub fn open_fast_field<Item: FastValue>( #[inline]
mut bytes: OwnedBytes, fn get_val(&self, idx: u64) -> Item {
) -> crate::Result<Arc<dyn Column<Item>>> { match self {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?; Self::Bitpacked(reader) => reader.get_val(idx),
open_from_id(bytes, codec_type) Self::Linear(reader) => reader.get_val(idx),
Self::BlockwiseLinear(reader) => reader.get_val(idx),
Self::BitpackedGCD(reader) => reader.get_val(idx),
Self::LinearGCD(reader) => reader.get_val(idx),
Self::BlockwiseLinearGCD(reader) => reader.get_val(idx),
}
}
#[inline]
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::Linear(reader) => reader.get_range(start, output),
Self::BlockwiseLinear(reader) => reader.get_range(start, output),
Self::BitpackedGCD(reader) => reader.get_range(start, output),
Self::LinearGCD(reader) => reader.get_range(start, output),
Self::BlockwiseLinearGCD(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.min_value(),
Self::Linear(reader) => reader.min_value(),
Self::BlockwiseLinear(reader) => reader.min_value(),
Self::BitpackedGCD(reader) => reader.min_value(),
Self::LinearGCD(reader) => reader.min_value(),
Self::BlockwiseLinearGCD(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.max_value(),
Self::Linear(reader) => reader.max_value(),
Self::BlockwiseLinear(reader) => reader.max_value(),
Self::BitpackedGCD(reader) => reader.max_value(),
Self::LinearGCD(reader) => reader.max_value(),
Self::BlockwiseLinearGCD(reader) => reader.max_value(),
}
}
fn num_vals(&self) -> u64 {
match self {
Self::Bitpacked(reader) => reader.num_vals(),
Self::Linear(reader) => reader.num_vals(),
Self::BlockwiseLinear(reader) => reader.num_vals(),
Self::BitpackedGCD(reader) => reader.num_vals(),
Self::LinearGCD(reader) => reader.num_vals(),
Self::BlockwiseLinearGCD(reader) => reader.num_vals(),
}
}
} }
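For orientation: `open` reads the codec id from the header, dispatches to one statically typed wrapper per codec, and everything downstream goes through the `Column` trait. A hedged usage sketch, assuming a `FileSlice` that holds exactly one serialized fast field (header followed by codec payload):

use fastfield_codecs::Column;
use tantivy::directory::FileSlice;
use tantivy::fastfield::DynamicFastFieldReader;

// Sketch only: collect every value of one serialized u64 fast field.
fn read_all(fast_field_slice: FileSlice) -> tantivy::Result<Vec<u64>> {
    let reader = DynamicFastFieldReader::<u64>::open(fast_field_slice)?;
    // The enum implements Column<u64>, so callers don't care which codec
    // was picked at serialization time.
    let mut vals = Vec::with_capacity(reader.num_vals() as usize);
    for idx in 0..reader.num_vals() {
        vals.push(reader.get_val(idx));
    }
    Ok(vals)
}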
/// Wrapper for accessing a fastfield.
@@ -160,3 +239,40 @@ impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWr
self.reader.num_vals() self.reader.num_vals()
} }
} }
impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("__dummy__");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer_mut(field)
.expect("With a RamDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(path).expect("Failed to open the file");
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
let field_file = composite_file
.open_read(field)
.expect("File component not found");
DynamicFastFieldReader::open(field_file).unwrap()
}
}
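This `From<Vec<Item>>` impl exists so tests and benchmarks can conjure a reader without hand-rolling the serializer plumbing. A hedged sketch of the intended test-side usage:

use fastfield_codecs::Column;
use tantivy::fastfield::DynamicFastFieldReader;

#[test]
fn from_vec_roundtrips_through_a_ram_directory() {
    // The impl serializes into a RamDirectory, then reopens the bytes with
    // whichever codec the auto-detection selected.
    let reader = DynamicFastFieldReader::from(vec![10u64, 20, 30]);
    assert_eq!(reader.num_vals(), 3);
    assert_eq!(reader.get_val(1), 20u64);
}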

View File

@@ -1,9 +1,5 @@
use std::sync::Arc; use super::reader::DynamicFastFieldReader;
use fastfield_codecs::Column;
use crate::directory::{CompositeFile, FileSlice}; use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::reader::open_fast_field;
use crate::fastfield::{ use crate::fastfield::{
BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader, BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader,
}; };
@@ -113,16 +109,14 @@ impl FastFieldReaders {
&self, &self,
field: Field, field: Field,
index: usize, index: usize,
) -> crate::Result<Arc<dyn Column<TFastValue>>> { ) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, index)?; let fast_field_slice = self.fast_field_data(field, index)?;
let bytes = fast_field_slice.read_bytes()?; DynamicFastFieldReader::open(fast_field_slice)
open_fast_field(bytes)
} }
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>( pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self, &self,
field: Field, field: Field,
) -> crate::Result<Arc<dyn Column<TFastValue>>> { ) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
self.typed_fast_field_reader_with_idx(field, 0) self.typed_fast_field_reader_with_idx(field, 0)
} }
@@ -138,7 +132,7 @@ impl FastFieldReaders {
/// Returns the `u64` fast field reader associated to `field`. /// Returns the `u64` fast field reader associated to `field`.
/// ///
/// If `field` is not a u64 fast field, this method returns an Error. /// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> { pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
self.check_type(field, FastType::U64, Cardinality::SingleValue)?; self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -148,14 +142,14 @@ impl FastFieldReaders {
/// ///
/// If not, the fastfield reader will return the u64 value associated with the original /// If not, the fastfield reader will return the u64 value associated with the original
/// FastValue. /// FastValue.
pub fn u64_lenient(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> { pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
/// Returns the `i64` fast field reader associated to `field`. /// Returns the `i64` fast field reader associated to `field`.
/// ///
/// If `field` is not an i64 fast field, this method returns an Error. /// If `field` is not an i64 fast field, this method returns an Error.
pub fn i64(&self, field: Field) -> crate::Result<Arc<dyn Column<i64>>> { pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> {
self.check_type(field, FastType::I64, Cardinality::SingleValue)?; self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -163,7 +157,7 @@ impl FastFieldReaders {
/// Returns the `date` fast field reader associated to `field`. /// Returns the `date` fast field reader associated to `field`.
/// ///
/// If `field` is not a date fast field, this method returns an Error. /// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<Arc<dyn Column<DateTime>>> { pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<DateTime>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?; self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -171,7 +165,7 @@ impl FastFieldReaders {
/// Returns the `f64` fast field reader associated to `field`. /// Returns the `f64` fast field reader associated to `field`.
/// ///
/// If `field` is not an f64 fast field, this method returns an Error. /// If `field` is not an f64 fast field, this method returns an Error.
pub fn f64(&self, field: Field) -> crate::Result<Arc<dyn Column<f64>>> { pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?; self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -179,7 +173,7 @@ impl FastFieldReaders {
/// Returns the `bool` fast field reader associated to `field`. /// Returns the `bool` fast field reader associated to `field`.
/// ///
/// If `field` is not a bool fast field, this method returns an Error. /// If `field` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field: Field) -> crate::Result<Arc<dyn Column<bool>>> { pub fn bool(&self, field: Field) -> crate::Result<DynamicFastFieldReader<bool>> {
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?; self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -247,8 +241,7 @@ impl FastFieldReaders {
))); )));
} }
let fast_field_idx_file = self.fast_field_data(field, 0)?; let fast_field_idx_file = self.fast_field_data(field, 0)?;
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?; let idx_reader = DynamicFastFieldReader::open(fast_field_idx_file)?;
let idx_reader = open_fast_field(fast_field_idx_bytes)?;
let data = self.fast_field_data(field, 1)?; let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data) BytesFastFieldReader::open(idx_reader, data)
} else { } else {

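After this change every typed accessor hands back a concrete `DynamicFastFieldReader<T>` instead of an `Arc<dyn Column<T>>`; call sites keep compiling because both expose the `Column` API. A hedged sketch of a typical consumer:

use fastfield_codecs::Column;
use tantivy::schema::Field;
use tantivy::SegmentReader;

// Sketch: sum a single-valued u64 fast field over a whole segment.
fn sum_fast_field(segment_reader: &SegmentReader, field: Field) -> tantivy::Result<u64> {
    let ff_reader = segment_reader.fast_fields().u64(field)?;
    Ok((0..ff_reader.num_vals()).map(|idx| ff_reader.get_val(idx)).sum())
}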
View File

@@ -6,7 +6,7 @@ use fastdivide::DividerU64;
pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy}; pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy};
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec; use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{monotonic_map_column, FastFieldCodecType}; use fastfield_codecs::FastFieldCodecType;
pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats}; pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats};
use super::{find_gcd, ALL_CODECS, GCD_DEFAULT}; use super::{find_gcd, ALL_CODECS, GCD_DEFAULT};
@@ -64,8 +64,8 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
// https://github.com/rust-lang/rust/pull/86176 // https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<C: FastFieldCodec, D: Column>( fn codec_estimation<C: FastFieldCodec>(
fastfield_accessor: &D, fastfield_accessor: &impl Column,
estimations: &mut Vec<(f32, FastFieldCodecType)>, estimations: &mut Vec<(f32, FastFieldCodecType)>,
) { ) {
if let Some(ratio) = C::estimate(fastfield_accessor) { if let Some(ratio) = C::estimate(fastfield_accessor) {
@@ -136,22 +136,56 @@ impl CompositeFastFieldSerializer {
} }
Self::write_header(field_write, FastFieldCodecType::Gcd)?; Self::write_header(field_write, FastFieldCodecType::Gcd)?;
struct GCDWrappedFFAccess<T: Column> {
fastfield_accessor: T,
base_value: u64,
max_value: u64,
num_vals: u64,
gcd: DividerU64,
}
impl<T: Column> Column for GCDWrappedFFAccess<T> {
fn get_val(&self, position: u64) -> u64 {
self.gcd
.divide(self.fastfield_accessor.get_val(position) - self.base_value)
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(
self.fastfield_accessor
.iter()
.map(|val| self.gcd.divide(val - self.base_value)),
)
}
fn min_value(&self) -> u64 {
0
}
fn max_value(&self) -> u64 {
self.max_value
}
fn num_vals(&self) -> u64 {
self.num_vals
}
}
--- old:
        let base_value = fastfield_accessor.min_value();
        let gcd_divider = DividerU64::divide_by(gcd);
        let divided_fastfield_accessor = monotonic_map_column(fastfield_accessor, |val: u64| {
            gcd_divider.divide(val - base_value)
        });
        let num_vals = divided_fastfield_accessor.num_vals();
+++ new:
        let num_vals = fastfield_accessor.num_vals();
        let base_value = fastfield_accessor.min_value();
        let max_value = (fastfield_accessor.max_value() - fastfield_accessor.min_value()) / gcd;
        let fastfield_accessor = GCDWrappedFFAccess {
            fastfield_accessor,
            base_value,
            max_value,
            num_vals,
            gcd: DividerU64::divide_by(gcd),
        };
Self::create_auto_detect_u64_fast_field_with_idx_gcd( Self::create_auto_detect_u64_fast_field_with_idx_gcd(
self.codec_enable_checker.clone(), self.codec_enable_checker.clone(),
field, field,
field_write, field_write,
divided_fastfield_accessor, fastfield_accessor,
)?; )?;
write_gcd_header(field_write, base_value, gcd, num_vals)?; write_gcd_header(field_write, base_value, gcd, num_vals)?;
Ok(()) Ok(())
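The wrapper only changes what the inner codec sees: each value is shifted by the minimum and divided by the GCD, and the header keeps `(base_value, gcd, num_vals)` so the read path can invert the mapping. In plain arithmetic (the concrete values here are illustrative):

fn main() {
    let vals = [1000u64, 1100, 1900];
    let base_value = *vals.iter().min().unwrap(); // 1000
    let gcd = 100u64; // gcd over (val - base_value); found by find_gcd in tantivy
    // Write side: what GCDWrappedFFAccess exposes to the inner codec.
    let stored: Vec<u64> = vals.iter().map(|v| (v - base_value) / gcd).collect();
    assert_eq!(stored, vec![0, 1, 9]);
    // Read side: the GCD reader undoes the mapping.
    let decoded: Vec<u64> = stored.iter().map(|s| s * gcd + base_value).collect();
    assert_eq!(decoded, vals);
}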
@@ -168,13 +202,13 @@ impl CompositeFastFieldSerializer {
let mut estimations = vec![]; let mut estimations = vec![];
if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) { if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations); codec_estimation::<BitpackedCodec>(&fastfield_accessor, &mut estimations);
} }
if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) { if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations); codec_estimation::<LinearCodec>(&fastfield_accessor, &mut estimations);
} }
if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) { if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations); codec_estimation::<BlockwiseLinearCodec>(&fastfield_accessor, &mut estimations);
} }
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{ {

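For context, `codec_estimation` pushes one `(estimated_ratio, codec_type)` pair per enabled codec, and the serializer then keeps the smallest ratio after discarding NaN estimates, as the hunk above shows. A condensed sketch of that selection, under the assumption that a lower ratio means a smaller serialized output:

use fastfield_codecs::FastFieldCodecType;

fn pick_codec(mut estimations: Vec<(f32, FastFieldCodecType)>) -> Option<FastFieldCodecType> {
    // Broken estimations (NaN) are dropped, mirroring the check above.
    estimations.retain(|(ratio, _)| !ratio.is_nan());
    estimations
        .into_iter()
        .min_by(|(a, _), (b, _)| a.partial_cmp(b).expect("NaNs removed"))
        .map(|(_, codec_type)| codec_type)
}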
View File

@@ -143,6 +143,8 @@ pub(crate) fn get_doc_id_mapping_from_field(
#[cfg(test)] #[cfg(test)]
mod tests_indexsorting { mod tests_indexsorting {
use fastfield_codecs::Column;
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::query::QueryParser; use crate::query::QueryParser;

View File

@@ -174,7 +174,9 @@ fn index_documents(
segment_updater: &mut SegmentUpdater, segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor, mut delete_cursor: DeleteCursor,
) -> crate::Result<()> { ) -> crate::Result<()> {
--- old:
    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
+++ new:
    let schema = segment.schema();
    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
for document_group in grouped_document_iterator { for document_group in grouped_document_iterator {
for doc in document_group { for doc in document_group {
segment_writer.add_document(doc)?; segment_writer.add_document(doc)?;
@@ -775,6 +777,7 @@ impl Drop for IndexWriter {
mod tests { mod tests {
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use fastfield_codecs::Column;
use proptest::prelude::*; use proptest::prelude::*;
use proptest::prop_oneof; use proptest::prop_oneof;
use proptest::strategy::Strategy; use proptest::strategy::Strategy;

View File

@@ -9,8 +9,8 @@ use crate::core::{Segment, SegmentReader};
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption; use crate::error::DataCorruption;
--- old:
use crate::fastfield::{
    AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldStats, MultiValueLength,
    MultiValuedFastFieldReader,
};
+++ new:
use crate::fastfield::{
    AliveBitSet, Column, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldStats,
    MultiValueLength, MultiValuedFastFieldReader,
};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
@@ -87,7 +87,7 @@ pub struct IndexMerger {
} }
fn compute_min_max_val( fn compute_min_max_val(
u64_reader: &dyn Column<u64>, u64_reader: &impl Column<u64>,
segment_reader: &SegmentReader, segment_reader: &SegmentReader,
) -> Option<(u64, u64)> { ) -> Option<(u64, u64)> {
if segment_reader.max_doc() == 0 { if segment_reader.max_doc() == 0 {
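This hunk only changes the argument from dynamic to static dispatch (`&dyn Column<u64>` to `&impl Column<u64>`). For intuition, a minimal sketch of the no-deletes fast path such a function can take, since a `Column` already knows its own bounds:

use fastfield_codecs::Column;

// Sketch: with no deleted documents, min/max are O(1) column metadata;
// the real compute_min_max_val falls back to scanning alive docs otherwise.
fn min_max_no_deletes(u64_reader: &impl Column<u64>) -> Option<(u64, u64)> {
    if u64_reader.num_vals() == 0 {
        return None;
    }
    Some((u64_reader.min_value(), u64_reader.max_value()))
}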
@@ -133,7 +133,7 @@ impl TermOrdinalMapping {
fn max_term_ord(&self) -> TermOrdinal { fn max_term_ord(&self) -> TermOrdinal {
self.per_segment_new_term_ordinals self.per_segment_new_term_ordinals
.iter() .iter()
.flat_map(|term_ordinals| term_ordinals.iter().max().cloned()) .flat_map(|term_ordinals| term_ordinals.iter().max())
.max() .max()
.unwrap_or_default() .unwrap_or_default()
} }
@@ -341,12 +341,12 @@ impl IndexMerger {
.readers .readers
.iter() .iter()
.filter_map(|reader| { .filter_map(|reader| {
let u64_reader: Arc<dyn Column<u64>> = let u64_reader: DynamicFastFieldReader<u64> =
reader.fast_fields().typed_fast_field_reader(field).expect( reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \ "Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.", it should never happen.",
); );
compute_min_max_val(&*u64_reader, reader) compute_min_max_val(&u64_reader, reader)
}) })
.reduce(|a, b| (a.0.min(b.0), a.1.max(b.1))) .reduce(|a, b| (a.0.min(b.0), a.1.max(b.1)))
.expect("Unexpected error, empty readers in IndexMerger"); .expect("Unexpected error, empty readers in IndexMerger");
@@ -355,7 +355,7 @@ impl IndexMerger {
.readers .readers
.iter() .iter()
.map(|reader| { .map(|reader| {
let u64_reader: Arc<dyn Column<u64>> = let u64_reader: DynamicFastFieldReader<u64> =
reader.fast_fields().typed_fast_field_reader(field).expect( reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \ "Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.", it should never happen.",
@@ -372,7 +372,7 @@ impl IndexMerger {
#[derive(Clone)] #[derive(Clone)]
struct SortedDocIdFieldAccessProvider<'a> { struct SortedDocIdFieldAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocIdMapping, doc_id_mapping: &'a SegmentDocIdMapping,
fast_field_readers: &'a Vec<Arc<dyn Column<u64>>>, fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
stats: FastFieldStats, stats: FastFieldStats,
} }
impl<'a> Column for SortedDocIdFieldAccessProvider<'a> { impl<'a> Column for SortedDocIdFieldAccessProvider<'a> {
@@ -443,7 +443,7 @@ impl IndexMerger {
pub(crate) fn get_sort_field_accessor( pub(crate) fn get_sort_field_accessor(
reader: &SegmentReader, reader: &SegmentReader,
sort_by_field: &IndexSortByField, sort_by_field: &IndexSortByField,
) -> crate::Result<Arc<dyn Column>> { ) -> crate::Result<impl Column> {
let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?; let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Ok(value_accessor) Ok(value_accessor)
@@ -452,7 +452,7 @@ impl IndexMerger {
pub(crate) fn get_reader_with_sort_field_accessor( pub(crate) fn get_reader_with_sort_field_accessor(
&self, &self,
sort_by_field: &IndexSortByField, sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(SegmentOrdinal, Arc<dyn Column>)>> { ) -> crate::Result<Vec<(SegmentOrdinal, impl Column)>> {
let reader_ordinal_and_field_accessors = self let reader_ordinal_and_field_accessors = self
.readers .readers
.iter() .iter()
@@ -618,7 +618,7 @@ impl IndexMerger {
.map(|reader| { .map(|reader| {
let u64s_reader: MultiValuedFastFieldReader<u64> = reader let u64s_reader: MultiValuedFastFieldReader<u64> = reader
.fast_fields() .fast_fields()
.typed_fast_field_multi_reader::<u64>(field) .typed_fast_field_multi_reader(field)
.expect( .expect(
"Failed to find index for multivalued field. This is a bug in tantivy, \ "Failed to find index for multivalued field. This is a bug in tantivy, \
please report.", please report.",
@@ -668,7 +668,7 @@ impl IndexMerger {
{ {
let mut serialize_vals = let mut serialize_vals =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?; fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
let mut vals: Vec<u64> = Vec::with_capacity(100); let mut vals = Vec::with_capacity(100);
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() { for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let term_ordinal_mapping: &[TermOrdinal] = let term_ordinal_mapping: &[TermOrdinal] =
@@ -742,7 +742,7 @@ impl IndexMerger {
for reader in &self.readers { for reader in &self.readers {
let ff_reader: MultiValuedFastFieldReader<u64> = reader let ff_reader: MultiValuedFastFieldReader<u64> = reader
.fast_fields() .fast_fields()
.typed_fast_field_multi_reader::<u64>(field) .typed_fast_field_multi_reader(field)
.expect( .expect(
"Failed to find multivalued fast field reader. This is a bug in tantivy. \ "Failed to find multivalued fast field reader. This is a bug in tantivy. \
Please report.", Please report.",
@@ -784,7 +784,7 @@ impl IndexMerger {
let new_doc_id: DocId = let new_doc_id: DocId =
self.offsets self.offsets
.iter() .iter()
.position(|&offset| offset > pos) .position(|offset| offset > pos)
.expect("pos is out of bounds") as DocId .expect("pos is out of bounds") as DocId
- 1u32; - 1u32;
@@ -1199,6 +1199,7 @@ impl IndexMerger {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use byteorder::{BigEndian, ReadBytesExt}; use byteorder::{BigEndian, ReadBytesExt};
use fastfield_codecs::Column;
use schema::FAST; use schema::FAST;
use crate::collector::tests::{ use crate::collector::tests::{

View File

@@ -1,5 +1,7 @@
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use fastfield_codecs::Column;
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::core::Index; use crate::core::Index;
use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader}; use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
@@ -478,12 +480,11 @@ mod tests {
#[cfg(all(test, feature = "unstable"))] #[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge { mod bench_sorted_index_merge {
use std::sync::Arc;
use fastfield_codecs::Column;
use test::{self, Bencher}; use test::{self, Bencher};
use crate::core::Index; use crate::core::Index;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::indexer::merger::IndexMerger; use crate::indexer::merger::IndexMerger;
use crate::schema::{Cardinality, NumericOptions, Schema}; use crate::schema::{Cardinality, NumericOptions, Schema};
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order}; use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
@@ -535,7 +536,7 @@ mod bench_sorted_index_merge {
b.iter(|| { b.iter(|| {
let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
let reader = &merger.readers[doc_addr.segment_ord as usize]; let reader = &merger.readers[doc_addr.segment_ord as usize];
let u64_reader: Arc<dyn Column<u64>> = let u64_reader: DynamicFastFieldReader<u64> =
reader.fast_fields().typed_fast_field_reader(field).expect( reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \ "Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.", it should never happen.",
@@ -545,7 +546,7 @@ mod bench_sorted_index_merge {
// add values in order of the new doc_ids // add values in order of the new doc_ids
let mut val = 0; let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids { for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get_val(doc_id as u64); val = field_reader.get(doc_id);
} }
val val

View File

@@ -25,10 +25,39 @@ use crate::indexer::{
DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry, DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
SegmentSerializer, SegmentSerializer,
}; };
use crate::schema::Schema;
use crate::{FutureResult, Opstamp}; use crate::{FutureResult, Opstamp};
const NUM_MERGE_THREADS: usize = 4; const NUM_MERGE_THREADS: usize = 4;
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// Save the index meta file. /// Save the index meta file.
/// This operation is atomic: /// This operation is atomic:
/// Either /// Either
@@ -38,7 +67,7 @@ const NUM_MERGE_THREADS: usize = 4;
/// and flushed. /// and flushed.
/// ///
/// This method is not part of tantivy's public API /// This method is not part of tantivy's public API
pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> { fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
info!("save metas"); info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?; let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer. // Just adding a new line at the end of the buffer.

View File

@@ -80,8 +80,8 @@ impl SegmentWriter {
pub fn for_segment( pub fn for_segment(
memory_budget_in_bytes: usize, memory_budget_in_bytes: usize,
segment: Segment, segment: Segment,
schema: Schema,
) -> crate::Result<SegmentWriter> { ) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone(); let tokenizer_manager = segment.index().tokenizers().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?; let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?; let segment_serializer = SegmentSerializer::for_segment(segment, false)?;

View File

@@ -421,6 +421,7 @@ pub struct DocAddress {
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {
use common::{BinarySerializable, FixedSize}; use common::{BinarySerializable, FixedSize};
use fastfield_codecs::Column;
use rand::distributions::{Bernoulli, Uniform}; use rand::distributions::{Bernoulli, Uniform};
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng}; use rand::{Rng, SeedableRng};

View File

@@ -227,7 +227,7 @@ pub mod tests {
{ {
let mut segment_writer = let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap(); SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap();
{ {
// checking that position works if the field has two values // checking that position works if the field has two values
let op = AddOperation { let op = AddOperation {

View File

@@ -339,7 +339,7 @@ impl StoreReader {
async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult<Block> { async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult<Block> {
let cache_key = checkpoint.byte_range.start; let cache_key = checkpoint.byte_range.start;
if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) { if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) {
return Ok(block); return Ok(block.clone());
} }
let compressed_block = self let compressed_block = self

View File

@@ -172,7 +172,8 @@ where TValueReader: value::ValueReader
} }
pub fn suffix(&self) -> &[u8] { pub fn suffix(&self) -> &[u8] {
--- old:
        self.block_reader
            .buffer_from_to(self.suffix_start, self.suffix_end)
+++ new:
        &self
            .block_reader
            .buffer_from_to(self.suffix_start, self.suffix_end)
} }

View File

@@ -50,7 +50,7 @@ pub struct SSTableIndexBuilder {
/// matches `left <= left' < right`. /// matches `left <= left' < right`.
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) { fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
assert!(&left[..] < right); assert!(&left[..] < right);
let common_len = common_prefix_len(left, right); let common_len = common_prefix_len(&left, right);
if left.len() == common_len { if left.len() == common_len {
return; return;
} }
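To make the contract concrete: given the precondition `left < right`, the helper may shrink `left` to any shorter byte string that still sorts at or above the old `left` and strictly below `right`, typically the shared prefix plus one incremented byte. An illustrative example of the invariant (not necessarily the exact key the algorithm produces):

fn main() {
    let left = b"automobile".to_vec();
    let right = b"autumn".to_vec();
    // Common prefix "aut" has length 3; bumping the next byte 'o' -> 'p'
    // yields a 4-byte key that still separates the two blocks.
    let shortened = b"autp".to_vec();
    assert!(left < shortened && shortened < right);
    assert!(shortened.len() < left.len());
}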