mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-12 20:12:54 +00:00
Compare commits
7 Commits
refact-dyn
...
column-tra
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4a072e3c18 | ||
|
|
84e0c75598 | ||
|
|
08c4412d73 | ||
|
|
70e58adff9 | ||
|
|
0d1cd119e9 | ||
|
|
d3dd620048 | ||
|
|
e89c220b56 |
@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
|
||||
pub struct AntiCallToken(());
|
||||
|
||||
/// Trait used to indicate when no more write need to be done on a writer
|
||||
pub trait TerminatingWrite: Write + Send {
|
||||
pub trait TerminatingWrite: Write + Send + Sync {
|
||||
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
|
||||
fn terminate(mut self) -> io::Result<()>
|
||||
where Self: Sized {
|
||||
|
||||
@@ -7,12 +7,11 @@
|
||||
// Of course, you can have a look at the tantivy's built-in collectors
|
||||
// such as the `CountCollector` for more examples.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::fastfield::DynamicFastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::{doc, Index, Score, SegmentReader};
|
||||
@@ -97,7 +96,7 @@ impl Collector for StatsCollector {
|
||||
}
|
||||
|
||||
struct StatsSegmentCollector {
|
||||
fast_field_reader: Arc<dyn Column<u64>>,
|
||||
fast_field_reader: DynamicFastFieldReader<u64>,
|
||||
stats: Stats,
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::cmp::Reverse;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Field, Schema, FAST, TEXT};
|
||||
|
||||
@@ -289,6 +289,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
#[allow(clippy::question_mark)]
|
||||
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
|
||||
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
|
||||
return None;
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::marker::PhantomData;
|
||||
|
||||
pub trait Column<T = u64> {
|
||||
/// Return the value associated to the given idx.
|
||||
///
|
||||
@@ -44,103 +42,8 @@ pub trait Column<T = u64> {
|
||||
fn max_value(&self) -> T;
|
||||
|
||||
fn num_vals(&self) -> u64;
|
||||
|
||||
/// Returns a iterator over the data
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
|
||||
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
|
||||
}
|
||||
}
|
||||
|
||||
struct VecColumn<'a>(&'a [u64]);
|
||||
impl<'a> Column for VecColumn<'a> {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self.0[position as usize]
|
||||
}
|
||||
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new(self.0.iter().cloned())
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
self.0.iter().min().cloned().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.0.iter().max().cloned().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.0.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u64]> for VecColumn<'a> {
|
||||
fn from(data: &'a [u64]) -> Self {
|
||||
Self(data)
|
||||
}
|
||||
}
|
||||
|
||||
struct MonotonicMappingColumn<C, T, Input> {
|
||||
from_column: C,
|
||||
monotonic_mapping: T,
|
||||
_phantom: PhantomData<Input>,
|
||||
}
|
||||
|
||||
/// Creates a view of a column transformed by a monotonic mapping.
|
||||
pub fn monotonic_map_column<C, T, Input, Output>(
|
||||
from_column: C,
|
||||
monotonic_mapping: T,
|
||||
) -> impl Column<Output>
|
||||
where
|
||||
C: Column<Input>,
|
||||
T: Fn(Input) -> Output,
|
||||
{
|
||||
MonotonicMappingColumn {
|
||||
from_column,
|
||||
monotonic_mapping,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
|
||||
where
|
||||
C: Column<Input>,
|
||||
T: Fn(Input) -> Output,
|
||||
{
|
||||
fn get_val(&self, idx: u64) -> Output {
|
||||
let from_val = self.from_column.get_val(idx);
|
||||
(self.monotonic_mapping)(from_val)
|
||||
}
|
||||
|
||||
fn min_value(&self) -> Output {
|
||||
let from_min_value = self.from_column.min_value();
|
||||
(self.monotonic_mapping)(from_min_value)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> Output {
|
||||
let from_max_value = self.from_column.max_value();
|
||||
(self.monotonic_mapping)(from_max_value)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.from_column.num_vals()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_monotonic_mapping() {
|
||||
let vals = &[1u64, 3u64][..];
|
||||
let col = VecColumn::from(vals);
|
||||
let mapped = monotonic_map_column(col, |el| el + 4);
|
||||
assert_eq!(mapped.min_value(), 5u64);
|
||||
assert_eq!(mapped.max_value(), 7u64);
|
||||
assert_eq!(mapped.num_vals(), 2);
|
||||
assert_eq!(mapped.num_vals(), 2);
|
||||
assert_eq!(mapped.get_val(0), 5);
|
||||
assert_eq!(mapped.get_val(0), 7);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ pub mod linear;
|
||||
|
||||
mod column;
|
||||
|
||||
pub use self::column::{monotonic_map_column, Column};
|
||||
pub use self::column::Column;
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||
#[repr(u8)]
|
||||
@@ -56,12 +56,12 @@ impl FastFieldCodecType {
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldCodec: 'static {
|
||||
pub trait FastFieldCodec {
|
||||
/// A codex needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
const CODEC_TYPE: FastFieldCodecType;
|
||||
|
||||
type Reader: Column<u64> + 'static;
|
||||
type Reader: Column<u64>;
|
||||
|
||||
/// Reads the metadata and returns the CodecReader
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
|
||||
@@ -90,10 +90,41 @@ pub struct FastFieldStats {
|
||||
pub num_vals: u64,
|
||||
}
|
||||
|
||||
struct VecColum<'a>(&'a [u64]);
|
||||
impl<'a> Column for VecColum<'a> {
|
||||
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self.0[position as usize]
|
||||
}
|
||||
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new(self.0.iter().cloned())
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
self.0.iter().min().cloned().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.0.iter().max().cloned().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.0.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u64]> for VecColum<'a> {
|
||||
fn from(data: &'a [u64]) -> Self {
|
||||
Self(data)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use proptest::arbitrary::any;
|
||||
use proptest::proptest;
|
||||
use proptest::prelude::*;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
|
||||
use crate::bitpacked::BitpackedCodec;
|
||||
use crate::blockwise_linear::BlockwiseLinearCodec;
|
||||
@@ -124,21 +155,32 @@ mod tests {
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(100))]
|
||||
#[test]
|
||||
fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
|
||||
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
|
||||
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
|
||||
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
|
||||
fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
|
||||
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
|
||||
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
|
||||
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(10))]
|
||||
#[test]
|
||||
fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
|
||||
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
|
||||
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
|
||||
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
|
||||
}
|
||||
|
||||
}
|
||||
fn num_strategy() -> impl Strategy<Value = u64> {
|
||||
prop_oneof![
|
||||
1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
|
||||
1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
|
||||
20 => prop::num::u64::ANY,
|
||||
]
|
||||
}
|
||||
|
||||
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
let mut data_and_names = vec![];
|
||||
|
||||
@@ -193,6 +193,7 @@ impl FastFieldCodec for LinearCodec {
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
#[allow(clippy::question_mark)]
|
||||
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
|
||||
if fastfield_accessor.num_vals() < 3 {
|
||||
return None; // disable compressor for this case
|
||||
@@ -258,6 +259,8 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_datasets;
|
||||
|
||||
@@ -340,10 +343,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for _ in 0..5000 {
|
||||
let mut data = (0..10_000)
|
||||
.map(|_| rand::random::<u64>())
|
||||
.collect::<Vec<_>>();
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 0..50 {
|
||||
let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{fmt, io, mem};
|
||||
use stable_deref_trait::StableDeref;
|
||||
|
||||
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
|
||||
/// this data as a static slice.
|
||||
/// this data as a slice.
|
||||
///
|
||||
/// The backing object is required to be `StableDeref`.
|
||||
#[derive(Clone)]
|
||||
|
||||
@@ -4,14 +4,14 @@ use std::rc::Rc;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
|
||||
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::segment_agg_result::BucketCount;
|
||||
use super::VecWithNames;
|
||||
use crate::fastfield::{type_and_cardinality, FastType, MultiValuedFastFieldReader};
|
||||
use crate::fastfield::{
|
||||
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
|
||||
};
|
||||
use crate::schema::{Cardinality, Type};
|
||||
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
|
||||
|
||||
@@ -37,16 +37,10 @@ impl AggregationsWithAccessor {
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum FastFieldAccessor {
|
||||
Multi(MultiValuedFastFieldReader<u64>),
|
||||
Single(Arc<dyn Column<u64>>),
|
||||
Single(DynamicFastFieldReader<u64>),
|
||||
}
|
||||
impl FastFieldAccessor {
|
||||
pub fn as_single(&self) -> Option<&dyn Column<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(_) => None,
|
||||
FastFieldAccessor::Single(reader) => Some(&**reader),
|
||||
}
|
||||
}
|
||||
pub fn into_single(self) -> Option<Arc<dyn Column<u64>>> {
|
||||
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(_) => None,
|
||||
FastFieldAccessor::Single(reader) => Some(reader),
|
||||
@@ -124,7 +118,7 @@ impl BucketAggregationWithAccessor {
|
||||
pub struct MetricAggregationWithAccessor {
|
||||
pub metric: MetricAggregation,
|
||||
pub field_type: Type,
|
||||
pub accessor: Arc<dyn Column>,
|
||||
pub accessor: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl MetricAggregationWithAccessor {
|
||||
@@ -140,8 +134,9 @@ impl MetricAggregationWithAccessor {
|
||||
|
||||
Ok(MetricAggregationWithAccessor {
|
||||
accessor: accessor
|
||||
.into_single()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality")
|
||||
.clone(),
|
||||
field_type,
|
||||
metric: metric.clone(),
|
||||
})
|
||||
|
||||
@@ -15,6 +15,7 @@ use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
@@ -263,7 +264,7 @@ impl SegmentHistogramCollector {
|
||||
req: &HistogramAggregation,
|
||||
sub_aggregation: &AggregationsWithAccessor,
|
||||
field_type: Type,
|
||||
accessor: &dyn Column<u64>,
|
||||
accessor: &DynamicFastFieldReader<u64>,
|
||||
) -> crate::Result<Self> {
|
||||
req.validate()?;
|
||||
let min = f64_from_fastfield_u64(accessor.min_value(), &field_type);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use fnv::FnvHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ use fastfield_codecs::Column;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::schema::Type;
|
||||
use crate::DocId;
|
||||
|
||||
@@ -57,7 +58,7 @@ impl SegmentAverageCollector {
|
||||
data: Default::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = field.get_val(docs[0] as u64);
|
||||
|
||||
@@ -2,6 +2,7 @@ use fastfield_codecs::Column;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
@@ -163,7 +164,7 @@ impl SegmentStatsCollector {
|
||||
stats: IntermediateStats::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = field.get_val(docs[0] as u64);
|
||||
|
||||
@@ -185,10 +185,10 @@ impl SegmentMetricResultCollector {
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) {
|
||||
match self {
|
||||
SegmentMetricResultCollector::Average(avg_collector) => {
|
||||
avg_collector.collect_block(doc, &*metric.accessor);
|
||||
avg_collector.collect_block(doc, &metric.accessor);
|
||||
}
|
||||
SegmentMetricResultCollector::Stats(stats_collector) => {
|
||||
stats_collector.collect_block(doc, &*metric.accessor);
|
||||
stats_collector.collect_block(doc, &metric.accessor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,12 +10,11 @@
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastValue};
|
||||
use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
|
||||
@@ -161,7 +160,7 @@ where
|
||||
TPredicate: 'static,
|
||||
TPredicateValue: FastValue,
|
||||
{
|
||||
fast_field_reader: Arc<dyn Column<TPredicateValue>>,
|
||||
fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
|
||||
segment_collector: TSegmentCollector,
|
||||
predicate: TPredicate,
|
||||
t_predicate_value: PhantomData<TPredicateValue>,
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastdivide::DividerU64;
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastValue};
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
@@ -87,7 +85,7 @@ impl HistogramComputer {
|
||||
}
|
||||
pub struct SegmentHistogramCollector {
|
||||
histogram_computer: HistogramComputer,
|
||||
ff_reader: Arc<dyn Column<u64>>,
|
||||
ff_reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for SegmentHistogramCollector {
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use super::*;
|
||||
use crate::collector::{Count, FilterCollector, TopDocs};
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader};
|
||||
use crate::query::{AllQuery, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
@@ -160,7 +158,7 @@ pub struct FastFieldTestCollector {
|
||||
|
||||
pub struct FastFieldSegmentCollector {
|
||||
vals: Vec<u64>,
|
||||
reader: Arc<dyn Column<u64>>,
|
||||
reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
@@ -12,7 +11,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
|
||||
};
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastValue};
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Field;
|
||||
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
@@ -132,7 +131,7 @@ impl fmt::Debug for TopDocs {
|
||||
}
|
||||
|
||||
struct ScorerByFastFieldReader {
|
||||
ff_reader: Arc<dyn Column<u64>>,
|
||||
ff_reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
@@ -410,6 +409,7 @@ impl TopDocs {
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::fastfield::Column;
|
||||
/// use tantivy::schema::Field;
|
||||
///
|
||||
/// fn create_schema() -> Schema {
|
||||
@@ -517,6 +517,7 @@ impl TopDocs {
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::Field;
|
||||
/// use fastfield_codecs::Column;
|
||||
///
|
||||
/// # fn create_schema() -> Schema {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::sync::Arc;
|
||||
|
||||
use super::segment::Segment;
|
||||
use super::IndexSettings;
|
||||
use crate::core::single_segment_index_writer::SingleSegmentIndexWriter;
|
||||
use crate::core::{
|
||||
Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
|
||||
};
|
||||
@@ -163,6 +164,25 @@ impl IndexBuilder {
|
||||
self.create(mmap_directory)
|
||||
}
|
||||
|
||||
/// Dragons ahead!!!
|
||||
///
|
||||
/// The point of this API is to let users create a simple index with a single segment
|
||||
/// and without starting any thread.
|
||||
///
|
||||
/// Do not use this method if you are not sure what you are doing.
|
||||
///
|
||||
/// It expects an originally empty directory, and will not run any GC operation.
|
||||
#[doc(hidden)]
|
||||
pub fn single_segment_index_writer(
|
||||
self,
|
||||
dir: impl Into<Box<dyn Directory>>,
|
||||
mem_budget: usize,
|
||||
) -> crate::Result<SingleSegmentIndexWriter> {
|
||||
let index = self.create(dir)?;
|
||||
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
|
||||
Ok(index_simple_writer)
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
///
|
||||
/// The index will use the `MMapDirectory` in a newly created directory.
|
||||
@@ -608,10 +628,12 @@ impl fmt::Debug for Index {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::collector::Count;
|
||||
use crate::directory::{RamDirectory, WatchCallback};
|
||||
use crate::schema::{Field, Schema, INDEXED, TEXT};
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
|
||||
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term};
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
@@ -877,4 +899,28 @@ mod tests {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_segment_index_writer() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let directory = RamDirectory::default();
|
||||
let mut single_segment_index_writer = Index::builder()
|
||||
.schema(schema)
|
||||
.single_segment_index_writer(directory, 10_000_000)?;
|
||||
for _ in 0..10 {
|
||||
let doc = doc!(text_field=>"hello");
|
||||
single_segment_index_writer.add_document(doc)?;
|
||||
}
|
||||
let index = single_segment_index_writer.finalize()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "hello"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let count = searcher.search(&term_query, &Count)?;
|
||||
assert_eq!(count, 10);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ mod segment;
|
||||
mod segment_component;
|
||||
mod segment_id;
|
||||
mod segment_reader;
|
||||
mod single_segment_index_writer;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
@@ -23,6 +24,7 @@ pub use self::segment::Segment;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
pub use self::segment_reader::SegmentReader;
|
||||
pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
|
||||
|
||||
/// The meta file contains all the information about the list of segments and the schema
|
||||
/// of the index.
|
||||
|
||||
47
src/core/single_segment_index_writer.rs
Normal file
47
src/core/single_segment_index_writer.rs
Normal file
@@ -0,0 +1,47 @@
|
||||
use crate::indexer::operation::AddOperation;
|
||||
use crate::indexer::segment_updater::save_metas;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
|
||||
|
||||
#[doc(hidden)]
|
||||
pub struct SingleSegmentIndexWriter {
|
||||
segment_writer: SegmentWriter,
|
||||
segment: Segment,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
impl SingleSegmentIndexWriter {
|
||||
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
|
||||
let segment = index.new_segment();
|
||||
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
|
||||
Ok(Self {
|
||||
segment_writer,
|
||||
segment,
|
||||
opstamp: 0,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
|
||||
let opstamp = self.opstamp;
|
||||
self.opstamp += 1;
|
||||
self.segment_writer
|
||||
.add_document(AddOperation { opstamp, document })
|
||||
}
|
||||
|
||||
pub fn finalize(self) -> crate::Result<Index> {
|
||||
let max_doc = self.segment_writer.max_doc();
|
||||
self.segment_writer.finalize()?;
|
||||
let segment: Segment = self.segment.with_max_doc(max_doc);
|
||||
let index = segment.index();
|
||||
let index_meta = IndexMeta {
|
||||
index_settings: index.settings().clone(),
|
||||
segments: vec![segment.meta().clone()],
|
||||
schema: index.schema(),
|
||||
opstamp: 0,
|
||||
payload: None,
|
||||
};
|
||||
save_metas(&index_meta, index.directory())?;
|
||||
index.directory().sync_directory()?;
|
||||
Ok(segment.index().clone())
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::fastfield::MultiValueLength;
|
||||
use crate::fastfield::{DynamicFastFieldReader, MultiValueLength};
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for byte array fast fields
|
||||
@@ -18,13 +16,13 @@ use crate::DocId;
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
#[derive(Clone)]
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: Arc<dyn Column<u64>>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
values: OwnedBytes,
|
||||
}
|
||||
|
||||
impl BytesFastFieldReader {
|
||||
pub(crate) fn open(
|
||||
idx_reader: Arc<dyn Column<u64>>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
values_file: FileSlice,
|
||||
) -> crate::Result<BytesFastFieldReader> {
|
||||
let values = values_file.read_bytes()?;
|
||||
|
||||
@@ -3,11 +3,20 @@ use std::num::NonZeroU64;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastdivide::DividerU64;
|
||||
use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec};
|
||||
use fastfield_codecs::{Column, FastFieldCodec};
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
pub const GCD_DEFAULT: u64 = 1;
|
||||
|
||||
/// Wrapper for accessing a fastfield.
|
||||
///
|
||||
/// Holds the data and the codec to the read the data.
|
||||
#[derive(Clone)]
|
||||
pub struct GCDReader<CodecReader: Column> {
|
||||
gcd_params: GCDParams,
|
||||
reader: CodecReader,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
struct GCDParams {
|
||||
gcd: u64,
|
||||
@@ -15,6 +24,12 @@ struct GCDParams {
|
||||
num_vals: u64,
|
||||
}
|
||||
|
||||
impl GCDParams {
|
||||
pub fn eval(&self, val: u64) -> u64 {
|
||||
self.min_value + self.gcd * val
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for GCDParams {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
self.gcd.serialize(writer)?;
|
||||
@@ -37,13 +52,31 @@ impl BinarySerializable for GCDParams {
|
||||
|
||||
pub fn open_gcd_from_bytes<WrappedCodec: FastFieldCodec>(
|
||||
bytes: OwnedBytes,
|
||||
) -> io::Result<impl Column> {
|
||||
) -> io::Result<GCDReader<WrappedCodec::Reader>> {
|
||||
let footer_offset = bytes.len() - 24;
|
||||
let (body, mut footer) = bytes.split(footer_offset);
|
||||
let gcd_params = GCDParams::deserialize(&mut footer)?;
|
||||
let gcd_remap = move |val: u64| gcd_params.min_value + gcd_params.gcd * val;
|
||||
let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?;
|
||||
Ok(monotonic_map_column(reader, gcd_remap))
|
||||
Ok(GCDReader { gcd_params, reader })
|
||||
}
|
||||
|
||||
impl<C: Column + Clone> Column for GCDReader<C> {
|
||||
#[inline]
|
||||
fn get_val(&self, doc: u64) -> u64 {
|
||||
let val = self.reader.get_val(doc);
|
||||
self.gcd_params.eval(val)
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
self.gcd_params.eval(self.reader.min_value())
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.gcd_params.eval(self.reader.max_value())
|
||||
}
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.gcd_params.num_vals
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write_gcd_header<W: Write>(
|
||||
@@ -101,7 +134,6 @@ mod tests {
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use common::HasLen;
|
||||
@@ -109,11 +141,11 @@ mod tests {
|
||||
|
||||
use crate::directory::{CompositeFile, RamDirectory, WritePtr};
|
||||
use crate::fastfield::gcd::compute_gcd;
|
||||
use crate::fastfield::reader::open_fast_field;
|
||||
use crate::fastfield::serializer::FastFieldCodecEnableCheck;
|
||||
use crate::fastfield::tests::{encode_decode_fast_field, FIELD, FIELDI64, SCHEMA, SCHEMAI64};
|
||||
use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64};
|
||||
use crate::fastfield::{
|
||||
find_gcd, CompositeFastFieldSerializer, FastFieldCodecType, FastFieldsWriter, ALL_CODECS,
|
||||
find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecType,
|
||||
FastFieldsWriter, ALL_CODECS,
|
||||
};
|
||||
use crate::schema::{Cardinality, Schema};
|
||||
use crate::{DateOptions, DatePrecision, DateTime, Directory};
|
||||
@@ -155,7 +187,8 @@ mod tests {
|
||||
let file = directory.open_read(path).unwrap();
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader: Arc<dyn Column<i64>> = open_fast_field(file.read_bytes()?)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?;
|
||||
|
||||
assert_eq!(fast_field_reader.get_val(0), -4000i64);
|
||||
assert_eq!(fast_field_reader.get_val(1), -3000i64);
|
||||
assert_eq!(fast_field_reader.get_val(2), -2000i64);
|
||||
@@ -196,7 +229,7 @@ mod tests {
|
||||
let file = directory.open_read(path).unwrap();
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = open_fast_field::<u64>(file.read_bytes()?)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 1000u64);
|
||||
assert_eq!(fast_field_reader.get_val(1), 2000u64);
|
||||
assert_eq!(fast_field_reader.get_val(2), 3000u64);
|
||||
@@ -225,7 +258,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield2() {
|
||||
let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]);
|
||||
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
|
||||
assert_eq!(test_fastfield.get_val(0), 100);
|
||||
assert_eq!(test_fastfield.get_val(1), 200);
|
||||
assert_eq!(test_fastfield.get_val(2), 300);
|
||||
@@ -291,7 +324,7 @@ mod tests {
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(*FIELD).unwrap();
|
||||
let len = file.len();
|
||||
let test_fastfield = open_fast_field::<DateTime>(file.read_bytes()?)?;
|
||||
let test_fastfield = DynamicFastFieldReader::<DateTime>::open(file)?;
|
||||
|
||||
assert_eq!(test_fastfield.get_val(0), time1.truncate(precision));
|
||||
assert_eq!(test_fastfield.get_val(1), time2.truncate(precision));
|
||||
|
||||
@@ -26,8 +26,9 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB
|
||||
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub(crate) use self::gcd::{find_gcd, GCD_DEFAULT};
|
||||
pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT};
|
||||
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
|
||||
pub use self::reader::DynamicFastFieldReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub(crate) use self::readers::{type_and_cardinality, FastType};
|
||||
pub use self::serializer::{Column, CompositeFastFieldSerializer, FastFieldStats};
|
||||
@@ -265,7 +266,6 @@ mod tests {
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Range;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::HasLen;
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -275,7 +275,6 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::reader::open_fast_field;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::{Document, Field, Schema, FAST, STRING, TEXT};
|
||||
use crate::time::OffsetDateTime;
|
||||
@@ -296,51 +295,9 @@ mod tests {
|
||||
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
|
||||
pub static FIELDI64: Lazy<Field> = Lazy::new(|| SCHEMAI64.get_field("field").unwrap());
|
||||
|
||||
/// Encode values using the most appropriate codec and and then loads it
|
||||
/// right away.
|
||||
///
|
||||
/// This is useful in tests and bench.
|
||||
pub(crate) fn encode_decode_fast_field<Item: FastValue>(
|
||||
vals: &[Item],
|
||||
) -> Arc<dyn Column<Item>> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("__dummy__");
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory
|
||||
.open_write(path)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
{
|
||||
let fast_field_writer = fast_field_writers
|
||||
.get_field_writer_mut(field)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
for val in vals {
|
||||
fast_field_writer.add_val(val.to_u64());
|
||||
}
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).expect("Failed to open the file");
|
||||
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
|
||||
let field_bytes = composite_file
|
||||
.open_read(field)
|
||||
.expect("File component not found")
|
||||
.read_bytes()
|
||||
.unwrap();
|
||||
open_fast_field(field_bytes).unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]);
|
||||
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
|
||||
assert_eq!(test_fastfield.get_val(0u64), 100);
|
||||
assert_eq!(test_fastfield.get_val(1u64), 200);
|
||||
assert_eq!(test_fastfield.get_val(2u64), 300);
|
||||
@@ -371,8 +328,8 @@ mod tests {
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 45);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let fast_field_bytes = composite_file.open_read(*FIELD).unwrap().read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<u64>(fast_field_bytes)?;
|
||||
let file = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get_val(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get_val(2), 2u64);
|
||||
@@ -403,11 +360,8 @@ mod tests {
|
||||
assert_eq!(file.len(), 70);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read(*FIELD)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<u64>(data)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get_val(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get_val(2), 3_052u64);
|
||||
@@ -442,11 +396,8 @@ mod tests {
|
||||
assert_eq!(file.len(), 43);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite
|
||||
.open_read(*FIELD)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<u64>(data)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get_val(doc), 100_000u64);
|
||||
}
|
||||
@@ -477,11 +428,8 @@ mod tests {
|
||||
assert_eq!(file.len(), 80051);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read(*FIELD)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<u64>(data)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(
|
||||
@@ -521,11 +469,8 @@ mod tests {
|
||||
assert_eq!(file.len(), 75_usize); // linear interpol size after calc improvement
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read(i64_field)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<i64>(data)?;
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
|
||||
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
@@ -564,11 +509,8 @@ mod tests {
|
||||
let file = directory.open_read(path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite
|
||||
.open_read(i64_field)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<i64>(data)?;
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), 0i64);
|
||||
}
|
||||
Ok(())
|
||||
@@ -605,11 +547,8 @@ mod tests {
|
||||
let file = directory.open_read(path)?;
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read(*FIELD)
|
||||
.unwrap()
|
||||
.read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<u64>(data)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
|
||||
for a in 0..n {
|
||||
assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]);
|
||||
@@ -668,7 +607,7 @@ mod tests {
|
||||
let mut all = vec![];
|
||||
|
||||
for doc in docs {
|
||||
let mut out: Vec<u64> = vec![];
|
||||
let mut out = vec![];
|
||||
ff.get_vals(doc, &mut out);
|
||||
all.extend(out);
|
||||
}
|
||||
@@ -926,7 +865,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield_bool() {
|
||||
let test_fastfield = encode_decode_fast_field::<bool>(&[true, false, true, false]);
|
||||
let test_fastfield = DynamicFastFieldReader::<bool>::from(vec![true, false, true, false]);
|
||||
assert_eq!(test_fastfield.get_val(0), true);
|
||||
assert_eq!(test_fastfield.get_val(1), false);
|
||||
assert_eq!(test_fastfield.get_val(2), true);
|
||||
@@ -959,8 +898,8 @@ mod tests {
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 44);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let data = composite_file.open_read(field).unwrap().read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<bool>(data)?;
|
||||
let file = composite_file.open_read(field).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), true);
|
||||
assert_eq!(fast_field_reader.get_val(1), false);
|
||||
assert_eq!(fast_field_reader.get_val(2), true);
|
||||
@@ -995,8 +934,8 @@ mod tests {
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 56);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let data = composite_file.open_read(field).unwrap().read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<bool>(data)?;
|
||||
let file = composite_file.open_read(field).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
|
||||
for i in 0..25 {
|
||||
assert_eq!(fast_field_reader.get_val(i * 2), true);
|
||||
assert_eq!(fast_field_reader.get_val(i * 2 + 1), false);
|
||||
@@ -1029,8 +968,8 @@ mod tests {
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 43);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let data = composite_file.open_read(field).unwrap().read_bytes()?;
|
||||
let fast_field_reader = open_fast_field::<bool>(data)?;
|
||||
let file = composite_file.open_read(field).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
|
||||
assert_eq!(fast_field_reader.get_val(0), false);
|
||||
|
||||
Ok(())
|
||||
@@ -1039,22 +978,20 @@ mod tests {
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use test::{self, Bencher};
|
||||
|
||||
use crate::fastfield::tests::{
|
||||
encode_decode_fast_field, generate_permutation, generate_permutation_gcd,
|
||||
};
|
||||
use super::tests::generate_permutation;
|
||||
use super::*;
|
||||
use crate::fastfield::tests::generate_permutation_gcd;
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
let n = permutation.len();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
for _ in 0..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
a
|
||||
@@ -1064,11 +1001,11 @@ mod bench {
|
||||
#[bench]
|
||||
fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
|
||||
let n = permutation.len();
|
||||
let column = DynamicFastFieldReader::from(permutation);
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
for _ in 0..n {
|
||||
a = column.get_val(a as u64);
|
||||
}
|
||||
a
|
||||
@@ -1076,11 +1013,24 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
|
||||
fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
|
||||
let n = permutation.len();
|
||||
b.iter(|| {
|
||||
let mut a = 0u64;
|
||||
for i in (0..n / 7).map(|val| val * 7) {
|
||||
a += permutation[i as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
let n = permutation.len();
|
||||
let column = DynamicFastFieldReader::from(permutation);
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in (0..n / 7).map(|val| val * 7) {
|
||||
a += column.get_val(i as u64);
|
||||
@@ -1090,38 +1040,13 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_vec(b: &mut Bencher) {
|
||||
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000);
|
||||
let mut a = 0u64;
|
||||
for i in (0..n / 7).map(|val| val * 7) {
|
||||
a += permutation[i];
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_fflookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
|
||||
let n = permutation.len();
|
||||
let column = DynamicFastFieldReader::from(permutation);
|
||||
b.iter(|| {
|
||||
let mut a = 0u64;
|
||||
for i in 0u64..permutation.len() as u64 {
|
||||
a = column.get_val(i);
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
|
||||
let permutation = generate_permutation_gcd();
|
||||
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
|
||||
b.iter(|| {
|
||||
let mut a = 0u64;
|
||||
for i in 0..permutation.len() as u64 {
|
||||
for i in 0u64..n as u64 {
|
||||
a += column.get_val(i);
|
||||
}
|
||||
a
|
||||
@@ -1129,8 +1054,22 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_vec(b: &mut Bencher) {
|
||||
fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
|
||||
let permutation = generate_permutation_gcd();
|
||||
let n = permutation.len();
|
||||
let column = DynamicFastFieldReader::from(permutation);
|
||||
b.iter(|| {
|
||||
let mut a = 0u64;
|
||||
for i in 0..n as u64 {
|
||||
a += column.get_val(i);
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let mut a = 0u64;
|
||||
for i in 0..permutation.len() {
|
||||
|
||||
@@ -346,7 +346,6 @@ mod tests {
|
||||
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_proptest_gcd() {
|
||||
use IndexingOp::*;
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::fastfield::{FastValue, MultiValueLength};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastValue, MultiValueLength};
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
@@ -15,14 +14,14 @@ use crate::DocId;
|
||||
/// The `idx_reader` associated, for each document, the index of its first value.
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValuedFastFieldReader<Item: FastValue> {
|
||||
idx_reader: Arc<dyn Column<u64>>,
|
||||
vals_reader: Arc<dyn Column<Item>>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: DynamicFastFieldReader<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
pub(crate) fn open(
|
||||
idx_reader: Arc<dyn Column<u64>>,
|
||||
vals_reader: Arc<dyn Column<Item>>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: DynamicFastFieldReader<Item>,
|
||||
) -> MultiValuedFastFieldReader<Item> {
|
||||
MultiValuedFastFieldReader {
|
||||
idx_reader,
|
||||
|
||||
@@ -1,65 +1,144 @@
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
use std::path::Path;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::bitpacked::BitpackedCodec;
|
||||
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
|
||||
use fastfield_codecs::linear::LinearCodec;
|
||||
use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType};
|
||||
use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader};
|
||||
use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader};
|
||||
use fastfield_codecs::linear::{LinearCodec, LinearReader};
|
||||
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType};
|
||||
|
||||
use super::gcd::open_gcd_from_bytes;
|
||||
use super::FastValue;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader};
|
||||
use crate::schema::{Schema, FAST};
|
||||
|
||||
fn open_codec_from_bytes<C: FastFieldCodec, Item: FastValue>(
|
||||
bytes: OwnedBytes,
|
||||
) -> crate::Result<Arc<dyn Column<Item>>> {
|
||||
let reader = C::open_from_bytes(bytes)?;
|
||||
Ok(Arc::new(monotonic_map_column(reader, Item::from_u64)))
|
||||
#[derive(Clone)]
|
||||
/// DynamicFastFieldReader wraps different readers to access
|
||||
/// the various encoded fastfield data
|
||||
pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
/// Bitpacked compressed fastfield data.
|
||||
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
|
||||
/// Linear interpolated values + bitpacked
|
||||
Linear(FastFieldReaderCodecWrapper<Item, LinearReader>),
|
||||
/// Blockwise linear interpolated values + bitpacked
|
||||
BlockwiseLinear(FastFieldReaderCodecWrapper<Item, BlockwiseLinearReader>),
|
||||
|
||||
/// GCD and Bitpacked compressed fastfield data.
|
||||
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BitpackedReader>>),
|
||||
/// GCD and Linear interpolated values + bitpacked
|
||||
LinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<LinearReader>>),
|
||||
/// GCD and Blockwise linear interpolated values + bitpacked
|
||||
BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BlockwiseLinearReader>>),
|
||||
}
|
||||
|
||||
fn open_codec_with_gcd<C: FastFieldCodec, Item: FastValue>(
|
||||
bytes: OwnedBytes,
|
||||
) -> crate::Result<Arc<dyn Column<Item>>> {
|
||||
let reader = open_gcd_from_bytes::<C>(bytes)?;
|
||||
Ok(Arc::new(monotonic_map_column(reader, Item::from_u64)))
|
||||
}
|
||||
|
||||
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
fn open_from_id<Item: FastValue>(
|
||||
mut bytes: OwnedBytes,
|
||||
codec_type: FastFieldCodecType,
|
||||
) -> crate::Result<Arc<dyn Column<Item>>> {
|
||||
match codec_type {
|
||||
FastFieldCodecType::Bitpacked => open_codec_from_bytes::<BitpackedCodec, _>(bytes),
|
||||
FastFieldCodecType::Linear => open_codec_from_bytes::<LinearCodec, _>(bytes),
|
||||
FastFieldCodecType::BlockwiseLinear => {
|
||||
open_codec_from_bytes::<BlockwiseLinearCodec, _>(bytes)
|
||||
}
|
||||
FastFieldCodecType::Gcd => {
|
||||
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
|
||||
match codec_type {
|
||||
FastFieldCodecType::Bitpacked => open_codec_with_gcd::<BitpackedCodec, _>(bytes),
|
||||
FastFieldCodecType::Linear => open_codec_with_gcd::<LinearCodec, _>(bytes),
|
||||
FastFieldCodecType::BlockwiseLinear => {
|
||||
open_codec_with_gcd::<BlockwiseLinearCodec, _>(bytes)
|
||||
}
|
||||
FastFieldCodecType::Gcd => Err(DataCorruption::comment_only(
|
||||
"Gcd codec wrapped into another gcd codec. This combination is not allowed.",
|
||||
)
|
||||
.into()),
|
||||
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
pub fn open_from_id(
|
||||
mut bytes: OwnedBytes,
|
||||
codec_type: FastFieldCodecType,
|
||||
) -> crate::Result<DynamicFastFieldReader<Item>> {
|
||||
let reader = match codec_type {
|
||||
FastFieldCodecType::Bitpacked => {
|
||||
DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into())
|
||||
}
|
||||
}
|
||||
FastFieldCodecType::Linear => {
|
||||
DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into())
|
||||
}
|
||||
FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear(
|
||||
BlockwiseLinearCodec::open_from_bytes(bytes)?.into(),
|
||||
),
|
||||
FastFieldCodecType::Gcd => {
|
||||
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
|
||||
match codec_type {
|
||||
FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD(
|
||||
open_gcd_from_bytes::<BitpackedCodec>(bytes)?.into(),
|
||||
),
|
||||
FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD(
|
||||
open_gcd_from_bytes::<LinearCodec>(bytes)?.into(),
|
||||
),
|
||||
FastFieldCodecType::BlockwiseLinear => {
|
||||
DynamicFastFieldReader::BlockwiseLinearGCD(
|
||||
open_gcd_from_bytes::<BlockwiseLinearCodec>(bytes)?.into(),
|
||||
)
|
||||
}
|
||||
FastFieldCodecType::Gcd => {
|
||||
return Err(DataCorruption::comment_only(
|
||||
"Gcd codec wrapped into another gcd codec. This combination is not \
|
||||
allowed.",
|
||||
)
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
|
||||
Self::open_from_id(bytes, codec_type)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
pub fn open_fast_field<Item: FastValue>(
|
||||
mut bytes: OwnedBytes,
|
||||
) -> crate::Result<Arc<dyn Column<Item>>> {
|
||||
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
|
||||
open_from_id(bytes, codec_type)
|
||||
impl<Item: FastValue> Column<Item> for DynamicFastFieldReader<Item> {
|
||||
#[inline]
|
||||
fn get_val(&self, idx: u64) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get_val(idx),
|
||||
Self::Linear(reader) => reader.get_val(idx),
|
||||
Self::BlockwiseLinear(reader) => reader.get_val(idx),
|
||||
Self::BitpackedGCD(reader) => reader.get_val(idx),
|
||||
Self::LinearGCD(reader) => reader.get_val(idx),
|
||||
Self::BlockwiseLinearGCD(reader) => reader.get_val(idx),
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
fn get_range(&self, start: u64, output: &mut [Item]) {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||
Self::Linear(reader) => reader.get_range(start, output),
|
||||
Self::BlockwiseLinear(reader) => reader.get_range(start, output),
|
||||
Self::BitpackedGCD(reader) => reader.get_range(start, output),
|
||||
Self::LinearGCD(reader) => reader.get_range(start, output),
|
||||
Self::BlockwiseLinearGCD(reader) => reader.get_range(start, output),
|
||||
}
|
||||
}
|
||||
fn min_value(&self) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.min_value(),
|
||||
Self::Linear(reader) => reader.min_value(),
|
||||
Self::BlockwiseLinear(reader) => reader.min_value(),
|
||||
Self::BitpackedGCD(reader) => reader.min_value(),
|
||||
Self::LinearGCD(reader) => reader.min_value(),
|
||||
Self::BlockwiseLinearGCD(reader) => reader.min_value(),
|
||||
}
|
||||
}
|
||||
fn max_value(&self) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.max_value(),
|
||||
Self::Linear(reader) => reader.max_value(),
|
||||
Self::BlockwiseLinear(reader) => reader.max_value(),
|
||||
Self::BitpackedGCD(reader) => reader.max_value(),
|
||||
Self::LinearGCD(reader) => reader.max_value(),
|
||||
Self::BlockwiseLinearGCD(reader) => reader.max_value(),
|
||||
}
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.num_vals(),
|
||||
Self::Linear(reader) => reader.num_vals(),
|
||||
Self::BlockwiseLinear(reader) => reader.num_vals(),
|
||||
Self::BitpackedGCD(reader) => reader.num_vals(),
|
||||
Self::LinearGCD(reader) => reader.num_vals(),
|
||||
Self::BlockwiseLinearGCD(reader) => reader.num_vals(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for accessing a fastfield.
|
||||
@@ -160,3 +239,40 @@ impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWr
|
||||
self.reader.num_vals()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
|
||||
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("__dummy__");
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory
|
||||
.open_write(path)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
{
|
||||
let fast_field_writer = fast_field_writers
|
||||
.get_field_writer_mut(field)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
for val in vals {
|
||||
fast_field_writer.add_val(val.to_u64());
|
||||
}
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
let file = directory.open_read(path).expect("Failed to open the file");
|
||||
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
|
||||
let field_file = composite_file
|
||||
.open_read(field)
|
||||
.expect("File component not found");
|
||||
DynamicFastFieldReader::open(field_file).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use super::reader::DynamicFastFieldReader;
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::fastfield::reader::open_fast_field;
|
||||
use crate::fastfield::{
|
||||
BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader,
|
||||
};
|
||||
@@ -113,16 +109,14 @@ impl FastFieldReaders {
|
||||
&self,
|
||||
field: Field,
|
||||
index: usize,
|
||||
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
|
||||
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
|
||||
let fast_field_slice = self.fast_field_data(field, index)?;
|
||||
let bytes = fast_field_slice.read_bytes()?;
|
||||
open_fast_field(bytes)
|
||||
DynamicFastFieldReader::open(fast_field_slice)
|
||||
}
|
||||
|
||||
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
|
||||
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
|
||||
self.typed_fast_field_reader_with_idx(field, 0)
|
||||
}
|
||||
|
||||
@@ -138,7 +132,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `u64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 fast field, this method returns an Error.
|
||||
pub fn u64(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
|
||||
pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
|
||||
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -148,14 +142,14 @@ impl FastFieldReaders {
|
||||
///
|
||||
/// If not, the fastfield reader will returns the u64-value associated to the original
|
||||
/// FastValue.
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns an Error.
|
||||
pub fn i64(&self, field: Field) -> crate::Result<Arc<dyn Column<i64>>> {
|
||||
pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> {
|
||||
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -163,7 +157,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `date` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a date fast field, this method returns an Error.
|
||||
pub fn date(&self, field: Field) -> crate::Result<Arc<dyn Column<DateTime>>> {
|
||||
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<DateTime>> {
|
||||
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -171,7 +165,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `f64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a f64 fast field, this method returns an Error.
|
||||
pub fn f64(&self, field: Field) -> crate::Result<Arc<dyn Column<f64>>> {
|
||||
pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> {
|
||||
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -179,7 +173,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `bool` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bool fast field, this method returns an Error.
|
||||
pub fn bool(&self, field: Field) -> crate::Result<Arc<dyn Column<bool>>> {
|
||||
pub fn bool(&self, field: Field) -> crate::Result<DynamicFastFieldReader<bool>> {
|
||||
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -247,8 +241,7 @@ impl FastFieldReaders {
|
||||
)));
|
||||
}
|
||||
let fast_field_idx_file = self.fast_field_data(field, 0)?;
|
||||
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
|
||||
let idx_reader = open_fast_field(fast_field_idx_bytes)?;
|
||||
let idx_reader = DynamicFastFieldReader::open(fast_field_idx_file)?;
|
||||
let data = self.fast_field_data(field, 1)?;
|
||||
BytesFastFieldReader::open(idx_reader, data)
|
||||
} else {
|
||||
|
||||
@@ -6,7 +6,7 @@ use fastdivide::DividerU64;
|
||||
pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy};
|
||||
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
|
||||
use fastfield_codecs::linear::LinearCodec;
|
||||
use fastfield_codecs::{monotonic_map_column, FastFieldCodecType};
|
||||
use fastfield_codecs::FastFieldCodecType;
|
||||
pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats};
|
||||
|
||||
use super::{find_gcd, ALL_CODECS, GCD_DEFAULT};
|
||||
@@ -64,8 +64,8 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {
|
||||
|
||||
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
fn codec_estimation<C: FastFieldCodec, D: Column>(
|
||||
fastfield_accessor: &D,
|
||||
fn codec_estimation<C: FastFieldCodec>(
|
||||
fastfield_accessor: &impl Column,
|
||||
estimations: &mut Vec<(f32, FastFieldCodecType)>,
|
||||
) {
|
||||
if let Some(ratio) = C::estimate(fastfield_accessor) {
|
||||
@@ -136,22 +136,56 @@ impl CompositeFastFieldSerializer {
|
||||
}
|
||||
|
||||
Self::write_header(field_write, FastFieldCodecType::Gcd)?;
|
||||
struct GCDWrappedFFAccess<T: Column> {
|
||||
fastfield_accessor: T,
|
||||
base_value: u64,
|
||||
max_value: u64,
|
||||
num_vals: u64,
|
||||
gcd: DividerU64,
|
||||
}
|
||||
|
||||
impl<T: Column> Column for GCDWrappedFFAccess<T> {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self.gcd
|
||||
.divide(self.fastfield_accessor.get_val(position) - self.base_value)
|
||||
}
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
|
||||
Box::new(
|
||||
self.fastfield_accessor
|
||||
.iter()
|
||||
.map(|val| self.gcd.divide(val - self.base_value)),
|
||||
)
|
||||
}
|
||||
fn min_value(&self) -> u64 {
|
||||
0
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.num_vals
|
||||
}
|
||||
}
|
||||
|
||||
let num_vals = fastfield_accessor.num_vals();
|
||||
let base_value = fastfield_accessor.min_value();
|
||||
let max_value = (fastfield_accessor.max_value() - fastfield_accessor.min_value()) / gcd;
|
||||
|
||||
let gcd_divider = DividerU64::divide_by(gcd);
|
||||
|
||||
let divided_fastfield_accessor = monotonic_map_column(fastfield_accessor, |val: u64| {
|
||||
gcd_divider.divide(val - base_value)
|
||||
});
|
||||
|
||||
let num_vals = divided_fastfield_accessor.num_vals();
|
||||
let fastfield_accessor = GCDWrappedFFAccess {
|
||||
fastfield_accessor,
|
||||
base_value,
|
||||
max_value,
|
||||
num_vals,
|
||||
gcd: DividerU64::divide_by(gcd),
|
||||
};
|
||||
|
||||
Self::create_auto_detect_u64_fast_field_with_idx_gcd(
|
||||
self.codec_enable_checker.clone(),
|
||||
field,
|
||||
field_write,
|
||||
divided_fastfield_accessor,
|
||||
fastfield_accessor,
|
||||
)?;
|
||||
write_gcd_header(field_write, base_value, gcd, num_vals)?;
|
||||
Ok(())
|
||||
@@ -168,13 +202,13 @@ impl CompositeFastFieldSerializer {
|
||||
let mut estimations = vec![];
|
||||
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
|
||||
codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
|
||||
codec_estimation::<BitpackedCodec>(&fastfield_accessor, &mut estimations);
|
||||
}
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
|
||||
codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
|
||||
codec_estimation::<LinearCodec>(&fastfield_accessor, &mut estimations);
|
||||
}
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
|
||||
codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
|
||||
codec_estimation::<BlockwiseLinearCodec>(&fastfield_accessor, &mut estimations);
|
||||
}
|
||||
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
|
||||
{
|
||||
|
||||
@@ -143,6 +143,8 @@ pub(crate) fn get_doc_id_mapping_from_field(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_indexsorting {
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::collector::TopDocs;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::query::QueryParser;
|
||||
|
||||
@@ -775,6 +775,7 @@ impl Drop for IndexWriter {
|
||||
mod tests {
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use proptest::prelude::*;
|
||||
use proptest::prop_oneof;
|
||||
use proptest::strategy::Strategy;
|
||||
|
||||
@@ -9,8 +9,8 @@ use crate::core::{Segment, SegmentReader};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{
|
||||
AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldStats, MultiValueLength,
|
||||
MultiValuedFastFieldReader,
|
||||
AliveBitSet, Column, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldStats,
|
||||
MultiValueLength, MultiValuedFastFieldReader,
|
||||
};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
|
||||
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
|
||||
@@ -87,7 +87,7 @@ pub struct IndexMerger {
|
||||
}
|
||||
|
||||
fn compute_min_max_val(
|
||||
u64_reader: &dyn Column<u64>,
|
||||
u64_reader: &impl Column<u64>,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Option<(u64, u64)> {
|
||||
if segment_reader.max_doc() == 0 {
|
||||
@@ -341,12 +341,12 @@ impl IndexMerger {
|
||||
.readers
|
||||
.iter()
|
||||
.filter_map(|reader| {
|
||||
let u64_reader: Arc<dyn Column<u64>> =
|
||||
let u64_reader: DynamicFastFieldReader<u64> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
);
|
||||
compute_min_max_val(&*u64_reader, reader)
|
||||
compute_min_max_val(&u64_reader, reader)
|
||||
})
|
||||
.reduce(|a, b| (a.0.min(b.0), a.1.max(b.1)))
|
||||
.expect("Unexpected error, empty readers in IndexMerger");
|
||||
@@ -355,7 +355,7 @@ impl IndexMerger {
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| {
|
||||
let u64_reader: Arc<dyn Column<u64>> =
|
||||
let u64_reader: DynamicFastFieldReader<u64> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
@@ -372,7 +372,7 @@ impl IndexMerger {
|
||||
#[derive(Clone)]
|
||||
struct SortedDocIdFieldAccessProvider<'a> {
|
||||
doc_id_mapping: &'a SegmentDocIdMapping,
|
||||
fast_field_readers: &'a Vec<Arc<dyn Column<u64>>>,
|
||||
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
|
||||
stats: FastFieldStats,
|
||||
}
|
||||
impl<'a> Column for SortedDocIdFieldAccessProvider<'a> {
|
||||
@@ -443,7 +443,7 @@ impl IndexMerger {
|
||||
pub(crate) fn get_sort_field_accessor(
|
||||
reader: &SegmentReader,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Arc<dyn Column>> {
|
||||
) -> crate::Result<impl Column> {
|
||||
let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required
|
||||
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
|
||||
Ok(value_accessor)
|
||||
@@ -452,7 +452,7 @@ impl IndexMerger {
|
||||
pub(crate) fn get_reader_with_sort_field_accessor(
|
||||
&self,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Vec<(SegmentOrdinal, Arc<dyn Column>)>> {
|
||||
) -> crate::Result<Vec<(SegmentOrdinal, impl Column)>> {
|
||||
let reader_ordinal_and_field_accessors = self
|
||||
.readers
|
||||
.iter()
|
||||
@@ -618,7 +618,7 @@ impl IndexMerger {
|
||||
.map(|reader| {
|
||||
let u64s_reader: MultiValuedFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_multi_reader::<u64>(field)
|
||||
.typed_fast_field_multi_reader(field)
|
||||
.expect(
|
||||
"Failed to find index for multivalued field. This is a bug in tantivy, \
|
||||
please report.",
|
||||
@@ -668,7 +668,7 @@ impl IndexMerger {
|
||||
{
|
||||
let mut serialize_vals =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
|
||||
let mut vals: Vec<u64> = Vec::with_capacity(100);
|
||||
let mut vals = Vec::with_capacity(100);
|
||||
|
||||
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
|
||||
let term_ordinal_mapping: &[TermOrdinal] =
|
||||
@@ -742,7 +742,7 @@ impl IndexMerger {
|
||||
for reader in &self.readers {
|
||||
let ff_reader: MultiValuedFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_multi_reader::<u64>(field)
|
||||
.typed_fast_field_multi_reader(field)
|
||||
.expect(
|
||||
"Failed to find multivalued fast field reader. This is a bug in tantivy. \
|
||||
Please report.",
|
||||
@@ -1199,6 +1199,7 @@ impl IndexMerger {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use fastfield_codecs::Column;
|
||||
use schema::FAST;
|
||||
|
||||
use crate::collector::tests::{
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::collector::TopDocs;
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
|
||||
@@ -478,12 +480,11 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench_sorted_index_merge {
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use test::{self, Bencher};
|
||||
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::schema::{Cardinality, NumericOptions, Schema};
|
||||
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
|
||||
@@ -535,7 +536,7 @@ mod bench_sorted_index_merge {
|
||||
b.iter(|| {
|
||||
let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
|
||||
let reader = &merger.readers[doc_addr.segment_ord as usize];
|
||||
let u64_reader: Arc<dyn Column<u64>> =
|
||||
let u64_reader: DynamicFastFieldReader<u64> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
|
||||
@@ -301,7 +301,7 @@ pub use self::docset::{DocSet, TERMINATED};
|
||||
pub use crate::core::{
|
||||
Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
|
||||
Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
|
||||
SegmentReader,
|
||||
SegmentReader, SingleSegmentIndexWriter,
|
||||
};
|
||||
pub use crate::directory::Directory;
|
||||
pub use crate::indexer::demuxer::*;
|
||||
@@ -421,6 +421,7 @@ pub struct DocAddress {
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use fastfield_codecs::Column;
|
||||
use rand::distributions::{Bernoulli, Uniform};
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
@@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition {
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
/// `PostingsWriter` writes in a `MemoryArena`.
|
||||
pub(crate) trait PostingsWriter {
|
||||
pub(crate) trait PostingsWriter: Send + Sync {
|
||||
/// Record that a document contains a term at a given position.
|
||||
///
|
||||
/// * doc - the document id
|
||||
|
||||
@@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub(crate) trait Recorder: Copy + Default + 'static {
|
||||
pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
|
||||
/// Returns the current document
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
|
||||
@@ -339,7 +339,7 @@ impl StoreReader {
|
||||
async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult<Block> {
|
||||
let cache_key = checkpoint.byte_range.start;
|
||||
if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) {
|
||||
return Ok(block);
|
||||
return Ok(block.clone());
|
||||
}
|
||||
|
||||
let compressed_block = self
|
||||
|
||||
@@ -172,7 +172,8 @@ where TValueReader: value::ValueReader
|
||||
}
|
||||
|
||||
pub fn suffix(&self) -> &[u8] {
|
||||
self.block_reader
|
||||
&self
|
||||
.block_reader
|
||||
.buffer_from_to(self.suffix_start, self.suffix_end)
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ pub struct SSTableIndexBuilder {
|
||||
/// matches `left <= left' < right`.
|
||||
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
|
||||
assert!(&left[..] < right);
|
||||
let common_len = common_prefix_len(left, right);
|
||||
let common_len = common_prefix_len(&left, right);
|
||||
if left.len() == common_len {
|
||||
return;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user