Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-30 14:02:55 +00:00

Compare commits: refact-cod...refact-dyn (14 commits)

| SHA1 |
|---|
| 7d551fb580 |
| a451f6d60d |
| f740ddeee3 |
| 7a26cc9022 |
| 54972caa7c |
| 5d436759b0 |
| 6f563b1606 |
| 095fb68fda |
| 6316eaefc6 |
| 5331be800b |
| c73b425bc1 |
| 54cfd0d154 |
| 0dd62169c8 |
| 3984cafccc |
@@ -7,10 +7,12 @@
 // Of course, you can have a look at the tantivy's built-in collectors
 // such as the `CountCollector` for more examples.

+use std::sync::Arc;
+
+use fastfield_codecs::Column;
 // ---
 // Importing tantivy...
 use tantivy::collector::{Collector, SegmentCollector};
-use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
 use tantivy::query::QueryParser;
 use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT};
 use tantivy::{doc, Index, Score, SegmentReader};

@@ -95,7 +97,7 @@ impl Collector for StatsCollector {
 }

 struct StatsSegmentCollector {
-    fast_field_reader: DynamicFastFieldReader<u64>,
+    fast_field_reader: Arc<dyn Column<u64>>,
     stats: Stats,
 }

@@ -103,7 +105,7 @@ impl SegmentCollector for StatsSegmentCollector {
     type Fruit = Option<Stats>;

     fn collect(&mut self, doc: u32, _score: Score) {
-        let value = self.fast_field_reader.get(doc) as f64;
+        let value = self.fast_field_reader.get_val(doc as u64) as f64;
         self.stats.count += 1;
         self.stats.sum += value;
         self.stats.squared_sum += value * value;
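Taken together, these three hunks are the whole migration recipe for a custom collector: store an `Arc<dyn Column<u64>>` instead of the removed `DynamicFastFieldReader<u64>`, and widen the `u32` DocId at each read. A minimal sketch of the resulting shape (`SumCollector` is a hypothetical name, not from the diff):

```rust
use std::sync::Arc;

use fastfield_codecs::Column;

// Hypothetical segment-collector core, shaped like StatsSegmentCollector above.
struct SumCollector {
    reader: Arc<dyn Column<u64>>,
    sum: u64,
}

impl SumCollector {
    fn collect(&mut self, doc: u32) {
        // `Column::get_val` indexes by u64, so the u32 DocId is widened here.
        self.sum += self.reader.get_val(doc as u64);
    }
}
```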
@@ -3,7 +3,6 @@ use std::collections::{HashMap, HashSet};
 use std::sync::{Arc, RwLock, Weak};

 use tantivy::collector::TopDocs;
-use tantivy::fastfield::FastFieldReader;
 use tantivy::query::QueryParser;
 use tantivy::schema::{Field, Schema, FAST, TEXT};
 use tantivy::{

@@ -52,7 +51,7 @@ impl Warmer for DynamicPriceColumn {
         let product_id_reader = segment.fast_fields().u64(self.field)?;
         let product_ids: Vec<ProductId> = segment
             .doc_ids_alive()
-            .map(|doc| product_id_reader.get(doc))
+            .map(|doc| product_id_reader.get_val(doc as u64))
             .collect();
         let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
         let mut price_vals: Vec<Price> = Vec::new();
@@ -4,7 +4,7 @@ use common::BinarySerializable;
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

-use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
+use crate::{Column, FastFieldCodec, FastFieldCodecType};

 /// Depending on the field type, a different
 /// fast field is required.

@@ -17,7 +17,7 @@ pub struct BitpackedReader {
     num_vals: u64,
 }

-impl FastFieldDataAccess for BitpackedReader {
+impl Column for BitpackedReader {
     #[inline]
     fn get_val(&self, doc: u64) -> u64 {
         self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)

@@ -124,10 +124,7 @@ impl FastFieldCodec for BitpackedCodec {
     /// It requires a `min_value` and a `max_value` to compute
     /// compute the minimum number of bits required to encode
     /// values.
-    fn serialize(
-        write: &mut impl Write,
-        fastfield_accessor: &dyn FastFieldDataAccess,
-    ) -> io::Result<()> {
+    fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
         let mut serializer = BitpackedSerializerLegacy::open(
             write,
             fastfield_accessor.min_value(),

@@ -141,21 +138,19 @@ impl FastFieldCodec for BitpackedCodec {

         Ok(())
     }
-    fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool {
-        true
-    }
-    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
+
+    fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
         let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
         let num_bits = compute_num_bits(amplitude);
         let num_bits_uncompressed = 64;
-        num_bits as f32 / num_bits_uncompressed as f32
+        Some(num_bits as f32 / num_bits_uncompressed as f32)
     }
 }

 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::tests::get_codec_test_data_sets;
+    use crate::tests::get_codec_test_datasets;

     fn create_and_validate(data: &[u64], name: &str) {
         crate::tests::create_and_validate::<BitpackedCodec>(data, name);

@@ -163,7 +158,7 @@ mod tests {

     #[test]
     fn test_with_codec_data_sets() {
-        let data_sets = get_codec_test_data_sets();
+        let data_sets = get_codec_test_datasets();
         for (mut data, name) in data_sets {
             create_and_validate(&data, name);
             data.reverse();
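The same two-method collapse repeats for every codec below: `is_applicable` disappears and `estimate` returns `Option<f32>`, with `None` meaning the codec declines the data. A hedged sketch of what a caller can now do (`estimate_for` is a hypothetical helper, not part of this diff):

```rust
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType};

// One Option per codec: None replaces the old `is_applicable() == false`,
// Some(ratio) is the estimated size relative to raw 64-bit storage.
fn estimate_for<C: FastFieldCodec>(column: &impl Column) -> Option<(f32, FastFieldCodecType)> {
    C::estimate(column).map(|ratio| (ratio, C::CODEC_TYPE))
}
```

The codec-comparison binary later in this diff relies on exactly this shape, collecting the per-codec `Option`s and calling `.flatten()` before ranking by compression ratio.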
@@ -18,7 +18,7 @@ use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

 use crate::linear::{get_calculated_value, get_slope};
-use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
+use crate::{Column, FastFieldCodec, FastFieldCodecType};

 const CHUNK_SIZE: u64 = 512;

@@ -146,7 +146,7 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
     &interpolations[get_interpolation_position(doc)]
 }

-impl FastFieldDataAccess for BlockwiseLinearReader {
+impl Column for BlockwiseLinearReader {
     #[inline]
     fn get_val(&self, idx: u64) -> u64 {
         let interpolation = get_interpolation_function(idx, &self.footer.interpolations);

@@ -195,10 +195,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
     }

     /// Creates a new fast field serializer.
-    fn serialize(
-        write: &mut impl Write,
-        fastfield_accessor: &dyn FastFieldDataAccess,
-    ) -> io::Result<()> {
+    fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
         assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());

         let first_val = fastfield_accessor.get_val(0);

@@ -289,10 +286,14 @@ impl FastFieldCodec for BlockwiseLinearCodec {
         Ok(())
     }

-    fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
-        if fastfield_accessor.num_vals() < 5_000 {
-            return false;
+    /// estimation for linear interpolation is hard because, you don't know
+    /// where the local maxima are for the deviation of the calculated value and
+    /// the offset is also unknown.
+    fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
+        if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
+            return None;
         }

         // On serialization the offset is added to the actual value.
         // We need to make sure this won't run into overflow calculation issues.
         // For this we take the maximum theroretical offset and add this to the max value.

@@ -304,14 +305,9 @@ impl FastFieldCodec for BlockwiseLinearCodec {
             .checked_add(theorethical_maximum_offset)
             .is_none()
         {
-            return false;
+            return None;
         }
-        true
-    }
-    /// estimation for linear interpolation is hard because, you don't know
-    /// where the local maxima are for the deviation of the calculated value and
-    /// the offset is also unknown.
-    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
+
         let first_val_in_first_block = fastfield_accessor.get_val(0);
         let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
         let last_val_in_first_block =

@@ -350,7 +346,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
             // function metadata per block
             + 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
         let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
-        num_bits as f32 / num_bits_uncompressed as f32
+        Some(num_bits as f32 / num_bits_uncompressed as f32)
     }
 }

@@ -365,10 +361,10 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::tests::get_codec_test_data_sets;
+    use crate::tests::get_codec_test_datasets;

-    fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
-        crate::tests::create_and_validate::<BlockwiseLinearCodec, BlockwiseLinearReader>(data, name)
+    fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
+        crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name)
     }

     const HIGHEST_BIT: u64 = 1 << 63;

@@ -382,7 +378,7 @@ mod tests {
             .map(i64_to_u64)
             .collect::<Vec<_>>();
         let (estimate, actual_compression) =
-            create_and_validate(&data, "simple monotonically large i64");
+            create_and_validate(&data, "simple monotonically large i64").unwrap();
         assert!(actual_compression < 0.2);
         assert!(estimate < 0.20);
         assert!(estimate > 0.15);

@@ -393,7 +389,7 @@ mod tests {
     fn test_compression() {
         let data = (10..=6_000_u64).collect::<Vec<_>>();
         let (estimate, actual_compression) =
-            create_and_validate(&data, "simple monotonically large");
+            create_and_validate(&data, "simple monotonically large").unwrap();
         assert!(actual_compression < 0.2);
         assert!(estimate < 0.20);
         assert!(estimate > 0.15);

@@ -402,7 +398,7 @@ mod tests {

     #[test]
     fn test_with_codec_data_sets() {
-        let data_sets = get_codec_test_data_sets();
+        let data_sets = get_codec_test_datasets();
         for (mut data, name) in data_sets {
             create_and_validate(&data, name);
             data.reverse();
fastfield_codecs/src/column.rs (Normal file, 146 lines added)

@@ -0,0 +1,146 @@
+use std::marker::PhantomData;
+
+pub trait Column<T = u64> {
+    /// Return the value associated to the given idx.
+    ///
+    /// This accessor should return as fast as possible.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `idx` is greater than the column length.
+    fn get_val(&self, idx: u64) -> T;
+
+    /// Fills an output buffer with the fast field values
+    /// associated with the `DocId` going from
+    /// `start` to `start + output.len()`.
+    ///
+    /// Regardless of the type of `Item`, this method works
+    /// - transmuting the output array
+    /// - extracting the `Item`s as if they were `u64`
+    /// - possibly converting the `u64` value to the right type.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `start + output.len()` is greater than
+    /// the segment's `maxdoc`.
+    fn get_range(&self, start: u64, output: &mut [T]) {
+        for (out, idx) in output.iter_mut().zip(start..) {
+            *out = self.get_val(idx);
+        }
+    }
+
+    /// Returns the minimum value for this fast field.
+    ///
+    /// The min value does not take in account of possible
+    /// deleted document, and should be considered as a lower bound
+    /// of the actual minimum value.
+    fn min_value(&self) -> T;
+
+    /// Returns the maximum value for this fast field.
+    ///
+    /// The max value does not take in account of possible
+    /// deleted document, and should be considered as an upper bound
+    /// of the actual maximum value
+    fn max_value(&self) -> T;
+
+    fn num_vals(&self) -> u64;
+
+    /// Returns a iterator over the data
+    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
+        Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
+    }
+}
+
+struct VecColumn<'a>(&'a [u64]);
+impl<'a> Column for VecColumn<'a> {
+    fn get_val(&self, position: u64) -> u64 {
+        self.0[position as usize]
+    }
+
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
+        Box::new(self.0.iter().cloned())
+    }
+
+    fn min_value(&self) -> u64 {
+        self.0.iter().min().cloned().unwrap_or(0)
+    }
+
+    fn max_value(&self) -> u64 {
+        self.0.iter().max().cloned().unwrap_or(0)
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.0.len() as u64
+    }
+}
+
+impl<'a> From<&'a [u64]> for VecColumn<'a> {
+    fn from(data: &'a [u64]) -> Self {
+        Self(data)
+    }
+}
+
+struct MonotonicMappingColumn<C, T, Input> {
+    from_column: C,
+    monotonic_mapping: T,
+    _phantom: PhantomData<Input>,
+}
+
+/// Creates a view of a column transformed by a monotonic mapping.
+pub fn monotonic_map_column<C, T, Input, Output>(
+    from_column: C,
+    monotonic_mapping: T,
+) -> impl Column<Output>
+where
+    C: Column<Input>,
+    T: Fn(Input) -> Output,
+{
+    MonotonicMappingColumn {
+        from_column,
+        monotonic_mapping,
+        _phantom: PhantomData,
+    }
+}
+
+impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
+where
+    C: Column<Input>,
+    T: Fn(Input) -> Output,
+{
+    fn get_val(&self, idx: u64) -> Output {
+        let from_val = self.from_column.get_val(idx);
+        (self.monotonic_mapping)(from_val)
+    }
+
+    fn min_value(&self) -> Output {
+        let from_min_value = self.from_column.min_value();
+        (self.monotonic_mapping)(from_min_value)
+    }
+
+    fn max_value(&self) -> Output {
+        let from_max_value = self.from_column.max_value();
+        (self.monotonic_mapping)(from_max_value)
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.from_column.num_vals()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_monotonic_mapping() {
+        let vals = &[1u64, 3u64][..];
+        let col = VecColumn::from(vals);
+        let mapped = monotonic_map_column(col, |el| el + 4);
+        assert_eq!(mapped.min_value(), 5u64);
+        assert_eq!(mapped.max_value(), 7u64);
+        assert_eq!(mapped.num_vals(), 2);
+        assert_eq!(mapped.get_val(0), 5);
+        assert_eq!(mapped.get_val(1), 7);
+    }
+}
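The payoff of `monotonic_map_column` shows up in the GCD reader further down: the whole `GCDReader` wrapper struct reduces to one closure over a base column. A minimal sketch of that use, assuming a decoded base column and known `min_value` and `gcd` (the `gcd_view` helper is hypothetical):

```rust
use fastfield_codecs::{monotonic_map_column, Column};

// Stored values are (original - min_value) / gcd; the closure restores the
// original. Because the mapping is monotonic, min_value()/max_value() of the
// mapped column are simply the mapped bounds of the base column.
fn gcd_view(base: impl Column<u64>, min_value: u64, gcd: u64) -> impl Column<u64> {
    monotonic_map_column(base, move |val| min_value + gcd * val)
}
```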
@@ -12,16 +12,9 @@ pub mod bitpacked;
 pub mod blockwise_linear;
 pub mod linear;

-pub trait FastFieldDataAccess {
-    fn get_val(&self, doc: u64) -> u64;
-    fn min_value(&self) -> u64;
-    fn max_value(&self) -> u64;
-    fn num_vals(&self) -> u64;
-    /// Returns a iterator over the data
-    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = u64> + 'a> {
-        Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
-    }
-}
+mod column;
+
+pub use self::column::{monotonic_map_column, Column};

 #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
 #[repr(u8)]

@@ -63,12 +56,12 @@ impl FastFieldCodecType {

 /// The FastFieldSerializerEstimate trait is required on all variants
 /// of fast field compressions, to decide which one to choose.
-pub trait FastFieldCodec {
+pub trait FastFieldCodec: 'static {
     /// A codex needs to provide a unique name and id, which is
     /// used for debugging and de/serialization.
     const CODEC_TYPE: FastFieldCodecType;

-    type Reader: FastFieldDataAccess;
+    type Reader: Column<u64> + 'static;

     /// Reads the metadata and returns the CodecReader
     fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;

@@ -77,20 +70,16 @@ pub trait FastFieldCodec {
     ///
     /// The fastfield_accessor iterator should be preferred over using fastfield_accessor for
     /// performance reasons.
-    fn serialize(
-        write: &mut impl Write,
-        fastfield_accessor: &dyn FastFieldDataAccess,
-    ) -> io::Result<()>;
-
-    /// Check if the Codec is able to compress the data
-    fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool;
+    fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column<u64>) -> io::Result<()>;

     /// Returns an estimate of the compression ratio.
+    /// If the codec is not applicable, returns `None`.
     ///
     /// The baseline is uncompressed 64bit data.
     ///
     /// It could make sense to also return a value representing
     /// computational complexity.
-    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32;
+    fn estimate(fastfield_accessor: &impl Column) -> Option<f32>;
 }

 #[derive(Debug, Clone)]

@@ -101,48 +90,6 @@ pub struct FastFieldStats {
     pub num_vals: u64,
 }

-impl<'a> FastFieldDataAccess for &'a [u64] {
-    fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
-    }
-
-    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
-    }
-
-    fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
-    }
-
-    fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
-    }
-
-    fn num_vals(&self) -> u64 {
-        self.len() as u64
-    }
-}
-
-impl FastFieldDataAccess for Vec<u64> {
-    fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
-    }
-    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
-    }
-    fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
-    }
-
-    fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
-    }
-
-    fn num_vals(&self) -> u64 {
-        self.len() as u64
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use proptest::arbitrary::any;
@@ -152,28 +99,28 @@
     use crate::blockwise_linear::BlockwiseLinearCodec;
     use crate::linear::LinearCodec;

-    pub fn create_and_validate<Codec: FastFieldCodec>(data: &[u64], name: &str) -> (f32, f32) {
-        if !Codec::is_applicable(&data) {
-            return (f32::MAX, 0.0);
-        }
-        let estimation = Codec::estimate(&data);
+    pub fn create_and_validate<Codec: FastFieldCodec>(
+        data: &[u64],
+        name: &str,
+    ) -> Option<(f32, f32)> {
+        let estimation = Codec::estimate(&VecColumn::from(data))?;

         let mut out: Vec<u8> = Vec::new();
-        Codec::serialize(&mut out, &data).unwrap();
+        Codec::serialize(&mut out, &VecColumn::from(data)).unwrap();

         let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);

         let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
         assert_eq!(reader.num_vals(), data.len() as u64);
-        for (doc, orig_val) in data.iter().enumerate() {
+        for (doc, orig_val) in data.iter().copied().enumerate() {
             let val = reader.get_val(doc as u64);
-            if val != *orig_val {
-                panic!(
-                    "val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \
-                     {data:?}",
-                );
-            }
+            assert_eq!(
+                val, orig_val,
+                "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
+                 `{data:?}`",
+            );
         }
-        (estimation, actual_compression)
+        Some((estimation, actual_compression))
     }

     proptest! {
@@ -193,10 +140,10 @@

     }

-    pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
+    pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
         let mut data_and_names = vec![];

-        let data = (10..=20_u64).collect::<Vec<_>>();
+        let data = (10..=10_000_u64).collect::<Vec<_>>();
         data_and_names.push((data, "simple monotonically increasing"));

         data_and_names.push((
@@ -206,17 +153,23 @@
     data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
     data_and_names.push((vec![10], "single value"));

+    data_and_names.push((
+        vec![1572656989877777, 1170935903116329, 720575940379279, 0],
+        "overflow error",
+    ));
+
     data_and_names
 }

 fn test_codec<C: FastFieldCodec>() {
     let codec_name = format!("{:?}", C::CODEC_TYPE);
-    for (data, dataset_name) in get_codec_test_data_sets() {
-        let (estimate, actual) = crate::tests::create_and_validate::<C>(&data, dataset_name);
-        let result = if estimate == f32::MAX {
-            "Disabled".to_string()
-        } else {
+    for (data, dataset_name) in get_codec_test_datasets() {
+        let estimate_actual_opt: Option<(f32, f32)> =
+            crate::tests::create_and_validate::<C>(&data, dataset_name);
+        let result = if let Some((estimate, actual)) = estimate_actual_opt {
             format!("Estimate `{estimate}` Actual `{actual}`")
+        } else {
+            "Disabled".to_string()
         };
         println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
     }

@@ -239,38 +192,41 @@
     #[test]
     fn estimation_good_interpolation_case() {
         let data = (10..=20000_u64).collect::<Vec<_>>();
+        let data: VecColumn = data.as_slice().into();

-        let linear_interpol_estimation = LinearCodec::estimate(&data);
+        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.01);

-        let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data);
+        let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
         assert_le!(multi_linear_interpol_estimation, 0.2);
         assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);

-        let bitpacked_estimation = BitpackedCodec::estimate(&data);
+        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, bitpacked_estimation);
     }
     #[test]
     fn estimation_test_bad_interpolation_case() {
-        let data = vec![200, 10, 10, 10, 10, 1000, 20];
+        let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];

-        let linear_interpol_estimation = LinearCodec::estimate(&data);
+        let data: VecColumn = data.into();
+        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.32);

-        let bitpacked_estimation = BitpackedCodec::estimate(&data);
+        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
         assert_le!(bitpacked_estimation, linear_interpol_estimation);
     }
     #[test]
     fn estimation_test_bad_interpolation_case_monotonically_increasing() {
-        let mut data = (200..=20000_u64).collect::<Vec<_>>();
+        let mut data: Vec<u64> = (200..=20000_u64).collect();
         data.push(1_000_000);
+        let data: VecColumn = data.as_slice().into();

         // in this case the linear interpolation can't in fact not be worse than bitpacking,
         // but the estimator adds some threshold, which leads to estimated worse behavior
-        let linear_interpol_estimation = LinearCodec::estimate(&data);
+        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.35);

-        let bitpacked_estimation = BitpackedCodec::estimate(&data);
+        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
         assert_le!(bitpacked_estimation, 0.32);
         assert_le!(bitpacked_estimation, linear_interpol_estimation);
     }
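With the trait reshaped, the estimate/serialize/open cycle threads naturally through `Option`. A hypothetical round-trip helper mirroring `create_and_validate` above, under the same assumptions (in-crate access to the codec types):

```rust
use fastfield_codecs::{Column, FastFieldCodec};
use ownedbytes::OwnedBytes;

// None propagates a declined codec; I/O errors are folded into None here only
// to keep the sketch short.
fn roundtrip<C: FastFieldCodec>(column: &impl Column<u64>) -> Option<(f32, C::Reader)> {
    let estimate = C::estimate(column)?;
    let mut out: Vec<u8> = Vec::new();
    C::serialize(&mut out, column).ok()?;
    let reader = C::open_from_bytes(OwnedBytes::new(out)).ok()?;
    Some((estimate, reader))
}
```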
@@ -5,7 +5,7 @@ use common::{BinarySerializable, FixedSize};
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

-use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
+use crate::{Column, FastFieldCodec, FastFieldCodecType};

 /// Depending on the field type, a different
 /// fast field is required.

@@ -57,7 +57,7 @@ impl FixedSize for LinearFooter {
     const SIZE_IN_BYTES: usize = 56;
 }

-impl FastFieldDataAccess for LinearReader {
+impl Column for LinearReader {
     #[inline]
     fn get_val(&self, doc: u64) -> u64 {
         let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);

@@ -115,9 +115,9 @@ fn diff(val1: u64, val2: u64) -> f64 {
 #[inline]
 pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
     if slope < 0.0 {
-        first_val - (pos as f32 * -slope) as u64
+        first_val.saturating_sub((pos as f32 * -slope) as u64)
     } else {
-        first_val + (pos as f32 * slope) as u64
+        first_val.saturating_add((pos as f32 * slope) as u64)
     }
 }

@@ -143,10 +143,7 @@ impl FastFieldCodec for LinearCodec {
     }

     /// Creates a new fast field serializer.
-    fn serialize(
-        write: &mut impl Write,
-        fastfield_accessor: &dyn FastFieldDataAccess,
-    ) -> io::Result<()> {
+    fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
         assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());

         let first_val = fastfield_accessor.get_val(0);

@@ -192,10 +189,15 @@ impl FastFieldCodec for LinearCodec {
         footer.serialize(write)?;
         Ok(())
     }
-    fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
+
+    /// estimation for linear interpolation is hard because, you don't know
+    /// where the local maxima for the deviation of the calculated value are and
+    /// the offset to shift all values to >=0 is also unknown.
+    fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
         if fastfield_accessor.num_vals() < 3 {
-            return false; // disable compressor for this case
+            return None; // disable compressor for this case
         }

         // On serialisation the offset is added to the actual value.
         // We need to make sure this won't run into overflow calculation issues.
         // For this we take the maximum theroretical offset and add this to the max value.

@@ -207,14 +209,9 @@ impl FastFieldCodec for LinearCodec {
             .checked_add(theorethical_maximum_offset)
             .is_none()
         {
-            return false;
+            return None;
         }
-        true
-    }
-    /// estimation for linear interpolation is hard because, you don't know
-    /// where the local maxima for the deviation of the calculated value are and
-    /// the offset to shift all values to >=0 is also unknown.
-    fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
+
         let first_val = fastfield_accessor.get_val(0);
         let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
         let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());

@@ -246,7 +243,7 @@ impl FastFieldCodec for LinearCodec {
             * fastfield_accessor.num_vals()
             + LinearFooter::SIZE_IN_BYTES as u64;
         let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
-        num_bits as f32 / num_bits_uncompressed as f32
+        Some(num_bits as f32 / num_bits_uncompressed as f32)
     }
 }

@@ -262,10 +259,10 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::tests::get_codec_test_data_sets;
+    use crate::tests::get_codec_test_datasets;

-    fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
-        crate::tests::create_and_validate::<LinearCodec, LinearReader>(data, name)
+    fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
+        crate::tests::create_and_validate::<LinearCodec>(data, name)
     }

     #[test]

@@ -292,15 +289,15 @@ mod tests {
     fn test_compression() {
         let data = (10..=6_000_u64).collect::<Vec<_>>();
         let (estimate, actual_compression) =
-            create_and_validate(&data, "simple monotonically large");
+            create_and_validate(&data, "simple monotonically large").unwrap();

         assert!(actual_compression < 0.01);
         assert!(estimate < 0.01);
     }

     #[test]
-    fn test_with_codec_data_sets() {
-        let data_sets = get_codec_test_data_sets();
+    fn test_with_codec_datasets() {
+        let data_sets = get_codec_test_datasets();
         for (mut data, name) in data_sets {
             create_and_validate(&data, name);
             data.reverse();

@@ -317,6 +314,13 @@ mod tests {

         create_and_validate(&data, "large amplitude");
     }

+    #[test]
+    fn overflow_error_test() {
+        let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
+        create_and_validate(&data, "overflow test");
+    }
+
     #[test]
     fn linear_interpol_fast_concave_data() {
         let data = vec![0, 1, 2, 5, 8, 10, 20, 50];

@@ -337,9 +341,10 @@ mod tests {
     #[test]
     fn linear_interpol_fast_field_rand() {
         for _ in 0..5000 {
-            let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
+            let mut data = (0..10_000)
+                .map(|_| rand::random::<u64>())
+                .collect::<Vec<_>>();
             create_and_validate(&data, "random");

             data.reverse();
             create_and_validate(&data, "random");
         }
@@ -1,10 +1,35 @@
 #[macro_use]
 extern crate prettytable;
+use fastfield_codecs::bitpacked::BitpackedCodec;
 use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
 use fastfield_codecs::linear::LinearCodec;
-use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
+use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
 use prettytable::{Cell, Row, Table};

+struct Data<'a>(&'a [u64]);
+
+impl<'a> Column for Data<'a> {
+    fn get_val(&self, position: u64) -> u64 {
+        self.0[position as usize]
+    }
+
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
+        Box::new(self.0.iter().cloned())
+    }
+
+    fn min_value(&self) -> u64 {
+        *self.0.iter().min().unwrap_or(&0)
+    }
+
+    fn max_value(&self) -> u64 {
+        *self.0.iter().max().unwrap_or(&0)
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.0.len() as u64
+    }
+}
+
 fn main() {
     let mut table = Table::new();

@@ -12,37 +37,30 @@ fn main() {
     table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);

     for (data, data_set_name) in get_codec_test_data_sets() {
-        let mut results = vec![];
-        let res = serialize_with_codec::<LinearCodec>(&data);
-        results.push(res);
-        let res = serialize_with_codec::<BlockwiseLinearCodec>(&data);
-        results.push(res);
-        let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedCodec>(&data);
-        results.push(res);
-
-        // let best_estimation_codec = results
-        //.iter()
-        //.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
-        //.unwrap();
+        let results: Vec<(f32, f32, FastFieldCodecType)> = [
+            serialize_with_codec::<LinearCodec>(&data),
+            serialize_with_codec::<BlockwiseLinearCodec>(&data),
+            serialize_with_codec::<BitpackedCodec>(&data),
+        ]
+        .into_iter()
+        .flatten()
+        .collect();
         let best_compression_ratio_codec = results
             .iter()
-            .min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
+            .min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap())
             .cloned()
             .unwrap();

         table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
-        for (is_applicable, est, comp, codec_type) in results {
-            let (est_cell, ratio_cell) = if !is_applicable {
-                ("Codec Disabled".to_string(), "".to_string())
-            } else {
-                (est.to_string(), comp.to_string())
-            };
+        for (est, comp, codec_type) in results {
+            let est_cell = est.to_string();
+            let ratio_cell = comp.to_string();
             let style = if comp == best_compression_ratio_codec.1 {
                 "Fb"
             } else {
                 ""
             };

             table.add_row(Row::new(vec![
                 Cell::new(&format!("{codec_type:?}")).style_spec("bFg"),
                 Cell::new(&ratio_cell).style_spec(style),
@@ -91,17 +109,13 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {

 pub fn serialize_with_codec<C: FastFieldCodec>(
     data: &[u64],
-) -> (bool, f32, f32, FastFieldCodecType) {
-    let is_applicable = C::is_applicable(&data);
-    if !is_applicable {
-        return (false, 0.0, 0.0, C::CODEC_TYPE);
-    }
-    let estimation = C::estimate(&data);
-    let mut out = vec![];
+) -> Option<(f32, f32, FastFieldCodecType)> {
+    let data = Data(data);
+    let estimation = C::estimate(&data)?;
+    let mut out = Vec::new();
     C::serialize(&mut out, &data).unwrap();

-    let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
-    (true, estimation, actual_compression, C::CODEC_TYPE)
+    let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
+    Some((estimation, actual_compression, C::CODEC_TYPE))
 }

 pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
@@ -4,14 +4,14 @@ use std::rc::Rc;
 use std::sync::atomic::AtomicU32;
 use std::sync::Arc;

+use fastfield_codecs::Column;
+
 use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
 use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
 use super::metric::{AverageAggregation, StatsAggregation};
 use super::segment_agg_result::BucketCount;
 use super::VecWithNames;
-use crate::fastfield::{
-    type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
-};
+use crate::fastfield::{type_and_cardinality, FastType, MultiValuedFastFieldReader};
 use crate::schema::{Cardinality, Type};
 use crate::{InvertedIndexReader, SegmentReader, TantivyError};

@@ -37,10 +37,16 @@ impl AggregationsWithAccessor {
 #[derive(Clone)]
 pub(crate) enum FastFieldAccessor {
     Multi(MultiValuedFastFieldReader<u64>),
-    Single(DynamicFastFieldReader<u64>),
+    Single(Arc<dyn Column<u64>>),
 }
 impl FastFieldAccessor {
-    pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
+    pub fn as_single(&self) -> Option<&dyn Column<u64>> {
         match self {
             FastFieldAccessor::Multi(_) => None,
             FastFieldAccessor::Single(reader) => Some(&**reader),
         }
     }
+    pub fn into_single(self) -> Option<Arc<dyn Column<u64>>> {
+        match self {
+            FastFieldAccessor::Multi(_) => None,
+            FastFieldAccessor::Single(reader) => Some(reader),

@@ -118,7 +124,7 @@ impl BucketAggregationWithAccessor {
 pub struct MetricAggregationWithAccessor {
     pub metric: MetricAggregation,
     pub field_type: Type,
-    pub accessor: DynamicFastFieldReader<u64>,
+    pub accessor: Arc<dyn Column>,
 }

 impl MetricAggregationWithAccessor {

@@ -134,9 +140,8 @@ impl MetricAggregationWithAccessor {

                 Ok(MetricAggregationWithAccessor {
                     accessor: accessor
-                        .as_single()
-                        .expect("unexpected fast field cardinality")
-                        .clone(),
+                        .into_single()
+                        .expect("unexpected fast field cardinality"),
                     field_type,
                     metric: metric.clone(),
                 })
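`FastFieldAccessor` now distinguishes borrowing from owning: `as_single` hands out a `&dyn Column<u64>` for a transient read, while `into_single` moves the `Arc` out so a collector can keep it without cloning the underlying reader. A short sketch of the two call sites (hypothetical, same-crate context assumed):

```rust
use std::sync::Arc;

use fastfield_codecs::Column;

fn demo(accessor: FastFieldAccessor) {
    // Borrow: fine for a one-off read; the accessor stays usable afterwards.
    if let Some(column) = accessor.as_single() {
        let _first = column.get_val(0);
    }
    // Own: the Arc itself moves out, which is how the accessor is stored in
    // MetricAggregationWithAccessor above.
    let _owned: Option<Arc<dyn Column<u64>>> = accessor.into_single();
}
```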
@@ -1,6 +1,7 @@
 use std::cmp::Ordering;
 use std::fmt::Display;

+use fastfield_codecs::Column;
 use itertools::Itertools;
 use serde::{Deserialize, Serialize};

@@ -14,7 +15,6 @@ use crate::aggregation::intermediate_agg_result::{
     IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
 };
 use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
-use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
 use crate::schema::Type;
 use crate::{DocId, TantivyError};

@@ -263,7 +263,7 @@ impl SegmentHistogramCollector {
         req: &HistogramAggregation,
         sub_aggregation: &AggregationsWithAccessor,
         field_type: Type,
-        accessor: &DynamicFastFieldReader<u64>,
+        accessor: &dyn Column<u64>,
     ) -> crate::Result<Self> {
         req.validate()?;
         let min = f64_from_fastfield_u64(accessor.min_value(), &field_type);

@@ -331,10 +331,10 @@ impl SegmentHistogramCollector {
             .expect("unexpected fast field cardinatility");
         let mut iter = doc.chunks_exact(4);
         for docs in iter.by_ref() {
-            let val0 = self.f64_from_fastfield_u64(accessor.get(docs[0]));
-            let val1 = self.f64_from_fastfield_u64(accessor.get(docs[1]));
-            let val2 = self.f64_from_fastfield_u64(accessor.get(docs[2]));
-            let val3 = self.f64_from_fastfield_u64(accessor.get(docs[3]));
+            let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0] as u64));
+            let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1] as u64));
+            let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2] as u64));
+            let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3] as u64));

             let bucket_pos0 = get_bucket_num(val0);
             let bucket_pos1 = get_bucket_num(val1);

@@ -370,8 +370,8 @@ impl SegmentHistogramCollector {
                 &bucket_with_accessor.sub_aggregation,
             )?;
         }
-        for doc in iter.remainder() {
-            let val = f64_from_fastfield_u64(accessor.get(*doc), &self.field_type);
+        for &doc in iter.remainder() {
+            let val = f64_from_fastfield_u64(accessor.get_val(doc as u64), &self.field_type);
             if !bounds.contains(val) {
                 continue;
             }

@@ -382,7 +382,7 @@ impl SegmentHistogramCollector {
                 self.buckets[bucket_pos].key,
                 get_bucket_val(val, self.interval, self.offset) as f64
             );
-            self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?;
+            self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
         }
         if force_flush {
             if let Some(sub_aggregations) = self.sub_aggregations.as_mut() {
@@ -12,7 +12,6 @@ use crate::aggregation::intermediate_agg_result::{
 };
 use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
 use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey};
-use crate::fastfield::FastFieldReader;
 use crate::schema::Type;
 use crate::{DocId, TantivyError};

@@ -264,10 +263,10 @@ impl SegmentRangeCollector {
             .as_single()
             .expect("unexpected fast field cardinatility");
         for docs in iter.by_ref() {
-            let val1 = accessor.get(docs[0]);
-            let val2 = accessor.get(docs[1]);
-            let val3 = accessor.get(docs[2]);
-            let val4 = accessor.get(docs[3]);
+            let val1 = accessor.get_val(docs[0] as u64);
+            let val2 = accessor.get_val(docs[1] as u64);
+            let val3 = accessor.get_val(docs[2] as u64);
+            let val4 = accessor.get_val(docs[3] as u64);
             let bucket_pos1 = self.get_bucket_pos(val1);
             let bucket_pos2 = self.get_bucket_pos(val2);
             let bucket_pos3 = self.get_bucket_pos(val3);

@@ -278,10 +277,10 @@ impl SegmentRangeCollector {
             self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?;
             self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
         }
-        for doc in iter.remainder() {
-            let val = accessor.get(*doc);
+        for &doc in iter.remainder() {
+            let val = accessor.get_val(doc as u64);
             let bucket_pos = self.get_bucket_pos(val);
-            self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?;
+            self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
         }
         if force_flush {
             for bucket in &mut self.buckets {
@@ -1,9 +1,9 @@
 use std::fmt::Debug;

+use fastfield_codecs::Column;
 use serde::{Deserialize, Serialize};

 use crate::aggregation::f64_from_fastfield_u64;
-use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
 use crate::schema::Type;
 use crate::DocId;

@@ -57,13 +57,13 @@ impl SegmentAverageCollector {
             data: Default::default(),
         }
     }
-    pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
+    pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
         let mut iter = doc.chunks_exact(4);
         for docs in iter.by_ref() {
-            let val1 = field.get(docs[0]);
-            let val2 = field.get(docs[1]);
-            let val3 = field.get(docs[2]);
-            let val4 = field.get(docs[3]);
+            let val1 = field.get_val(docs[0] as u64);
+            let val2 = field.get_val(docs[1] as u64);
+            let val3 = field.get_val(docs[2] as u64);
+            let val4 = field.get_val(docs[3] as u64);
             let val1 = f64_from_fastfield_u64(val1, &self.field_type);
             let val2 = f64_from_fastfield_u64(val2, &self.field_type);
             let val3 = f64_from_fastfield_u64(val3, &self.field_type);

@@ -73,8 +73,8 @@ impl SegmentAverageCollector {
             self.data.collect(val3);
             self.data.collect(val4);
         }
-        for doc in iter.remainder() {
-            let val = field.get(*doc);
+        for &doc in iter.remainder() {
+            let val = field.get_val(doc as u64);
             let val = f64_from_fastfield_u64(val, &self.field_type);
             self.data.collect(val);
         }
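Both metric collectors read the fast field in manually unrolled blocks of four, then mop up the remainder; only the accessor call changed in this diff, not the loop shape. A generic sketch of the pattern (`sum_block` is a hypothetical stand-in for `collect_block`):

```rust
use fastfield_codecs::Column;

fn sum_block(docs: &[u32], field: &dyn Column<u64>) -> u64 {
    let mut sum = 0u64;
    let mut iter = docs.chunks_exact(4);
    for chunk in iter.by_ref() {
        // Four independent loads per iteration give the optimizer room.
        sum += field.get_val(chunk[0] as u64);
        sum += field.get_val(chunk[1] as u64);
        sum += field.get_val(chunk[2] as u64);
        sum += field.get_val(chunk[3] as u64);
    }
    // `chunks_exact` exposes the 0..=3 leftover docs via `remainder()`.
    for &doc in iter.remainder() {
        sum += field.get_val(doc as u64);
    }
    sum
}
```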
@@ -1,7 +1,7 @@
+use fastfield_codecs::Column;
 use serde::{Deserialize, Serialize};

 use crate::aggregation::f64_from_fastfield_u64;
-use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
 use crate::schema::Type;
 use crate::{DocId, TantivyError};

@@ -163,13 +163,13 @@ impl SegmentStatsCollector {
             stats: IntermediateStats::default(),
         }
     }
-    pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
+    pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
        let mut iter = doc.chunks_exact(4);
         for docs in iter.by_ref() {
-            let val1 = field.get(docs[0]);
-            let val2 = field.get(docs[1]);
-            let val3 = field.get(docs[2]);
-            let val4 = field.get(docs[3]);
+            let val1 = field.get_val(docs[0] as u64);
+            let val2 = field.get_val(docs[1] as u64);
+            let val3 = field.get_val(docs[2] as u64);
+            let val4 = field.get_val(docs[3] as u64);
             let val1 = f64_from_fastfield_u64(val1, &self.field_type);
             let val2 = f64_from_fastfield_u64(val2, &self.field_type);
             let val3 = f64_from_fastfield_u64(val3, &self.field_type);

@@ -179,8 +179,8 @@ impl SegmentStatsCollector {
             self.stats.collect(val3);
             self.stats.collect(val4);
         }
-        for doc in iter.remainder() {
-            let val = field.get(*doc);
+        for &doc in iter.remainder() {
+            let val = field.get_val(doc as u64);
             let val = f64_from_fastfield_u64(val, &self.field_type);
             self.stats.collect(val);
         }
@@ -185,10 +185,10 @@ impl SegmentMetricResultCollector {
     pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) {
         match self {
             SegmentMetricResultCollector::Average(avg_collector) => {
-                avg_collector.collect_block(doc, &metric.accessor);
+                avg_collector.collect_block(doc, &*metric.accessor);
             }
             SegmentMetricResultCollector::Stats(stats_collector) => {
-                stats_collector.collect_block(doc, &metric.accessor);
+                stats_collector.collect_block(doc, &*metric.accessor);
             }
         }
     }
@@ -10,9 +10,12 @@
 // ---
 // Importing tantivy...
 use std::marker::PhantomData;
+use std::sync::Arc;

+use fastfield_codecs::Column;
+
 use crate::collector::{Collector, SegmentCollector};
-use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
+use crate::fastfield::FastValue;
 use crate::schema::Field;
 use crate::{Score, SegmentReader, TantivyError};

@@ -158,7 +161,7 @@ where
     TPredicate: 'static,
     TPredicateValue: FastValue,
 {
-    fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
+    fast_field_reader: Arc<dyn Column<TPredicateValue>>,
     segment_collector: TSegmentCollector,
     predicate: TPredicate,
     t_predicate_value: PhantomData<TPredicateValue>,

@@ -174,7 +177,7 @@ where
     type Fruit = TSegmentCollector::Fruit;

     fn collect(&mut self, doc: u32, score: Score) {
-        let value = self.fast_field_reader.get(doc);
+        let value = self.fast_field_reader.get_val(doc as u64);
         if (self.predicate)(value) {
             self.segment_collector.collect(doc, score)
         }
@@ -1,7 +1,10 @@
+use std::sync::Arc;
+
 use fastdivide::DividerU64;
+use fastfield_codecs::Column;

 use crate::collector::{Collector, SegmentCollector};
-use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
+use crate::fastfield::FastValue;
 use crate::schema::{Field, Type};
 use crate::{DocId, Score};

@@ -84,14 +87,14 @@ impl HistogramComputer {
 }
 pub struct SegmentHistogramCollector {
     histogram_computer: HistogramComputer,
-    ff_reader: DynamicFastFieldReader<u64>,
+    ff_reader: Arc<dyn Column<u64>>,
 }

 impl SegmentCollector for SegmentHistogramCollector {
     type Fruit = Vec<u64>;

     fn collect(&mut self, doc: DocId, _score: Score) {
-        let value = self.ff_reader.get(doc);
+        let value = self.ff_reader.get_val(doc as u64);
         self.histogram_computer.add_value(value);
     }

@@ -1,7 +1,11 @@
+use std::sync::Arc;
+
+use fastfield_codecs::Column;
+
 use super::*;
 use crate::collector::{Count, FilterCollector, TopDocs};
 use crate::core::SegmentReader;
-use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader, FastFieldReader};
+use crate::fastfield::BytesFastFieldReader;
 use crate::query::{AllQuery, QueryParser};
 use crate::schema::{Field, Schema, FAST, TEXT};
 use crate::time::format_description::well_known::Rfc3339;

@@ -156,7 +160,7 @@ pub struct FastFieldTestCollector {

 pub struct FastFieldSegmentCollector {
     vals: Vec<u64>,
-    reader: DynamicFastFieldReader<u64>,
+    reader: Arc<dyn Column<u64>>,
 }

 impl FastFieldTestCollector {

@@ -197,7 +201,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
     type Fruit = Vec<u64>;

     fn collect(&mut self, doc: DocId, _score: Score) {
-        let val = self.reader.get(doc);
+        let val = self.reader.get_val(doc as u64);
         self.vals.push(val);
     }

@@ -1,6 +1,9 @@
 use std::collections::BinaryHeap;
 use std::fmt;
 use std::marker::PhantomData;
+use std::sync::Arc;
+
+use fastfield_codecs::Column;

 use super::Collector;
 use crate::collector::custom_score_top_collector::CustomScoreTopCollector;

@@ -9,7 +12,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
 use crate::collector::{
     CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
 };
-use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
+use crate::fastfield::FastValue;
 use crate::query::Weight;
 use crate::schema::Field;
 use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};

@@ -129,12 +132,12 @@ impl fmt::Debug for TopDocs {
 }

 struct ScorerByFastFieldReader {
-    ff_reader: DynamicFastFieldReader<u64>,
+    ff_reader: Arc<dyn Column<u64>>,
 }

 impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
     fn score(&mut self, doc: DocId) -> u64 {
-        self.ff_reader.get(doc)
+        self.ff_reader.get_val(doc as u64)
     }
 }

@@ -407,7 +410,6 @@ impl TopDocs {
     /// # use tantivy::query::QueryParser;
     /// use tantivy::SegmentReader;
     /// use tantivy::collector::TopDocs;
-    /// use tantivy::fastfield::FastFieldReader;
     /// use tantivy::schema::Field;
     ///
     /// fn create_schema() -> Schema {

@@ -456,7 +458,7 @@ impl TopDocs {
     ///
     ///         // We can now define our actual scoring function
     ///         move |doc: DocId, original_score: Score| {
-    ///             let popularity: u64 = popularity_reader.get(doc);
+    ///             let popularity: u64 = popularity_reader.get_val(doc as u64);
     ///             // Well.. For the sake of the example we use a simple logarithm
     ///             // function.
     ///             let popularity_boost_score = ((2u64 + popularity) as Score).log2();

@@ -515,7 +517,6 @@ impl TopDocs {
     /// use tantivy::SegmentReader;
     /// use tantivy::collector::TopDocs;
     /// use tantivy::schema::Field;
-    /// use tantivy::fastfield::FastFieldReader;
     ///
     /// # fn create_schema() -> Schema {
     /// #     let mut schema_builder = Schema::builder();

@@ -567,8 +568,8 @@ impl TopDocs {
     ///
     ///         // We can now define our actual scoring function
     ///         move |doc: DocId| {
-    ///             let popularity: u64 = popularity_reader.get(doc);
-    ///             let boosted: u64 = boosted_reader.get(doc);
+    ///             let popularity: u64 = popularity_reader.get_val(doc as u64);
+    ///             let boosted: u64 = boosted_reader.get_val(doc as u64);
     ///             // Score do not have to be `f64` in tantivy.
     ///             // Here we return a couple to get lexicographical order
     ///             // for free.
@@ -16,7 +16,7 @@ use crate::directory::MmapDirectory;
 use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
 use crate::error::{DataCorruption, TantivyError};
 use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
-use crate::indexer::segment_updater::save_new_metas;
+use crate::indexer::segment_updater::save_metas;
 use crate::reader::{IndexReader, IndexReaderBuilder};
 use crate::schema::{Field, FieldType, Schema};
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};

@@ -47,6 +47,34 @@ fn load_metas(
         .map_err(From::from)
 }

+/// Save the index meta file.
+/// This operation is atomic :
+/// Either
+/// - it fails, in which case an error is returned,
+/// and the `meta.json` remains untouched,
+/// - it succeeds, and `meta.json` is written
+/// and flushed.
+///
+/// This method is not part of tantivy's public API
+fn save_new_metas(
+    schema: Schema,
+    index_settings: IndexSettings,
+    directory: &dyn Directory,
+) -> crate::Result<()> {
+    save_metas(
+        &IndexMeta {
+            index_settings,
+            segments: Vec::new(),
+            schema,
+            opstamp: 0u64,
+            payload: None,
+        },
+        directory,
+    )?;
+    directory.sync_directory()?;
+    Ok(())
+}
+
 /// IndexBuilder can be used to create an index.
 ///
 /// Use in conjunction with `SchemaBuilder`. Global index settings
@@ -1,5 +1,9 @@
+use std::sync::Arc;
+
+use fastfield_codecs::Column;
+
 use crate::directory::{FileSlice, OwnedBytes};
-use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, MultiValueLength};
+use crate::fastfield::MultiValueLength;
 use crate::DocId;

 /// Reader for byte array fast fields

@@ -14,13 +18,13 @@ use crate::DocId;
 /// and the start index for the next document, and keeping the bytes in between.
 #[derive(Clone)]
 pub struct BytesFastFieldReader {
-    idx_reader: DynamicFastFieldReader<u64>,
+    idx_reader: Arc<dyn Column<u64>>,
     values: OwnedBytes,
 }

 impl BytesFastFieldReader {
     pub(crate) fn open(
-        idx_reader: DynamicFastFieldReader<u64>,
+        idx_reader: Arc<dyn Column<u64>>,
         values_file: FileSlice,
     ) -> crate::Result<BytesFastFieldReader> {
         let values = values_file.read_bytes()?;

@@ -28,8 +32,9 @@ impl BytesFastFieldReader {
     }

     fn range(&self, doc: DocId) -> (usize, usize) {
-        let start = self.idx_reader.get(doc) as usize;
-        let stop = self.idx_reader.get(doc + 1) as usize;
+        let idx = doc as u64;
+        let start = self.idx_reader.get_val(idx) as usize;
+        let stop = self.idx_reader.get_val(idx + 1) as usize;
         (start, stop)
     }

@@ -3,20 +3,11 @@ use std::num::NonZeroU64;

 use common::BinarySerializable;
 use fastdivide::DividerU64;
-use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess};
+use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec};
 use ownedbytes::OwnedBytes;

 pub const GCD_DEFAULT: u64 = 1;

-/// Wrapper for accessing a fastfield.
-///
-/// Holds the data and the codec to the read the data.
-#[derive(Clone)]
-pub struct GCDReader<CodecReader: FastFieldDataAccess> {
-    gcd_params: GCDParams,
-    reader: CodecReader,
-}
-
 #[derive(Debug, Clone, Copy)]
 struct GCDParams {
     gcd: u64,

@@ -24,12 +15,6 @@ struct GCDParams {
     num_vals: u64,
 }

-impl GCDParams {
-    pub fn eval(&self, val: u64) -> u64 {
-        self.min_value + self.gcd * val
-    }
-}
-
 impl BinarySerializable for GCDParams {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
         self.gcd.serialize(writer)?;

@@ -52,31 +37,13 @@ impl BinarySerializable for GCDParams {

 pub fn open_gcd_from_bytes<WrappedCodec: FastFieldCodec>(
     bytes: OwnedBytes,
-) -> io::Result<GCDReader<WrappedCodec::Reader>> {
+) -> io::Result<impl Column> {
     let footer_offset = bytes.len() - 24;
     let (body, mut footer) = bytes.split(footer_offset);
     let gcd_params = GCDParams::deserialize(&mut footer)?;
+    let gcd_remap = move |val: u64| gcd_params.min_value + gcd_params.gcd * val;
     let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?;
-    Ok(GCDReader { gcd_params, reader })
-}
-
-impl<C: FastFieldDataAccess + Clone> FastFieldDataAccess for GCDReader<C> {
-    #[inline]
-    fn get_val(&self, doc: u64) -> u64 {
-        let val = self.reader.get_val(doc);
-        self.gcd_params.eval(val)
-    }
-
-    fn min_value(&self) -> u64 {
-        self.gcd_params.eval(self.reader.min_value())
-    }
-
-    fn max_value(&self) -> u64 {
-        self.gcd_params.eval(self.reader.max_value())
-    }
-    fn num_vals(&self) -> u64 {
-        self.gcd_params.num_vals
-    }
+    Ok(monotonic_map_column(reader, gcd_remap))
 }

 pub fn write_gcd_header<W: Write>(
@@ -134,17 +101,19 @@ mod tests {
use std::collections::HashMap;
use std::num::NonZeroU64;
use std::path::Path;
use std::sync::Arc;
use std::time::{Duration, SystemTime};

use common::HasLen;
use fastfield_codecs::Column;

use crate::directory::{CompositeFile, RamDirectory, WritePtr};
use crate::fastfield::gcd::compute_gcd;
use crate::fastfield::reader::open_fast_field;
use crate::fastfield::serializer::FastFieldCodecEnableCheck;
use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64};
use crate::fastfield::tests::{encode_decode_fast_field, FIELD, FIELDI64, SCHEMA, SCHEMAI64};
use crate::fastfield::{
find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecType,
FastFieldReader, FastFieldsWriter, ALL_CODECS,
find_gcd, CompositeFastFieldSerializer, FastFieldCodecType, FastFieldsWriter, ALL_CODECS,
};
use crate::schema::{Cardinality, Schema};
use crate::{DateOptions, DatePrecision, DateTime, Directory};
@@ -186,11 +155,10 @@ mod tests {
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?;

assert_eq!(fast_field_reader.get(0), -4000i64);
assert_eq!(fast_field_reader.get(1), -3000i64);
assert_eq!(fast_field_reader.get(2), -2000i64);
let fast_field_reader: Arc<dyn Column<i64>> = open_fast_field(file.read_bytes()?)?;
assert_eq!(fast_field_reader.get_val(0), -4000i64);
assert_eq!(fast_field_reader.get_val(1), -3000i64);
assert_eq!(fast_field_reader.get_val(2), -2000i64);
assert_eq!(fast_field_reader.max_value(), (num_vals as i64 - 5) * 1000);
assert_eq!(fast_field_reader.min_value(), -4000i64);
let file = directory.open_read(path).unwrap();
@@ -209,7 +177,7 @@ mod tests {
#[test]
fn test_fastfield_gcd_i64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_i64_with_codec(code_type, 5005)?;
test_fastfield_gcd_i64_with_codec(code_type, 5500)?;
}
Ok(())
}
@@ -228,10 +196,10 @@ mod tests {
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get(0), 1000u64);
assert_eq!(fast_field_reader.get(1), 2000u64);
assert_eq!(fast_field_reader.get(2), 3000u64);
let fast_field_reader = open_fast_field::<u64>(file.read_bytes()?)?;
assert_eq!(fast_field_reader.get_val(0), 1000u64);
assert_eq!(fast_field_reader.get_val(1), 2000u64);
assert_eq!(fast_field_reader.get_val(2), 3000u64);
assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000);
assert_eq!(fast_field_reader.min_value(), 1000u64);
let file = directory.open_read(path).unwrap();
@@ -250,17 +218,17 @@ mod tests {
#[test]
fn test_fastfield_gcd_u64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_u64_with_codec(code_type, 5005)?;
test_fastfield_gcd_u64_with_codec(code_type, 5500)?;
}
Ok(())
}

#[test]
pub fn test_fastfield2() {
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]);
assert_eq!(test_fastfield.get_val(0), 100);
assert_eq!(test_fastfield.get_val(1), 200);
assert_eq!(test_fastfield.get_val(2), 300);
}

#[test]
@@ -323,11 +291,11 @@ mod tests {
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let len = file.len();
let test_fastfield = DynamicFastFieldReader::<DateTime>::open(file)?;
let test_fastfield = open_fast_field::<DateTime>(file.read_bytes()?)?;

assert_eq!(test_fastfield.get(0), time1.truncate(precision));
assert_eq!(test_fastfield.get(1), time2.truncate(precision));
assert_eq!(test_fastfield.get(2), time3.truncate(precision));
assert_eq!(test_fastfield.get_val(0), time1.truncate(precision));
assert_eq!(test_fastfield.get_val(1), time2.truncate(precision));
assert_eq!(test_fastfield.get_val(2), time3.truncate(precision));
Ok(len)
}


@@ -26,12 +26,11 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT};
pub(crate) use self::gcd::{find_gcd, GCD_DEFAULT};
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::{DynamicFastFieldReader, FastFieldReader};
pub use self::readers::FastFieldReaders;
pub(crate) use self::readers::{type_and_cardinality, FastType};
pub use self::serializer::{CompositeFastFieldSerializer, FastFieldDataAccess, FastFieldStats};
pub use self::serializer::{Column, CompositeFastFieldSerializer, FastFieldStats};
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::{Cardinality, FieldType, Type, Value};
use crate::{DateTime, DocId};
@@ -266,6 +265,7 @@ mod tests {
use std::collections::HashMap;
use std::ops::Range;
use std::path::Path;
use std::sync::Arc;

use common::HasLen;
use once_cell::sync::Lazy;
@@ -275,6 +275,7 @@ mod tests {

use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fastfield::reader::open_fast_field;
use crate::merge_policy::NoMergePolicy;
use crate::schema::{Document, Field, Schema, FAST, STRING, TEXT};
use crate::time::OffsetDateTime;
@@ -295,12 +296,54 @@ mod tests {
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
pub static FIELDI64: Lazy<Field> = Lazy::new(|| SCHEMAI64.get_field("field").unwrap());

/// Encodes values using the most appropriate codec and then loads the result
/// right away.
///
/// This is useful in tests and benchmarks.
pub(crate) fn encode_decode_fast_field<Item: FastValue>(
vals: &[Item],
) -> Arc<dyn Column<Item>> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("__dummy__");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer_mut(field)
.expect("With a RamDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(path).expect("Failed to open the file");
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
let field_bytes = composite_file
.open_read(field)
.expect("File component not found")
.read_bytes()
.unwrap();
open_fast_field(field_bytes).unwrap()
}

#[test]
pub fn test_fastfield() {
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]);
assert_eq!(test_fastfield.get_val(0u64), 100);
assert_eq!(test_fastfield.get_val(1u64), 200);
assert_eq!(test_fastfield.get_val(2u64), 300);
}

#[test]
@@ -328,11 +371,11 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 45);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
let fast_field_bytes = composite_file.open_read(*FIELD).unwrap().read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(fast_field_bytes)?;
assert_eq!(fast_field_reader.get_val(0), 13u64);
assert_eq!(fast_field_reader.get_val(1), 14u64);
assert_eq!(fast_field_reader.get_val(2), 2u64);
Ok(())
}

@@ -360,17 +403,20 @@ mod tests {
assert_eq!(file.len(), 70);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
assert_eq!(fast_field_reader.get(3), 9002u64);
assert_eq!(fast_field_reader.get(4), 15_001u64);
assert_eq!(fast_field_reader.get(5), 777u64);
assert_eq!(fast_field_reader.get(6), 1_002u64);
assert_eq!(fast_field_reader.get(7), 1_501u64);
assert_eq!(fast_field_reader.get(8), 215u64);
let data = fast_fields_composite
.open_read(*FIELD)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
assert_eq!(fast_field_reader.get_val(0), 4u64);
assert_eq!(fast_field_reader.get_val(1), 14_082_001u64);
assert_eq!(fast_field_reader.get_val(2), 3_052u64);
assert_eq!(fast_field_reader.get_val(3), 9002u64);
assert_eq!(fast_field_reader.get_val(4), 15_001u64);
assert_eq!(fast_field_reader.get_val(5), 777u64);
assert_eq!(fast_field_reader.get_val(6), 1_002u64);
assert_eq!(fast_field_reader.get_val(7), 1_501u64);
assert_eq!(fast_field_reader.get_val(8), 215u64);
}
Ok(())
}
@@ -396,10 +442,13 @@ mod tests {
assert_eq!(file.len(), 43);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
let data = fast_fields_composite
.open_read(*FIELD)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u64);
assert_eq!(fast_field_reader.get_val(doc), 100_000u64);
}
}
Ok(())
@@ -428,12 +477,15 @@ mod tests {
assert_eq!(file.len(), 80051);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 0u64);
let data = fast_fields_composite
.open_read(*FIELD)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
assert_eq!(fast_field_reader.get_val(0), 0u64);
for doc in 1..10_001 {
assert_eq!(
fast_field_reader.get(doc),
fast_field_reader.get_val(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
);
}
@@ -469,13 +521,16 @@ mod tests {
assert_eq!(file.len(), 75_usize); // linear interpol size after calc improvement
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
let data = fast_fields_composite
.open_read(i64_field)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<i64>(data)?;

assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get(doc as u32), i);
assert_eq!(fast_field_reader.get_val(doc as u64), i);
}
let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]);
@@ -509,9 +564,12 @@ mod tests {
let file = directory.open_read(path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
assert_eq!(fast_field_reader.get(0u32), 0i64);
let data = fast_fields_composite
.open_read(i64_field)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<i64>(data)?;
assert_eq!(fast_field_reader.get_val(0), 0i64);
}
Ok(())
}
@@ -547,11 +605,14 @@ mod tests {
let file = directory.open_read(path)?;
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
let data = fast_fields_composite
.open_read(*FIELD)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;

for a in 0..n {
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]);
}
}
Ok(())
@@ -607,7 +668,7 @@ mod tests {
let mut all = vec![];

for doc in docs {
let mut out = vec![];
let mut out: Vec<u64> = vec![];
ff.get_vals(doc, &mut out);
all.extend(out);
}
@@ -842,19 +903,19 @@ mod tests {
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
let mut dates = vec![];
{
assert_eq!(date_fast_field.get(0u32).into_timestamp_micros(), 1i64);
assert_eq!(date_fast_field.get_val(0).into_timestamp_micros(), 1i64);
dates_fast_field.get_vals(0u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 2i64);
assert_eq!(dates[1].into_timestamp_micros(), 3i64);
}
{
assert_eq!(date_fast_field.get(1u32).into_timestamp_micros(), 4i64);
assert_eq!(date_fast_field.get_val(1).into_timestamp_micros(), 4i64);
dates_fast_field.get_vals(1u32, &mut dates);
assert!(dates.is_empty());
}
{
assert_eq!(date_fast_field.get(2u32).into_timestamp_micros(), 0i64);
assert_eq!(date_fast_field.get_val(2).into_timestamp_micros(), 0i64);
dates_fast_field.get_vals(2u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 5i64);
@@ -865,11 +926,11 @@ mod tests {

#[test]
pub fn test_fastfield_bool() {
let test_fastfield = DynamicFastFieldReader::<bool>::from(vec![true, false, true, false]);
assert_eq!(test_fastfield.get(0), true);
assert_eq!(test_fastfield.get(1), false);
assert_eq!(test_fastfield.get(2), true);
assert_eq!(test_fastfield.get(3), false);
let test_fastfield = encode_decode_fast_field::<bool>(&[true, false, true, false]);
assert_eq!(test_fastfield.get_val(0), true);
assert_eq!(test_fastfield.get_val(1), false);
assert_eq!(test_fastfield.get_val(2), true);
assert_eq!(test_fastfield.get_val(3), false);
}

#[test]
@@ -898,12 +959,12 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 44);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
assert_eq!(fast_field_reader.get(0), true);
assert_eq!(fast_field_reader.get(1), false);
assert_eq!(fast_field_reader.get(2), true);
assert_eq!(fast_field_reader.get(3), false);
let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = open_fast_field::<bool>(data)?;
assert_eq!(fast_field_reader.get_val(0), true);
assert_eq!(fast_field_reader.get_val(1), false);
assert_eq!(fast_field_reader.get_val(2), true);
assert_eq!(fast_field_reader.get_val(3), false);

Ok(())
}
@@ -934,11 +995,11 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 56);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = open_fast_field::<bool>(data)?;
for i in 0..25 {
assert_eq!(fast_field_reader.get(i * 2), true);
assert_eq!(fast_field_reader.get(i * 2 + 1), false);
assert_eq!(fast_field_reader.get_val(i * 2), true);
assert_eq!(fast_field_reader.get_val(i * 2 + 1), false);
}

Ok(())
@@ -968,9 +1029,9 @@ mod tests {
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 43);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
assert_eq!(fast_field_reader.get(0), false);
let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = open_fast_field::<bool>(data)?;
assert_eq!(fast_field_reader.get_val(0), false);

Ok(())
}
@@ -978,32 +1039,17 @@ mod tests {

#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use test::{self, Bencher};
|
||||
|
||||
use super::tests::{generate_permutation, FIELD, SCHEMA};
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::tests::generate_permutation_gcd;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::tests::{
|
||||
encode_decode_fast_field, generate_permutation, generate_permutation_gcd,
|
||||
};
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in (0u32..n / 7).map(|v| v * 7) {
|
||||
a ^= permutation[i as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_veclookup(b: &mut Bencher) {
|
||||
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
@@ -1016,102 +1062,81 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
|
||||
let path = Path::new("test");
|
||||
fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = column.get_val(a as u64);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in (0u32..n / 7).map(|val| val * 7) {
|
||||
a ^= fast_field_reader.get(i);
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in (0..n / 7).map(|val| val * 7) {
|
||||
a += column.get_val(i as u64);
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_vec(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000);
|
||||
let mut a = 0u64;
|
||||
for i in (0..n / 7).map(|val| val * 7) {
|
||||
a += permutation[i];
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_fflookup(b: &mut Bencher) {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
|
||||
b.iter(|| {
|
||||
let mut a = 0u64;
|
||||
for i in 0u64..permutation.len() as u64 {
|
||||
a = column.get_val(i);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let mut a = 0u32;
|
||||
for i in 0u32..permutation.len() as u32 {
|
||||
a = fast_field_reader.get(i) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation_gcd();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
|
||||
b.iter(|| {
|
||||
let mut a = 0u64;
|
||||
for i in 0..permutation.len() as u64 {
|
||||
a += column.get_val(i);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
b.iter(|| {
|
||||
let mut a = 0u32;
|
||||
for i in 0u32..permutation.len() as u32 {
|
||||
a = fast_field_reader.get(i) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_intfastfield_vec(b: &mut Bencher) {
|
||||
let permutation = generate_permutation_gcd();
|
||||
b.iter(|| {
|
||||
let mut a = 0u64;
|
||||
for i in 0..permutation.len() {
|
||||
a += permutation[i as usize] as u64;
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -346,6 +346,7 @@ mod tests {
|
||||
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_proptest_gcd() {
|
||||
use IndexingOp::*;
|
||||
|
||||
@@ -1,6 +1,9 @@
use std::ops::Range;
use std::sync::Arc;

use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use fastfield_codecs::Column;

use crate::fastfield::{FastValue, MultiValueLength};
use crate::DocId;

/// Reader for a multivalued `u64` fast field.
@@ -12,14 +15,14 @@ use crate::DocId;
/// The `idx_reader` associates, for each document, the index of its first value.
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: DynamicFastFieldReader<Item>,
idx_reader: Arc<dyn Column<u64>>,
vals_reader: Arc<dyn Column<Item>>,
}

impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: DynamicFastFieldReader<Item>,
idx_reader: Arc<dyn Column<u64>>,
vals_reader: Arc<dyn Column<Item>>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
@@ -31,8 +34,9 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// to the given document are `start..end`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let end = self.idx_reader.get(doc + 1);
let idx = doc as u64;
let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
start..end
}


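Given `range`, reading all of a document's values is a contiguous read from the values column. A hedged sketch of the lookup that `get_vals` performs on top of it (the shipped implementation fills the buffer with a bulk `get_range` read rather than a per-value loop, and the free function below is illustrative):

    fn get_vals_sketch<Item: FastValue>(
        idx_reader: &dyn Column<u64>,
        vals_reader: &dyn Column<Item>,
        doc: DocId,
        vals: &mut Vec<Item>,
    ) {
        vals.clear();
        let idx = doc as u64;
        // The index column yields [start, end): the positions of this
        // document's values inside the flat values column.
        for position in idx_reader.get_val(idx)..idx_reader.get_val(idx + 1) {
            vals.push(vals_reader.get_val(position));
        }
    }
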
@@ -1,175 +1,65 @@
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
use std::sync::Arc;

use common::BinarySerializable;
use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader};
use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader};
use fastfield_codecs::linear::{LinearCodec, LinearReader};
use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType};

use super::gcd::open_gcd_from_bytes;
use super::FastValue;
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
use crate::directory::OwnedBytes;
use crate::error::DataCorruption;
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader};
use crate::schema::{Schema, FAST};
use crate::DocId;

/// FastFieldReader is the trait to access fast field data.
pub trait FastFieldReader<Item: FastValue>: Clone {
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment `maxdoc`.
fn get(&self, doc: DocId) -> Item;

/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: u64, output: &mut [Item]);

/// Returns the minimum value for this fast field.
///
/// The min value does not take into account possible
/// deleted documents, and should be considered as a lower bound
/// of the actual minimum value.
fn min_value(&self) -> Item;

/// Returns the maximum value for this fast field.
///
/// The max value does not take into account possible
/// deleted documents, and should be considered as an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item;
fn open_codec_from_bytes<C: FastFieldCodec, Item: FastValue>(
bytes: OwnedBytes,
) -> crate::Result<Arc<dyn Column<Item>>> {
let reader = C::open_from_bytes(bytes)?;
Ok(Arc::new(monotonic_map_column(reader, Item::from_u64)))
}

#[derive(Clone)]
/// DynamicFastFieldReader wraps different readers to access
/// the various encoded fastfield data
pub enum DynamicFastFieldReader<Item: FastValue> {
/// Bitpacked compressed fastfield data.
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
/// Linear interpolated values + bitpacked
Linear(FastFieldReaderCodecWrapper<Item, LinearReader>),
/// Blockwise linear interpolated values + bitpacked
BlockwiseLinear(FastFieldReaderCodecWrapper<Item, BlockwiseLinearReader>),

/// GCD and Bitpacked compressed fastfield data.
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BitpackedReader>>),
/// GCD and Linear interpolated values + bitpacked
LinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<LinearReader>>),
/// GCD and Blockwise linear interpolated values + bitpacked
BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BlockwiseLinearReader>>),
fn open_codec_with_gcd<C: FastFieldCodec, Item: FastValue>(
bytes: OwnedBytes,
) -> crate::Result<Arc<dyn Column<Item>>> {
let reader = open_gcd_from_bytes::<C>(bytes)?;
Ok(Arc::new(monotonic_map_column(reader, Item::from_u64)))
}

impl<Item: FastValue> DynamicFastFieldReader<Item> {
/// Returns the correct reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open_from_id(
mut bytes: OwnedBytes,
codec_type: FastFieldCodecType,
) -> crate::Result<DynamicFastFieldReader<Item>> {
let reader = match codec_type {
FastFieldCodecType::Bitpacked => {
DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into())
}
FastFieldCodecType::Linear => {
DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into())
}
FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear(
BlockwiseLinearCodec::open_from_bytes(bytes)?.into(),
),
FastFieldCodecType::Gcd => {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
match codec_type {
FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD(
open_gcd_from_bytes::<BitpackedCodec>(bytes)?.into(),
),
FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD(
open_gcd_from_bytes::<LinearCodec>(bytes)?.into(),
),
FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinearGCD(
open_gcd_from_bytes::<BlockwiseLinearCodec>(bytes)?.into(),
)
}
FastFieldCodecType::Gcd => {
return Err(DataCorruption::comment_only(
"Gcd codec wrapped into another gcd codec. This combination is not \
allowed.",
)
.into())
}
/// Returns the correct reader for the codec of the data.
fn open_from_id<Item: FastValue>(
mut bytes: OwnedBytes,
codec_type: FastFieldCodecType,
) -> crate::Result<Arc<dyn Column<Item>>> {
match codec_type {
FastFieldCodecType::Bitpacked => open_codec_from_bytes::<BitpackedCodec, _>(bytes),
FastFieldCodecType::Linear => open_codec_from_bytes::<LinearCodec, _>(bytes),
FastFieldCodecType::BlockwiseLinear => {
open_codec_from_bytes::<BlockwiseLinearCodec, _>(bytes)
}
FastFieldCodecType::Gcd => {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
match codec_type {
FastFieldCodecType::Bitpacked => open_codec_with_gcd::<BitpackedCodec, _>(bytes),
FastFieldCodecType::Linear => open_codec_with_gcd::<LinearCodec, _>(bytes),
FastFieldCodecType::BlockwiseLinear => {
open_codec_with_gcd::<BlockwiseLinearCodec, _>(bytes)
}
FastFieldCodecType::Gcd => Err(DataCorruption::comment_only(
"Gcd codec wrapped into another gcd codec. This combination is not allowed.",
)
.into()),
}
};
Ok(reader)
}

/// Returns the correct reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
let mut bytes = file.read_bytes()?;
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
Self::open_from_id(bytes, codec_type)
}
}
}

impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
#[inline]
fn get(&self, doc: DocId) -> Item {
match self {
Self::Bitpacked(reader) => reader.get(doc),
Self::Linear(reader) => reader.get(doc),
Self::BlockwiseLinear(reader) => reader.get(doc),
Self::BitpackedGCD(reader) => reader.get(doc),
Self::LinearGCD(reader) => reader.get(doc),
Self::BlockwiseLinearGCD(reader) => reader.get(doc),
}
}
#[inline]
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::Linear(reader) => reader.get_range(start, output),
Self::BlockwiseLinear(reader) => reader.get_range(start, output),
Self::BitpackedGCD(reader) => reader.get_range(start, output),
Self::LinearGCD(reader) => reader.get_range(start, output),
Self::BlockwiseLinearGCD(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.min_value(),
Self::Linear(reader) => reader.min_value(),
Self::BlockwiseLinear(reader) => reader.min_value(),
Self::BitpackedGCD(reader) => reader.min_value(),
Self::LinearGCD(reader) => reader.min_value(),
Self::BlockwiseLinearGCD(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.max_value(),
Self::Linear(reader) => reader.max_value(),
Self::BlockwiseLinear(reader) => reader.max_value(),
Self::BitpackedGCD(reader) => reader.max_value(),
Self::LinearGCD(reader) => reader.max_value(),
Self::BlockwiseLinearGCD(reader) => reader.max_value(),
}
}
/// Returns the correct reader for the data, as a `Column` trait object.
pub fn open_fast_field<Item: FastValue>(
mut bytes: OwnedBytes,
) -> crate::Result<Arc<dyn Column<Item>>> {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
open_from_id(bytes, codec_type)
}

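Call sites that used to go through `DynamicFastFieldReader::open` now read the bytes themselves and hand them to `open_fast_field`, which deserializes the codec id first and dispatches on it. A usage sketch, assuming `fast_field_slice` is the `FileSlice` holding one column:

    let bytes = fast_field_slice.read_bytes()?;
    // The codec id is deserialized first; the remainder is codec-specific payload.
    let column: Arc<dyn Column<u64>> = open_fast_field::<u64>(bytes)?;
    let first_value = column.get_val(0);
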
/// Wrapper for accessing a fastfield.
@@ -192,10 +82,10 @@ impl<Item: FastValue, CodecReader> From<CodecReader>
}
}

impl<Item: FastValue, D: FastFieldDataAccess> FastFieldReaderCodecWrapper<Item, D> {
impl<Item: FastValue, D: Column> FastFieldReaderCodecWrapper<Item, D> {
#[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item {
let data = self.reader.get_val(doc);
pub(crate) fn get_u64(&self, idx: u64) -> Item {
let data = self.reader.get_val(idx);
Item::from_u64(data)
}

@@ -218,9 +108,7 @@ impl<Item: FastValue, D: FastFieldDataAccess> FastFieldReaderCodecWrapper<Item,
}
}

impl<Item: FastValue, C: FastFieldDataAccess + Clone> FastFieldReader<Item>
for FastFieldReaderCodecWrapper<Item, C>
{
impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWrapper<Item, C> {
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
@@ -229,8 +117,8 @@ impl<Item: FastValue, C: FastFieldDataAccess + Clone> FastFieldReader<Item>
///
/// May panic if `doc` is greater than the segment
/// `maxdoc`.
fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
fn get_val(&self, idx: u64) -> Item {
self.get_u64(idx)
}

/// Fills an output buffer with the fast field values
@@ -267,41 +155,8 @@ impl<Item: FastValue, C: FastFieldDataAccess + Clone> FastFieldReader<Item>
fn max_value(&self) -> Item {
Item::from_u64(self.reader.max_value())
}
}

impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("__dummy__");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer_mut(field)
.expect("With a RamDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}

let file = directory.open_read(path).expect("Failed to open the file");
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
let field_file = composite_file
.open_read(field)
.expect("File component not found");
DynamicFastFieldReader::open(field_file).unwrap()
fn num_vals(&self) -> u64 {
self.reader.num_vals()
}
}

@@ -1,5 +1,9 @@
use super::reader::DynamicFastFieldReader;
use std::sync::Arc;

use fastfield_codecs::Column;

use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::reader::open_fast_field;
use crate::fastfield::{
BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader,
};
@@ -109,14 +113,16 @@ impl FastFieldReaders {
&self,
field: Field,
index: usize,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
let fast_field_slice = self.fast_field_data(field, index)?;
DynamicFastFieldReader::open(fast_field_slice)
let bytes = fast_field_slice.read_bytes()?;
open_fast_field(bytes)
}

pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
self.typed_fast_field_reader_with_idx(field, 0)
}

@@ -132,7 +138,7 @@ impl FastFieldReaders {
/// Returns the `u64` fast field reader associated to `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
pub fn u64(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -142,14 +148,14 @@ impl FastFieldReaders {
///
/// If not, the fastfield reader will return the u64 value associated to the original
/// FastValue.
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
pub fn u64_lenient(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
self.typed_fast_field_reader(field)
}

/// Returns the `i64` fast field reader associated to `field`.
///
/// If `field` is not an i64 fast field, this method returns an Error.
pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> {
pub fn i64(&self, field: Field) -> crate::Result<Arc<dyn Column<i64>>> {
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -157,7 +163,7 @@ impl FastFieldReaders {
/// Returns the `date` fast field reader associated to `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<DateTime>> {
pub fn date(&self, field: Field) -> crate::Result<Arc<dyn Column<DateTime>>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -165,7 +171,7 @@ impl FastFieldReaders {
/// Returns the `f64` fast field reader associated to `field`.
///
/// If `field` is not an f64 fast field, this method returns an Error.
pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> {
pub fn f64(&self, field: Field) -> crate::Result<Arc<dyn Column<f64>>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -173,7 +179,7 @@ impl FastFieldReaders {
/// Returns the `bool` fast field reader associated to `field`.
///
/// If `field` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field: Field) -> crate::Result<DynamicFastFieldReader<bool>> {
pub fn bool(&self, field: Field) -> crate::Result<Arc<dyn Column<bool>>> {
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -241,7 +247,8 @@ impl FastFieldReaders {
)));
}
let fast_field_idx_file = self.fast_field_data(field, 0)?;
let idx_reader = DynamicFastFieldReader::open(fast_field_idx_file)?;
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
let idx_reader = open_fast_field(fast_field_idx_bytes)?;
let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data)
} else {

@@ -6,8 +6,8 @@ use fastdivide::DividerU64;
pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy};
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::FastFieldCodecType;
pub use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess, FastFieldStats};
use fastfield_codecs::{monotonic_map_column, FastFieldCodecType};
pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats};

use super::{find_gcd, ALL_CODECS, GCD_DEFAULT};
use crate::directory::{CompositeWrite, WritePtr};
@@ -64,15 +64,13 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {

// use this once explicit_generic_args_with_impl_trait is merged and stabilized
// https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<C: FastFieldCodec, A: FastFieldDataAccess>(
fastfield_accessor: &A,
fn codec_estimation<C: FastFieldCodec, D: Column>(
fastfield_accessor: &D,
estimations: &mut Vec<(f32, FastFieldCodecType)>,
) {
if !C::is_applicable(fastfield_accessor) {
return;
if let Some(ratio) = C::estimate(fastfield_accessor) {
estimations.push((ratio, C::CODEC_TYPE));
}
let ratio = C::estimate(fastfield_accessor);
estimations.push((ratio, C::CODEC_TYPE));
}

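`estimate` returning `Option<f32>` folds the old `is_applicable` check into the estimation itself: a codec that cannot handle the data simply contributes no entry. The caller collects one estimate per codec and picks the lowest ratio; roughly, as a sketch and not a verbatim excerpt of the selection code that follows:

    let mut estimations: Vec<(f32, FastFieldCodecType)> = Vec::new();
    codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
    codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
    codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
    // Lowest estimated compression ratio wins.
    estimations.sort_by(|(left, _), (right, _)| left.partial_cmp(right).unwrap());
    let (_best_ratio, best_codec_type) = estimations[0];
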
impl CompositeFastFieldSerializer {
@@ -99,7 +97,7 @@ impl CompositeFastFieldSerializer {
pub fn create_auto_detect_u64_fast_field(
&mut self,
field: Field,
fastfield_accessor: impl FastFieldDataAccess,
fastfield_accessor: impl Column,
) -> io::Result<()> {
self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0)
}
@@ -119,7 +117,7 @@ impl CompositeFastFieldSerializer {
pub fn create_auto_detect_u64_fast_field_with_idx(
&mut self,
field: Field,
fastfield_accessor: impl FastFieldDataAccess,
fastfield_accessor: impl Column,
idx: usize,
) -> io::Result<()> {
let min_value = fastfield_accessor.min_value();
@@ -138,56 +136,22 @@ impl CompositeFastFieldSerializer {
}

Self::write_header(field_write, FastFieldCodecType::Gcd)?;
struct GCDWrappedFFAccess<T: FastFieldDataAccess> {
fastfield_accessor: T,
base_value: u64,
max_value: u64,
num_vals: u64,
gcd: DividerU64,
}

impl<T: FastFieldDataAccess> FastFieldDataAccess for GCDWrappedFFAccess<T> {
fn get_val(&self, position: u64) -> u64 {
self.gcd
.divide(self.fastfield_accessor.get_val(position) - self.base_value)
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(
self.fastfield_accessor
.iter()
.map(|val| self.gcd.divide(val - self.base_value)),
)
}
fn min_value(&self) -> u64 {
0
}

fn max_value(&self) -> u64 {
self.max_value
}

fn num_vals(&self) -> u64 {
self.num_vals
}
}

let num_vals = fastfield_accessor.num_vals();
let base_value = fastfield_accessor.min_value();
let max_value = (fastfield_accessor.max_value() - fastfield_accessor.min_value()) / gcd;

let fastfield_accessor = GCDWrappedFFAccess {
fastfield_accessor,
base_value,
max_value,
num_vals,
gcd: DividerU64::divide_by(gcd),
};
let gcd_divider = DividerU64::divide_by(gcd);

let divided_fastfield_accessor = monotonic_map_column(fastfield_accessor, |val: u64| {
gcd_divider.divide(val - base_value)
});

let num_vals = divided_fastfield_accessor.num_vals();

Self::create_auto_detect_u64_fast_field_with_idx_gcd(
self.codec_enable_checker.clone(),
field,
field_write,
fastfield_accessor,
divided_fastfield_accessor,
)?;
write_gcd_header(field_write, base_value, gcd, num_vals)?;
Ok(())
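The GCD path now serializes a plain column of `(val - base_value) / gcd` and records `base_value` and `gcd` in the header, which `open_gcd_from_bytes` undoes with `base_value + gcd * stored`. A quick sanity check of the roundtrip, assuming every value is `base_value` plus a multiple of `gcd` (which the GCD computation over `val - min_value` guarantees):

    fn gcd_roundtrip(val: u64, base_value: u64, gcd: u64) -> u64 {
        // What the serializer stores...
        let stored = (val - base_value) / gcd;
        // ...and what the reader's remap closure reconstructs.
        base_value + gcd * stored
    }

    assert_eq!(gcd_roundtrip(5_000, 1_000, 1_000), 5_000);
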
@@ -199,7 +163,7 @@ impl CompositeFastFieldSerializer {
codec_enable_checker: FastFieldCodecEnableCheck,
field: Field,
field_write: &mut CountingWriter<W>,
fastfield_accessor: impl FastFieldDataAccess,
fastfield_accessor: impl Column,
) -> io::Result<()> {
let mut estimations = vec![];


@@ -2,7 +2,7 @@ use std::collections::HashMap;
use std::io;

use common;
use fastfield_codecs::FastFieldDataAccess;
use fastfield_codecs::Column;
use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker;

@@ -384,7 +384,7 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker,
stats: FastFieldStats,
}
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated to the given doc.
///
/// Whenever possible, use the iterator passed to the fastfield creation instead, for performance reasons.

@@ -144,7 +144,6 @@ pub(crate) fn get_doc_id_mapping_from_field(
#[cfg(test)]
mod tests_indexsorting {
use crate::collector::TopDocs;
use crate::fastfield::FastFieldReader;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::query::QueryParser;
use crate::schema::{Schema, *};
@@ -464,9 +463,9 @@ mod tests_indexsorting {
let my_number = index.schema().get_field("my_number").unwrap();

let fast_field = fast_fields.u64(my_number).unwrap();
assert_eq!(fast_field.get(0u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
assert_eq!(fast_field.get(2u32), 30u64);
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);

let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
let multifield = fast_fields.u64s(multi_numbers).unwrap();

@@ -174,9 +174,7 @@ fn index_documents(
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
let schema = segment.schema();

let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
for document_group in grouped_document_iterator {
for doc in document_group {
segment_writer.add_document(doc)?;
@@ -785,7 +783,6 @@ mod tests {
use crate::collector::TopDocs;
use crate::directory::error::LockError;
use crate::error::*;
use crate::fastfield::FastFieldReader;
use crate::indexer::NoMergePolicy;
use crate::query::{QueryParser, TermQuery};
use crate::schema::{
@@ -1327,7 +1324,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get(doc))
.map(|doc| fast_field_reader.get_val(doc as u64))
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(())
@@ -1493,7 +1490,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get(doc))
.map(move |doc| ff_reader.get_val(doc as u64))
})
.collect();

@@ -1504,7 +1501,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get(doc))
.map(move |doc| ff_reader.get_val(doc as u64))
})
.collect();

@@ -1622,7 +1619,7 @@ mod tests {
facet_reader
.facet_from_ord(facet_ords[0], &mut facet)
.unwrap();
let id = ff_reader.get(doc_id);
let id = ff_reader.get_val(doc_id as u64);
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));

assert_eq!(facet, facet_expected);

@@ -4,14 +4,13 @@ use std::sync::Arc;

use itertools::Itertools;
use measure_time::debug_time;
use tantivy_bitpacker::minmax;

use crate::core::{Segment, SegmentReader};
use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{
AliveBitSet, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldDataAccess,
FastFieldReader, FastFieldStats, MultiValueLength, MultiValuedFastFieldReader,
AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldStats, MultiValueLength,
MultiValuedFastFieldReader,
};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
@@ -88,7 +87,7 @@ pub struct IndexMerger {
}

fn compute_min_max_val(
u64_reader: &impl FastFieldReader<u64>,
u64_reader: &dyn Column<u64>,
segment_reader: &SegmentReader,
) -> Option<(u64, u64)> {
if segment_reader.max_doc() == 0 {
@@ -102,11 +101,11 @@ fn compute_min_max_val(
}
// some deleted documents,
// we need to recompute the max / min
minmax(
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get(doc_id)),
)
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get_val(doc_id as u64))
.minmax()
.into_option()
}

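The `tantivy_bitpacker::minmax` helper is swapped for `Itertools::minmax`, whose `MinMaxResult` converts to `Option<(min, max)>` with `into_option`, returning `None` on an empty iterator. The same pattern in isolation:

    use itertools::Itertools;

    fn min_max(values: impl Iterator<Item = u64>) -> Option<(u64, u64)> {
        // `into_option` yields None for an empty iterator
        // and Some((v, v)) for a single element.
        values.minmax().into_option()
    }

    assert_eq!(min_max([3u64, 1, 7].into_iter()), Some((1, 7)));
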
struct TermOrdinalMapping {
|
||||
@@ -134,7 +133,7 @@ impl TermOrdinalMapping {
|
||||
fn max_term_ord(&self) -> TermOrdinal {
|
||||
self.per_segment_new_term_ordinals
|
||||
.iter()
|
||||
.flat_map(|term_ordinals| term_ordinals.iter().max())
|
||||
.flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
|
||||
.max()
|
||||
.unwrap_or_default()
|
||||
}
|
||||
@@ -342,12 +341,12 @@ impl IndexMerger {
            .readers
            .iter()
            .filter_map(|reader| {
                let u64_reader: DynamicFastFieldReader<u64> =
                let u64_reader: Arc<dyn Column<u64>> =
                    reader.fast_fields().typed_fast_field_reader(field).expect(
                        "Failed to find a reader for single fast field. This is a tantivy bug and \
                         it should never happen.",
                    );
                compute_min_max_val(&u64_reader, reader)
                compute_min_max_val(&*u64_reader, reader)
            })
            .reduce(|a, b| (a.0.min(b.0), a.1.max(b.1)))
            .expect("Unexpected error, empty readers in IndexMerger");

@@ -356,7 +355,7 @@ impl IndexMerger {
            .readers
            .iter()
            .map(|reader| {
                let u64_reader: DynamicFastFieldReader<u64> =
                let u64_reader: Arc<dyn Column<u64>> =
                    reader.fast_fields().typed_fast_field_reader(field).expect(
                        "Failed to find a reader for single fast field. This is a tantivy bug and \
                         it should never happen.",
@@ -373,16 +372,16 @@ impl IndexMerger {
    #[derive(Clone)]
    struct SortedDocIdFieldAccessProvider<'a> {
        doc_id_mapping: &'a SegmentDocIdMapping,
        fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
        fast_field_readers: &'a Vec<Arc<dyn Column<u64>>>,
        stats: FastFieldStats,
    }
    impl<'a> FastFieldDataAccess for SortedDocIdFieldAccessProvider<'a> {
    impl<'a> Column for SortedDocIdFieldAccessProvider<'a> {
        fn get_val(&self, doc: u64) -> u64 {
            let DocAddress {
                doc_id,
                segment_ord,
            } = self.doc_id_mapping.get_old_doc_addr(doc as u32);
            self.fast_field_readers[segment_ord as usize].get(doc_id)
            self.fast_field_readers[segment_ord as usize].get_val(doc_id as u64)
        }

        fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {

@@ -392,7 +391,7 @@ impl IndexMerger {
                .map(|old_doc_addr| {
                    let fast_field_reader =
                        &self.fast_field_readers[old_doc_addr.segment_ord as usize];
                    fast_field_reader.get(old_doc_addr.doc_id)
                    fast_field_reader.get_val(old_doc_addr.doc_id as u64)
                }),
        )
    }
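
`SortedDocIdFieldAccessProvider` is itself a `Column`: indexed by the new (post-sort) doc id, it resolves each lookup through the doc-id mapping into the per-segment columns. A freestanding sketch of that adapter pattern (the trait and both structs are illustrative stand-ins, not the tantivy internals):

    trait Column<T = u64> {
        fn get_val(&self, idx: u64) -> T;
    }

    struct VecColumn(Vec<u64>);
    impl Column for VecColumn {
        fn get_val(&self, idx: u64) -> u64 {
            self.0[idx as usize]
        }
    }

    struct RemappedColumn<'a> {
        // new_doc -> (segment_ord, old_doc), mirroring get_old_doc_addr()
        mapping: &'a [(usize, u64)],
        segments: &'a [VecColumn],
    }

    impl<'a> Column for RemappedColumn<'a> {
        fn get_val(&self, new_doc: u64) -> u64 {
            let (segment_ord, old_doc) = self.mapping[new_doc as usize];
            self.segments[segment_ord].get_val(old_doc)
        }
    }

    fn main() {
        let segments = [VecColumn(vec![7, 8]), VecColumn(vec![9])];
        let mapping = [(1, 0), (0, 1), (0, 0)]; // new doc order: 9, 8, 7
        let col = RemappedColumn { mapping: &mapping, segments: &segments };
        let merged: Vec<u64> = (0u64..3).map(|d| col.get_val(d)).collect();
        assert_eq!(merged, vec![9, 8, 7]);
    }
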
@@ -429,7 +428,7 @@ impl IndexMerger {

        let everything_is_in_order = reader_ordinal_and_field_accessors
            .into_iter()
            .map(|reader| reader.1)
            .map(|(_, col)| Arc::new(col))
            .tuple_windows()
            .all(|(field_accessor1, field_accessor2)| {
                if sort_by_field.order.is_asc() {

@@ -444,7 +443,7 @@ impl IndexMerger {
    pub(crate) fn get_sort_field_accessor(
        reader: &SegmentReader,
        sort_by_field: &IndexSortByField,
    ) -> crate::Result<impl FastFieldReader<u64>> {
    ) -> crate::Result<Arc<dyn Column>> {
        let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required
        let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
        Ok(value_accessor)

@@ -453,7 +452,7 @@ impl IndexMerger {
    pub(crate) fn get_reader_with_sort_field_accessor(
        &self,
        sort_by_field: &IndexSortByField,
    ) -> crate::Result<Vec<(SegmentOrdinal, impl FastFieldReader<u64> + Clone)>> {
    ) -> crate::Result<Vec<(SegmentOrdinal, Arc<dyn Column>)>> {
        let reader_ordinal_and_field_accessors = self
            .readers
            .iter()
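
These signature changes are the crux of the branch: returning `Arc<dyn Column>` instead of `impl FastFieldReader<u64> + Clone`. An `impl Trait` return pins every caller to one hidden concrete type, whereas a trait object lets the same function hand back a different codec per segment, and the `Arc` supplies the cheap `Clone` needed to collect accessors into a `Vec`. A sketch of the trade-off (all types illustrative):

    use std::sync::Arc;

    trait Column {
        fn get_val(&self, idx: u64) -> u64;
    }

    struct ConstColumn(u64);
    impl Column for ConstColumn {
        fn get_val(&self, _idx: u64) -> u64 {
            self.0
        }
    }

    struct VecColumn(Vec<u64>);
    impl Column for VecColumn {
        fn get_val(&self, idx: u64) -> u64 {
            self.0[idx as usize]
        }
    }

    // With `-> impl Column` both arms would have to be the same type;
    // the trait object erases the concrete codec behind the pointer.
    fn accessor(dense: bool) -> Arc<dyn Column> {
        if dense {
            Arc::new(VecColumn(vec![1, 2, 3]))
        } else {
            Arc::new(ConstColumn(42))
        }
    }

    fn main() {
        let cols: Vec<Arc<dyn Column>> = vec![accessor(true), accessor(false)];
        assert_eq!(cols[0].get_val(2), 3);
        assert_eq!(cols[1].get_val(0), 42);
    }
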
@@ -506,8 +505,8 @@ impl IndexMerger {
        doc_id_reader_pair
            .into_iter()
            .kmerge_by(|a, b| {
                let val1 = a.2.get(a.0);
                let val2 = b.2.get(b.0);
                let val1 = a.2.get_val(a.0 as u64);
                let val2 = b.2.get_val(b.0 as u64);
                if sort_by_field.order == Order::Asc {
                    val1 < val2
                } else {
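
The `kmerge_by` call above lazily merges the per-segment doc-id streams, each already sorted by its fast field value, into one globally sorted stream. A self-contained miniature with itertools (ascending order shown; the real code flips the comparison for `Order::Desc`):

    use itertools::Itertools;

    fn main() {
        // (doc_id, value) pairs, each segment pre-sorted by value
        let seg_a = vec![(0u32, 2u64), (1, 5)];
        let seg_b = vec![(0u32, 1u64), (1, 9)];
        let merged: Vec<u64> = vec![seg_a, seg_b]
            .into_iter()
            .kmerge_by(|a, b| a.1 < b.1) // comparator mirrors the diff
            .map(|(_doc, val)| val)
            .collect();
        assert_eq!(merged, vec![1, 2, 5, 9]);
    }
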
@@ -578,7 +577,7 @@ impl IndexMerger {
        offsets: &'a [u64],
        stats: FastFieldStats,
    }
    impl<'a> FastFieldDataAccess for FieldIndexAccessProvider<'a> {
    impl<'a> Column for FieldIndexAccessProvider<'a> {
        fn get_val(&self, doc: u64) -> u64 {
            self.offsets[doc as usize]
        }

@@ -619,7 +618,7 @@ impl IndexMerger {
            .map(|reader| {
                let u64s_reader: MultiValuedFastFieldReader<u64> = reader
                    .fast_fields()
                    .typed_fast_field_multi_reader(field)
                    .typed_fast_field_multi_reader::<u64>(field)
                    .expect(
                        "Failed to find index for multivalued field. This is a bug in tantivy, \
                         please report.",

@@ -669,7 +668,7 @@ impl IndexMerger {
    {
        let mut serialize_vals =
            fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
        let mut vals = Vec::with_capacity(100);
        let mut vals: Vec<u64> = Vec::with_capacity(100);

        for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
            let term_ordinal_mapping: &[TermOrdinal] =

@@ -743,7 +742,7 @@ impl IndexMerger {
        for reader in &self.readers {
            let ff_reader: MultiValuedFastFieldReader<u64> = reader
                .fast_fields()
                .typed_fast_field_multi_reader(field)
                .typed_fast_field_multi_reader::<u64>(field)
                .expect(
                    "Failed to find multivalued fast field reader. This is a bug in tantivy. \
                     Please report.",

@@ -778,14 +777,14 @@ impl IndexMerger {
        offsets: Vec<u64>,
        stats: FastFieldStats,
    }
    impl<'a> FastFieldDataAccess for SortedDocIdMultiValueAccessProvider<'a> {
    impl<'a> Column for SortedDocIdMultiValueAccessProvider<'a> {
        fn get_val(&self, pos: u64) -> u64 {
            // use the offsets index to find the doc_id which will contain the position.
            // the offsets are strictly increasing so we can do a simple search on it.
            let new_doc_id: DocId =
                self.offsets
                    .iter()
                    .position(|offset| offset > pos)
                    .position(|&offset| offset > pos)
                    .expect("pos is out of bounds") as DocId
                    - 1u32;
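
`SortedDocIdMultiValueAccessProvider::get_val` is indexed by position in the flattened multi-value stream, not by doc id: `offsets[d]` is the start of doc `d`'s values, with a trailing sentinel, so the owning doc is the one just before the first offset greater than `pos`. A standalone version of that lookup (`doc_for_pos` is a hypothetical name; a linear scan as in the diff, though since the offsets are strictly increasing a binary search such as `partition_point` would also work):

    fn doc_for_pos(offsets: &[u64], pos: u64) -> u32 {
        offsets
            .iter()
            .position(|&offset| offset > pos)
            .expect("pos is out of bounds") as u32
            - 1
    }

    fn main() {
        // doc 0 owns positions 0..2, doc 1 owns 2..3, doc 2 owns 3..6
        let offsets = [0u64, 2, 3, 6];
        assert_eq!(doc_for_pos(&offsets, 0), 0);
        assert_eq!(doc_for_pos(&offsets, 2), 1);
        assert_eq!(doc_for_pos(&offsets, 5), 2);
    }
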
@@ -1207,7 +1206,6 @@ mod tests {
    };
    use crate::collector::{Count, FacetCollector};
    use crate::core::Index;
    use crate::fastfield::FastFieldReader;
    use crate::query::{AllQuery, BooleanQuery, Scorer, TermQuery};
    use crate::schema::{
        Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,

@@ -2,7 +2,7 @@
mod tests {
    use crate::collector::TopDocs;
    use crate::core::Index;
    use crate::fastfield::{AliveBitSet, FastFieldReader, MultiValuedFastFieldReader};
    use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
    use crate::query::QueryParser;
    use crate::schema::{
        self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
@@ -186,17 +186,17 @@ mod tests {

        let fast_fields = segment_reader.fast_fields();
        let fast_field = fast_fields.u64(int_field).unwrap();
        assert_eq!(fast_field.get(5u32), 1u64);
        assert_eq!(fast_field.get(4u32), 2u64);
        assert_eq!(fast_field.get(3u32), 3u64);
        assert_eq!(fast_field.get_val(5), 1u64);
        assert_eq!(fast_field.get_val(4), 2u64);
        assert_eq!(fast_field.get_val(3), 3u64);
        if force_disjunct_segment_sort_values {
            assert_eq!(fast_field.get(2u32), 20u64);
            assert_eq!(fast_field.get(1u32), 100u64);
            assert_eq!(fast_field.get_val(2u64), 20u64);
            assert_eq!(fast_field.get_val(1u64), 100u64);
        } else {
            assert_eq!(fast_field.get(2u32), 10u64);
            assert_eq!(fast_field.get(1u32), 20u64);
            assert_eq!(fast_field.get_val(2u64), 10u64);
            assert_eq!(fast_field.get_val(1u64), 20u64);
        }
        assert_eq!(fast_field.get(0u32), 1_000u64);
        assert_eq!(fast_field.get_val(0u64), 1_000u64);

        // test new field norm mapping
        {

@@ -373,12 +373,12 @@ mod tests {

        let fast_fields = segment_reader.fast_fields();
        let fast_field = fast_fields.u64(int_field).unwrap();
        assert_eq!(fast_field.get(0u32), 1u64);
        assert_eq!(fast_field.get(1u32), 2u64);
        assert_eq!(fast_field.get(2u32), 3u64);
        assert_eq!(fast_field.get(3u32), 10u64);
        assert_eq!(fast_field.get(4u32), 20u64);
        assert_eq!(fast_field.get(5u32), 1_000u64);
        assert_eq!(fast_field.get_val(0), 1u64);
        assert_eq!(fast_field.get_val(1), 2u64);
        assert_eq!(fast_field.get_val(2), 3u64);
        assert_eq!(fast_field.get_val(3), 10u64);
        assert_eq!(fast_field.get_val(4), 20u64);
        assert_eq!(fast_field.get_val(5), 1_000u64);

        let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
            let mut vals = vec![];
@@ -478,11 +478,12 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge {

    use std::sync::Arc;

    use fastfield_codecs::Column;
    use test::{self, Bencher};

    use crate::core::Index;
    // use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema;
    use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
    use crate::indexer::merger::IndexMerger;
    use crate::schema::{Cardinality, NumericOptions, Schema};
    use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
@@ -534,7 +535,7 @@ mod bench_sorted_index_merge {
        b.iter(|| {
            let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
                let reader = &merger.readers[doc_addr.segment_ord as usize];
                let u64_reader: DynamicFastFieldReader<u64> =
                let u64_reader: Arc<dyn Column<u64>> =
                    reader.fast_fields().typed_fast_field_reader(field).expect(
                        "Failed to find a reader for single fast field. This is a tantivy bug and \
                         it should never happen.",

@@ -544,7 +545,7 @@ mod bench_sorted_index_merge {
            // add values in order of the new doc_ids
            let mut val = 0;
            for (doc_id, _reader, field_reader) in sorted_doc_ids {
                val = field_reader.get(doc_id);
                val = field_reader.get_val(doc_id as u64);
            }

            val
@@ -25,39 +25,10 @@ use crate::indexer::{
    DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
    SegmentSerializer,
};
use crate::schema::Schema;
use crate::{FutureResult, Opstamp};

const NUM_MERGE_THREADS: usize = 4;

/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
///   and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
///   and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(
    schema: Schema,
    index_settings: IndexSettings,
    directory: &dyn Directory,
) -> crate::Result<()> {
    save_metas(
        &IndexMeta {
            index_settings,
            segments: Vec::new(),
            schema,
            opstamp: 0u64,
            payload: None,
        },
        directory,
    )?;
    directory.sync_directory()?;
    Ok(())
}

/// Save the index meta file.
/// This operation is atomic:
/// Either

@@ -67,7 +38,7 @@ pub fn save_new_metas(
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
    info!("save metas");
    let mut buffer = serde_json::to_vec_pretty(metas)?;
    // Just adding a new line at the end of the buffer.
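
`save_new_metas` moves out of this module, and `save_metas` becomes `pub(crate)` so the new call site can reach it; the atomicity contract in the doc comment is unchanged. The usual way to honor such a contract is to serialize fully, write and flush a scratch file, then rename it into place. A generic sketch of that pattern (`save_json_atomically` is a hypothetical helper built on plain `std::fs`; tantivy itself routes through its `Directory` abstraction, e.g. `atomic_write`):

    use std::fs;
    use std::io::Write;
    use std::path::Path;

    fn save_json_atomically(json: &str, dest: &Path) -> std::io::Result<()> {
        let tmp = dest.with_extension("tmp");
        {
            let mut f = fs::File::create(&tmp)?;
            f.write_all(json.as_bytes())?;
            f.write_all(b"\n")?; // the diff notes a trailing newline is appended
            f.sync_all()?; // flush before the rename makes it visible
        }
        fs::rename(&tmp, dest) // atomic replace on POSIX filesystems
    }

    fn main() -> std::io::Result<()> {
        let dest = std::env::temp_dir().join("meta.json");
        save_json_atomically("{\"segments\": []}", &dest)
    }
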
@@ -80,8 +80,8 @@ impl SegmentWriter {
    pub fn for_segment(
        memory_budget_in_bytes: usize,
        segment: Segment,
        schema: Schema,
    ) -> crate::Result<SegmentWriter> {
        let schema = segment.schema();
        let tokenizer_manager = segment.index().tokenizers().clone();
        let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
        let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
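
`for_segment` drops its `schema` parameter: a `Segment` already knows its schema, so passing one separately was redundant and could disagree with it. A toy model of the cleanup (all types illustrative):

    #[derive(Clone)]
    struct Schema;

    struct Segment {
        schema: Schema,
    }
    impl Segment {
        fn schema(&self) -> Schema {
            self.schema.clone()
        }
    }

    struct SegmentWriter;
    impl SegmentWriter {
        // Before: for_segment(budget, segment, schema).
        // After: the schema is read off the segment itself.
        fn for_segment(_memory_budget_in_bytes: usize, segment: Segment) -> SegmentWriter {
            let _schema = segment.schema();
            SegmentWriter
        }
    }

    fn main() {
        let _writer = SegmentWriter::for_segment(3_000_000, Segment { schema: Schema });
    }
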
@@ -429,7 +429,6 @@ pub mod tests {
    use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
    use crate::core::SegmentReader;
    use crate::docset::{DocSet, TERMINATED};
    use crate::fastfield::FastFieldReader;
    use crate::merge_policy::NoMergePolicy;
    use crate::query::BooleanQuery;
    use crate::schema::*;

@@ -1036,21 +1035,21 @@ pub mod tests {
        let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
        assert!(fast_field_reader_opt.is_ok());
        let fast_field_reader = fast_field_reader_opt.unwrap();
        assert_eq!(fast_field_reader.get(0), 4u64)
        assert_eq!(fast_field_reader.get_val(0), 4u64)
    }

    {
        let fast_field_reader_res = segment_reader.fast_fields().i64(fast_field_signed);
        assert!(fast_field_reader_res.is_ok());
        let fast_field_reader = fast_field_reader_res.unwrap();
        assert_eq!(fast_field_reader.get(0), 4i64)
        assert_eq!(fast_field_reader.get_val(0), 4i64)
    }

    {
        let fast_field_reader_res = segment_reader.fast_fields().f64(fast_field_float);
        assert!(fast_field_reader_res.is_ok());
        let fast_field_reader = fast_field_reader_res.unwrap();
        assert_eq!(fast_field_reader.get(0), 4f64)
        assert_eq!(fast_field_reader.get_val(0), 4f64)
    }
    Ok(())
}
@@ -227,7 +227,7 @@ pub mod tests {

    {
        let mut segment_writer =
            SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap();
            SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
        {
            // checking that position works if the field has two values
            let op = AddOperation {
@@ -339,7 +339,7 @@ impl StoreReader {
    async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult<Block> {
        let cache_key = checkpoint.byte_range.start;
        if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) {
            return Ok(block.clone());
            return Ok(block);
        }

        let compressed_block = self
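
The dropped `.clone()` suggests `get_from_cache` now returns an owned handle instead of a reference, with any cheap clone (for example of an `Arc`) happening inside the cache. A sketch of that API shape (`BlockCache` and `Block` here are illustrative, not the tantivy types):

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex};

    struct Block(Vec<u8>);

    struct BlockCache {
        blocks: Mutex<HashMap<u64, Arc<Block>>>,
    }

    impl BlockCache {
        // Returns an owned handle; the cheap Arc clone happens inside,
        // so callers can `return Ok(block)` without cloning.
        fn get_from_cache(&self, start: u64) -> Option<Arc<Block>> {
            self.blocks.lock().unwrap().get(&start).cloned()
        }
    }

    fn main() {
        let cache = BlockCache { blocks: Mutex::new(HashMap::new()) };
        cache.blocks.lock().unwrap().insert(0, Arc::new(Block(vec![1, 2, 3])));
        let block = cache.get_from_cache(0).expect("cached");
        assert_eq!(block.0.len(), 3);
    }
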
@@ -172,8 +172,7 @@ where TValueReader: value::ValueReader
    }

    pub fn suffix(&self) -> &[u8] {
        &self
            .block_reader
        self.block_reader
            .buffer_from_to(self.suffix_start, self.suffix_end)
    }
@@ -50,7 +50,7 @@ pub struct SSTableIndexBuilder {
/// matches `left <= left' < right`.
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
    assert!(&left[..] < right);
    let common_len = common_prefix_len(&left, right);
    let common_len = common_prefix_len(left, right);
    if left.len() == common_len {
        return;
    }
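
For context, `find_shorter_str_in_between` is what lets the sstable index store short separator keys: given `left < right`, it tries to shrink `left` in place while keeping `left <= left' < right`. The hunk itself only removes a needless borrow. A correct-by-contract sketch of such a helper (a simplified shortening rule for illustration, not necessarily the upstream one):

    fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
        left.iter().zip(right).take_while(|(l, r)| l == r).count()
    }

    fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
        assert!(&left[..] < right);
        let common_len = common_prefix_len(left, right);
        if left.len() == common_len {
            return; // left is already a prefix of right: nothing shorter fits
        }
        // Past the common prefix, left[common_len] < right[common_len].
        if left[common_len] < right[common_len] - 1 {
            // Room for a byte strictly in between: keep the prefix, bump one byte.
            left.truncate(common_len + 1);
            left[common_len] += 1;
        }
    }

    fn main() {
        let mut left = b"abcz".to_vec();
        find_shorter_str_in_between(&mut left, b"axyz");
        assert_eq!(left, b"ac".to_vec());
        assert!(&left[..] < &b"axyz"[..]); // still a valid separator
    }
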