Compare commits

...

14 Commits

Author SHA1 Message Date
Paul Masurel
7d551fb580 Refactoring dyn Column 2022-09-02 00:41:46 +09:00
Paul Masurel
a451f6d60d Minor refactoring. (#1495) 2022-08-31 12:00:58 +09:00
PSeitz
f740ddeee3 Merge pull request #1493 from quickwit-oss/remove_vec_impl
remove Column impl on Vec
2022-08-29 07:54:33 -07:00
Pascal Seitz
7a26cc9022 add VecColumn 2022-08-29 15:49:43 +02:00
Pascal Seitz
54972caa7c remove Column impl on Vec
remove Column impl on Vec to avoid function shadowing
2022-08-29 11:57:41 +02:00
PSeitz
5d436759b0 Merge pull request #1480 from quickwit-oss/overflow_issue
fix overflow issue in interpolation
2022-08-28 16:44:00 -07:00
PSeitz
6f563b1606 Merge pull request #1491 from quickwit-oss/col-trait-refact
Introducing a column trait
2022-08-28 10:05:25 -07:00
Pascal Seitz
095fb68fda fix doc test 2022-08-28 18:30:39 +02:00
Pascal Seitz
6316eaefc6 fix benches 2022-08-28 14:38:30 +02:00
Paul Masurel
5331be800b Introducing a column trait 2022-08-28 14:14:27 +02:00
Paul Masurel
c73b425bc1 Fixing unit tests 2022-08-27 23:20:57 +02:00
Paul Masurel
54cfd0d154 Removing Deserializer trait (#1489)
Removing Deserializer trait and renaming the `Serializer` trait `FastFieldCodec`.
Small refactoring estimate.
2022-08-28 04:54:55 +09:00
PSeitz
0dd62169c8 merge FastFieldCodecReader wit FastFieldDataAccess (#1485)
* num_vals to FastFieldCodecReader

* split open_from_bytes to own trait

* rename get_u64 to get_val

* merge traits
2022-08-28 03:58:28 +09:00
Pascal Seitz
3984cafccc fix overflow issue in interpolation
use saturating_sub and saturating_add to cover edge cases with values close to u64::MAX or 0 in combination with imprecise computation
2022-08-24 20:08:13 +02:00
40 changed files with 979 additions and 1026 deletions

View File

@@ -7,10 +7,12 @@
// Of course, you can have a look at the tantivy's built-in collectors // Of course, you can have a look at the tantivy's built-in collectors
// such as the `CountCollector` for more examples. // such as the `CountCollector` for more examples.
use std::sync::Arc;
use fastfield_codecs::Column;
// --- // ---
// Importing tantivy... // Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector}; use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT}; use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Score, SegmentReader}; use tantivy::{doc, Index, Score, SegmentReader};
@@ -95,7 +97,7 @@ impl Collector for StatsCollector {
} }
struct StatsSegmentCollector { struct StatsSegmentCollector {
fast_field_reader: DynamicFastFieldReader<u64>, fast_field_reader: Arc<dyn Column<u64>>,
stats: Stats, stats: Stats,
} }
@@ -103,7 +105,7 @@ impl SegmentCollector for StatsSegmentCollector {
type Fruit = Option<Stats>; type Fruit = Option<Stats>;
fn collect(&mut self, doc: u32, _score: Score) { fn collect(&mut self, doc: u32, _score: Score) {
let value = self.fast_field_reader.get(doc) as f64; let value = self.fast_field_reader.get_val(doc as u64) as f64;
self.stats.count += 1; self.stats.count += 1;
self.stats.sum += value; self.stats.sum += value;
self.stats.squared_sum += value * value; self.stats.squared_sum += value * value;

View File

@@ -3,7 +3,6 @@ use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak}; use std::sync::{Arc, RwLock, Weak};
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, TEXT}; use tantivy::schema::{Field, Schema, FAST, TEXT};
use tantivy::{ use tantivy::{
@@ -52,7 +51,7 @@ impl Warmer for DynamicPriceColumn {
let product_id_reader = segment.fast_fields().u64(self.field)?; let product_id_reader = segment.fast_fields().u64(self.field)?;
let product_ids: Vec<ProductId> = segment let product_ids: Vec<ProductId> = segment
.doc_ids_alive() .doc_ids_alive()
.map(|doc| product_id_reader.get(doc)) .map(|doc| product_id_reader.get_val(doc as u64))
.collect(); .collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter(); let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new(); let mut price_vals: Vec<Price> = Vec::new();

View File

@@ -4,9 +4,9 @@ extern crate test;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use fastfield_codecs::bitpacked::{BitpackedReader, BitpackedSerializer}; use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::{LinearReader, LinearSerializer}; use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::*; use fastfield_codecs::*;
fn get_data() -> Vec<u64> { fn get_data() -> Vec<u64> {
@@ -25,27 +25,25 @@ mod tests {
fn value_iter() -> impl Iterator<Item = u64> { fn value_iter() -> impl Iterator<Item = u64> {
0..20_000 0..20_000
} }
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>( fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
b: &mut Bencher,
data: &[u64],
) {
let mut bytes = vec![]; let mut bytes = vec![];
S::serialize(&mut bytes, &data).unwrap(); Codec::serialize(&mut bytes, &data).unwrap();
let reader = R::open_from_bytes(OwnedBytes::new(bytes)).unwrap(); let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
b.iter(|| { b.iter(|| {
let mut sum = 0u64; let mut sum = 0u64;
for pos in value_iter() { for pos in value_iter() {
let val = reader.get_u64(pos as u64); let val = reader.get_val(pos as u64);
debug_assert_eq!(data[pos as usize], val); debug_assert_eq!(data[pos as usize], val);
sum = sum.wrapping_add(val); sum = sum.wrapping_add(val);
} }
sum sum
}); });
} }
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) { fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![]; let mut bytes = Vec::new();
b.iter(|| { b.iter(|| {
S::serialize(&mut bytes, &data).unwrap(); bytes.clear();
Codec::serialize(&mut bytes, &data).unwrap();
}); });
} }
@@ -54,32 +52,32 @@ mod tests {
#[bench] #[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) { fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_create::<BitpackedSerializer>(b, &data); bench_create::<BitpackedCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) { fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_create::<LinearSerializer>(b, &data); bench_create::<LinearCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) { fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearSerializer>(b, &data); bench_create::<BlockwiseLinearCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) { fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_get::<BitpackedSerializer, BitpackedReader>(b, &data); bench_get::<BitpackedCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) { fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_get::<LinearSerializer, LinearReader>(b, &data); bench_get::<LinearCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearSerializer, BlockwiseLinearReader>(b, &data); bench_get::<BlockwiseLinearCodec>(b, &data);
} }
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0); let min_value = data.iter().cloned().min().unwrap_or(0);

View File

@@ -4,9 +4,7 @@ use common::BinarySerializable;
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{ use crate::{Column, FastFieldCodec, FastFieldCodecType};
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
/// Depending on the field type, a different /// Depending on the field type, a different
/// fast field is required. /// fast field is required.
@@ -14,29 +12,14 @@ use crate::{
pub struct BitpackedReader { pub struct BitpackedReader {
data: OwnedBytes, data: OwnedBytes,
bit_unpacker: BitUnpacker, bit_unpacker: BitUnpacker,
pub min_value_u64: u64, min_value_u64: u64,
pub max_value_u64: u64, max_value_u64: u64,
num_vals: u64,
} }
impl FastFieldCodecReader for BitpackedReader { impl Column for BitpackedReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - 16;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
})
}
#[inline] #[inline]
fn get_u64(&self, doc: u64) -> u64 { fn get_val(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data) self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
} }
#[inline] #[inline]
@@ -47,11 +30,16 @@ impl FastFieldCodecReader for BitpackedReader {
fn max_value(&self) -> u64 { fn max_value(&self) -> u64 {
self.max_value_u64 self.max_value_u64
} }
#[inline]
fn num_vals(&self) -> u64 {
self.num_vals
}
} }
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> { pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
bit_packer: BitPacker, bit_packer: BitPacker,
write: &'a mut W, write: &'a mut W,
min_value: u64, min_value: u64,
num_vals: u64,
amplitude: u64, amplitude: u64,
num_bits: u8, num_bits: u8,
} }
@@ -78,6 +66,7 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
bit_packer, bit_packer,
write, write,
min_value, min_value,
num_vals: 0,
amplitude, amplitude,
num_bits, num_bits,
}) })
@@ -88,22 +77,45 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
let val_to_write: u64 = val - self.min_value; let val_to_write: u64 = val - self.min_value;
self.bit_packer self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?; .write(val_to_write, self.num_bits, &mut self.write)?;
self.num_vals += 1;
Ok(()) Ok(())
} }
pub fn close_field(mut self) -> io::Result<()> { pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)?; self.bit_packer.close(&mut self.write)?;
self.min_value.serialize(&mut self.write)?; self.min_value.serialize(&mut self.write)?;
self.amplitude.serialize(&mut self.write)?; self.amplitude.serialize(&mut self.write)?;
self.num_vals.serialize(&mut self.write)?;
Ok(()) Ok(())
} }
} }
pub struct BitpackedSerializer {} pub struct BitpackedCodec;
impl FastFieldCodecSerializer for BitpackedSerializer { impl FastFieldCodec for BitpackedCodec {
/// The CODEC_TYPE is an enum value used for serialization. /// The CODEC_TYPE is an enum value used for serialization.
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked; const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked;
type Reader = BitpackedReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - 24;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
num_vals,
})
}
/// Serializes data with the BitpackedFastFieldSerializer. /// Serializes data with the BitpackedFastFieldSerializer.
/// ///
/// The serializer in fact encode the values by bitpacking /// The serializer in fact encode the values by bitpacking
@@ -112,10 +124,7 @@ impl FastFieldCodecSerializer for BitpackedSerializer {
/// It requires a `min_value` and a `max_value` to compute /// It requires a `min_value` and a `max_value` to compute
/// compute the minimum number of bits required to encode /// compute the minimum number of bits required to encode
/// values. /// values.
fn serialize( fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
write: &mut impl Write,
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()> {
let mut serializer = BitpackedSerializerLegacy::open( let mut serializer = BitpackedSerializerLegacy::open(
write, write,
fastfield_accessor.min_value(), fastfield_accessor.min_value(),
@@ -129,29 +138,27 @@ impl FastFieldCodecSerializer for BitpackedSerializer {
Ok(()) Ok(())
} }
fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool {
true fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
}
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value(); let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
let num_bits = compute_num_bits(amplitude); let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64; let num_bits_uncompressed = 64;
num_bits as f32 / num_bits_uncompressed as f32 Some(num_bits as f32 / num_bits_uncompressed as f32)
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::tests::get_codec_test_data_sets; use crate::tests::get_codec_test_datasets;
fn create_and_validate(data: &[u64], name: &str) { fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedSerializer, BitpackedReader>(data, name); crate::tests::create_and_validate::<BitpackedCodec>(data, name);
} }
#[test] #[test]
fn test_with_codec_data_sets() { fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets(); let data_sets = get_codec_test_datasets();
for (mut data, name) in data_sets { for (mut data, name) in data_sets {
create_and_validate(&data, name); create_and_validate(&data, name);
data.reverse(); data.reverse();

View File

@@ -18,9 +18,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::linear::{get_calculated_value, get_slope}; use crate::linear::{get_calculated_value, get_slope};
use crate::{ use crate::{Column, FastFieldCodec, FastFieldCodecType};
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
const CHUNK_SIZE: u64 = 512; const CHUNK_SIZE: u64 = 512;
@@ -148,18 +146,9 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
&interpolations[get_interpolation_position(doc)] &interpolations[get_interpolation_position(doc)]
} }
impl FastFieldCodecReader for BlockwiseLinearReader { impl Column for BlockwiseLinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
#[inline] #[inline]
fn get_u64(&self, idx: u64) -> u64 { fn get_val(&self, idx: u64) -> u64 {
let interpolation = get_interpolation_function(idx, &self.footer.interpolations); let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
let in_block_idx = idx - interpolation.start_pos; let in_block_idx = idx - interpolation.start_pos;
let calculated_value = get_calculated_value( let calculated_value = get_calculated_value(
@@ -182,18 +171,31 @@ impl FastFieldCodecReader for BlockwiseLinearReader {
fn max_value(&self) -> u64 { fn max_value(&self) -> u64 {
self.footer.max_value self.footer.max_value
} }
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
} }
/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements. /// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct BlockwiseLinearSerializer {} pub struct BlockwiseLinearCodec;
impl FastFieldCodecSerializer for BlockwiseLinearSerializer { impl FastFieldCodec for BlockwiseLinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear; const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear;
type Reader = BlockwiseLinearReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
/// Creates a new fast field serializer. /// Creates a new fast field serializer.
fn serialize( fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
write: &mut impl Write,
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()> {
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value()); assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
let first_val = fastfield_accessor.get_val(0); let first_val = fastfield_accessor.get_val(0);
@@ -284,10 +286,14 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
Ok(()) Ok(())
} }
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool { /// estimation for linear interpolation is hard because, you don't know
if fastfield_accessor.num_vals() < 5_000 { /// where the local maxima are for the deviation of the calculated value and
return false; /// the offset is also unknown.
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
return None;
} }
// On serialization the offset is added to the actual value. // On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues. // We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theoretical offset and add this to the max value. // For this we take the maximum theoretical offset and add this to the max value.
@@ -299,14 +305,9 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
.checked_add(theorethical_maximum_offset) .checked_add(theorethical_maximum_offset)
.is_none() .is_none()
{ {
return false; return None;
} }
true
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
let first_val_in_first_block = fastfield_accessor.get_val(0); let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals()); let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
let last_val_in_first_block = let last_val_in_first_block =
@@ -345,7 +346,7 @@ impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
// function metadata per block // function metadata per block
+ 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE); + 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
num_bits as f32 / num_bits_uncompressed as f32 Some(num_bits as f32 / num_bits_uncompressed as f32)
} }
} }
@@ -360,12 +361,10 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::tests::get_codec_test_data_sets; use crate::tests::get_codec_test_datasets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
crate::tests::create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>( crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name)
data, name,
)
} }
const HIGHEST_BIT: u64 = 1 << 63; const HIGHEST_BIT: u64 = 1 << 63;
@@ -379,7 +378,7 @@ mod tests {
.map(i64_to_u64) .map(i64_to_u64)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let (estimate, actual_compression) = let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large i64"); create_and_validate(&data, "simple monotonically large i64").unwrap();
assert!(actual_compression < 0.2); assert!(actual_compression < 0.2);
assert!(estimate < 0.20); assert!(estimate < 0.20);
assert!(estimate > 0.15); assert!(estimate > 0.15);
@@ -390,7 +389,7 @@ mod tests {
fn test_compression() { fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>(); let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) = let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large"); create_and_validate(&data, "simple monotonically large").unwrap();
assert!(actual_compression < 0.2); assert!(actual_compression < 0.2);
assert!(estimate < 0.20); assert!(estimate < 0.20);
assert!(estimate > 0.15); assert!(estimate > 0.15);
@@ -399,7 +398,7 @@ mod tests {
#[test] #[test]
fn test_with_codec_data_sets() { fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets(); let data_sets = get_codec_test_datasets();
for (mut data, name) in data_sets { for (mut data, name) in data_sets {
create_and_validate(&data, name); create_and_validate(&data, name);
data.reverse(); data.reverse();

View File

@@ -0,0 +1,146 @@
use std::marker::PhantomData;
pub trait Column<T = u64> {
    /// Returns the value stored at `idx`.
    ///
    /// This accessor should return as fast as possible.
    ///
    /// # Panics
    ///
    /// May panic if `idx` is greater than the column length.
    fn get_val(&self, idx: u64) -> T;

    /// Fills an output buffer with the fast field values
    /// associated with the `DocId` going from
    /// `start` to `start + output.len()`.
    ///
    /// Regardless of the type of `Item`, this method works
    /// - transmuting the output array
    /// - extracting the `Item`s as if they were `u64`
    /// - possibly converting the `u64` value to the right type.
    ///
    /// # Panics
    ///
    /// May panic if `start + output.len()` is greater than
    /// the segment's `maxdoc`.
    fn get_range(&self, start: u64, output: &mut [T]) {
        // Default implementation: one `get_val` call per output slot.
        for (offset, slot) in output.iter_mut().enumerate() {
            *slot = self.get_val(start + offset as u64);
        }
    }

    /// Returns the minimum value for this fast field.
    ///
    /// The min value does not take in account of possible
    /// deleted document, and should be considered as a lower bound
    /// of the actual minimum value.
    fn min_value(&self) -> T;

    /// Returns the maximum value for this fast field.
    ///
    /// The max value does not take in account of possible
    /// deleted document, and should be considered as an upper bound
    /// of the actual maximum value.
    fn max_value(&self) -> T;

    /// Number of values stored in the column.
    fn num_vals(&self) -> u64;

    /// Returns an iterator over all values, in index order.
    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
        let indexes = 0..self.num_vals();
        Box::new(indexes.map(move |idx| self.get_val(idx)))
    }
}
/// Zero-copy `Column` view over a borrowed `u64` slice.
struct VecColumn<'a>(&'a [u64]);

impl<'a> Column for VecColumn<'a> {
    fn get_val(&self, position: u64) -> u64 {
        self.0[position as usize]
    }

    /// Iterates directly over the slice, bypassing the index-based default.
    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
        Box::new(self.0.iter().copied())
    }

    /// Lower bound of the data; `0` for an empty slice.
    fn min_value(&self) -> u64 {
        self.0.iter().copied().min().unwrap_or(0)
    }

    /// Upper bound of the data; `0` for an empty slice.
    fn max_value(&self) -> u64 {
        self.0.iter().copied().max().unwrap_or(0)
    }

    fn num_vals(&self) -> u64 {
        self.0.len() as u64
    }
}

impl<'a> From<&'a [u64]> for VecColumn<'a> {
    fn from(data: &'a [u64]) -> Self {
        VecColumn(data)
    }
}
/// Adapter that lazily applies a mapping on top of another column.
///
/// `C` is the underlying column, `T` the mapping function, and `Input`
/// the value type produced by `C`. `Input` only appears in trait bounds
/// on the `impl` blocks, hence the `PhantomData` marker here.
struct MonotonicMappingColumn<C, T, Input> {
    // Column whose values are fed through the mapping.
    from_column: C,
    // The mapping; assumed monotonically increasing (see `monotonic_map_column`).
    monotonic_mapping: T,
    // Ties the otherwise-unused `Input` type parameter to the struct.
    _phantom: PhantomData<Input>,
}
/// Creates a view of a column transformed by a monotonic mapping.
///
/// The returned column computes `monotonic_mapping` lazily on every access.
///
/// NOTE(review): `min_value`/`max_value` of the returned column are the
/// mapped min/max of the source column; this is only correct if
/// `monotonic_mapping` is monotonically increasing, which the type system
/// does not enforce — callers must uphold it.
pub fn monotonic_map_column<C, T, Input, Output>(
    from_column: C,
    monotonic_mapping: T,
) -> impl Column<Output>
where
    C: Column<Input>,
    T: Fn(Input) -> Output,
{
    MonotonicMappingColumn {
        from_column,
        monotonic_mapping,
        _phantom: PhantomData,
    }
}
impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
where
    C: Column<Input>,
    T: Fn(Input) -> Output,
{
    /// Reads the underlying value and applies the mapping on the fly.
    fn get_val(&self, idx: u64) -> Output {
        (self.monotonic_mapping)(self.from_column.get_val(idx))
    }

    /// Mapped minimum of the source column (valid for monotonic mappings).
    fn min_value(&self) -> Output {
        (self.monotonic_mapping)(self.from_column.min_value())
    }

    /// Mapped maximum of the source column (valid for monotonic mappings).
    fn max_value(&self) -> Output {
        (self.monotonic_mapping)(self.from_column.max_value())
    }

    /// The mapping is 1:1, so the length is the source column's length.
    fn num_vals(&self) -> u64 {
        self.from_column.num_vals()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that a `+4` mapping over `[1, 3]` is reflected by every
    /// accessor of the mapped view.
    #[test]
    fn test_monotonic_mapping() {
        let vals = &[1u64, 3u64][..];
        let col = VecColumn::from(vals);
        let mapped = monotonic_map_column(col, |el| el + 4);
        assert_eq!(mapped.min_value(), 5u64);
        assert_eq!(mapped.max_value(), 7u64);
        assert_eq!(mapped.num_vals(), 2);
        assert_eq!(mapped.get_val(0), 5);
        // BUG FIX: the original asserted `mapped.get_val(0) == 7`, which
        // contradicts the previous line (index 0 maps to 5). Index 1 holds
        // the mapped value 3 + 4 = 7. Also dropped a duplicated
        // `num_vals` assertion.
        assert_eq!(mapped.get_val(1), 7);
    }
}

View File

@@ -12,13 +12,9 @@ pub mod bitpacked;
pub mod blockwise_linear; pub mod blockwise_linear;
pub mod linear; pub mod linear;
pub trait FastFieldCodecReader: Sized { mod column;
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>; pub use self::column::{monotonic_map_column, Column};
fn get_u64(&self, doc: u64) -> u64;
fn min_value(&self) -> u64;
fn max_value(&self) -> u64;
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)] #[repr(u8)]
@@ -60,54 +56,30 @@ impl FastFieldCodecType {
/// The FastFieldSerializerEstimate trait is required on all variants /// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose. /// of fast field compressions, to decide which one to choose.
pub trait FastFieldCodecSerializer { pub trait FastFieldCodec: 'static {
/// A codec needs to provide a unique name and id, which is /// A codec needs to provide a unique name and id, which is
/// used for debugging and de/serialization. /// used for debugging and de/serialization.
const CODEC_TYPE: FastFieldCodecType; const CODEC_TYPE: FastFieldCodecType;
/// Check if the Codec is able to compress the data type Reader: Column<u64> + 'static;
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool;
/// Returns an estimate of the compression ratio. /// Reads the metadata and returns the CodecReader
/// The baseline is uncompressed 64bit data. fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
///
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32;
/// Serializes the data using the serializer into write. /// Serializes the data using the serializer into write.
/// ///
/// The fastfield_accessor iterator should be preferred over using fastfield_accessor for /// The fastfield_accessor iterator should be preferred over using fastfield_accessor for
/// performance reasons. /// performance reasons.
fn serialize( fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column<u64>) -> io::Result<()>;
write: &mut impl Write,
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()>;
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation. /// Returns an estimate of the compression ratio.
pub trait FastFieldDataAccess { /// If the codec is not applicable, returns `None`.
/// Return the value associated to the given position.
/// ///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance /// The baseline is uncompressed 64bit data.
/// reasons.
/// ///
/// # Panics /// It could make sense to also return a value representing
/// /// computational complexity.
/// May panic if `position` is greater than the index. fn estimate(fastfield_accessor: &impl Column) -> Option<f32>;
fn get_val(&self, position: u64) -> u64;
/// Returns a iterator over the data
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_>;
/// min value of the data
fn min_value(&self) -> u64;
/// max value of the data
fn max_value(&self) -> u64;
/// num vals
fn num_vals(&self) -> u64;
} }
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@@ -118,104 +90,60 @@ pub struct FastFieldStats {
pub num_vals: u64, pub num_vals: u64,
} }
impl<'a> FastFieldDataAccess for &'a [u64] {
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new((self as &[u64]).iter().cloned())
}
fn min_value(&self) -> u64 {
self.iter().min().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.iter().max().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.len() as u64
}
}
impl FastFieldDataAccess for Vec<u64> {
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new((self as &[u64]).iter().cloned())
}
fn min_value(&self) -> u64 {
self.iter().min().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.iter().max().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.len() as u64
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use proptest::arbitrary::any; use proptest::arbitrary::any;
use proptest::proptest; use proptest::proptest;
use crate::bitpacked::{BitpackedReader, BitpackedSerializer}; use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::linear::{LinearReader, LinearSerializer}; use crate::linear::LinearCodec;
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>( pub fn create_and_validate<Codec: FastFieldCodec>(
data: &[u64], data: &[u64],
name: &str, name: &str,
) -> (f32, f32) { ) -> Option<(f32, f32)> {
if !S::is_applicable(&data) { let estimation = Codec::estimate(&VecColumn::from(data))?;
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data);
let mut out: Vec<u8> = Vec::new(); let mut out: Vec<u8> = Vec::new();
S::serialize(&mut out, &data).unwrap(); Codec::serialize(&mut out, &VecColumn::from(data)).unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap(); let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
for (doc, orig_val) in data.iter().enumerate() { assert_eq!(reader.num_vals(), data.len() as u64);
let val = reader.get_u64(doc as u64); for (doc, orig_val) in data.iter().copied().enumerate() {
if val != *orig_val { let val = reader.get_val(doc as u64);
panic!( assert_eq!(
"val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \ val, orig_val,
{data:?}", "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
); `{data:?}`",
} );
} }
(estimation, actual_compression) Some((estimation, actual_compression))
} }
proptest! { proptest! {
#[test] #[test]
fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) { fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
create_and_validate::<LinearSerializer, LinearReader>(&data, "proptest linearinterpol"); create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(&data, "proptest multilinearinterpol"); create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedSerializer, BitpackedReader>(&data, "proptest bitpacked"); create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
} }
#[test] #[test]
fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) { fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
create_and_validate::<LinearSerializer, LinearReader>(&data, "proptest linearinterpol"); create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(&data, "proptest multilinearinterpol"); create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedSerializer, BitpackedReader>(&data, "proptest bitpacked"); create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
} }
} }
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> { pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![]; let mut data_and_names = vec![];
let data = (10..=20_u64).collect::<Vec<_>>(); let data = (10..=10_000_u64).collect::<Vec<_>>();
data_and_names.push((data, "simple monotonically increasing")); data_and_names.push((data, "simple monotonically increasing"));
data_and_names.push(( data_and_names.push((
@@ -225,32 +153,38 @@ mod tests {
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small")); data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value")); data_and_names.push((vec![10], "single value"));
data_and_names.push((
vec![1572656989877777, 1170935903116329, 720575940379279, 0],
"overflow error",
));
data_and_names data_and_names
} }
fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() { fn test_codec<C: FastFieldCodec>() {
let codec_name = format!("{:?}", S::CODEC_TYPE); let codec_name = format!("{:?}", C::CODEC_TYPE);
for (data, dataset_name) in get_codec_test_data_sets() { for (data, dataset_name) in get_codec_test_datasets() {
let (estimate, actual) = crate::tests::create_and_validate::<S, R>(&data, dataset_name); let estimate_actual_opt: Option<(f32, f32)> =
let result = if estimate == f32::MAX { crate::tests::create_and_validate::<C>(&data, dataset_name);
"Disabled".to_string() let result = if let Some((estimate, actual)) = estimate_actual_opt {
} else {
format!("Estimate `{estimate}` Actual `{actual}`") format!("Estimate `{estimate}` Actual `{actual}`")
} else {
"Disabled".to_string()
}; };
println!("Codec {codec_name}, DataSet {dataset_name}, {result}"); println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
} }
} }
#[test] #[test]
fn test_codec_bitpacking() { fn test_codec_bitpacking() {
test_codec::<BitpackedSerializer, BitpackedReader>(); test_codec::<BitpackedCodec>();
} }
#[test] #[test]
fn test_codec_interpolation() { fn test_codec_interpolation() {
test_codec::<LinearSerializer, LinearReader>(); test_codec::<LinearCodec>();
} }
#[test] #[test]
fn test_codec_multi_interpolation() { fn test_codec_multi_interpolation() {
test_codec::<BlockwiseLinearSerializer, BlockwiseLinearReader>(); test_codec::<BlockwiseLinearCodec>();
} }
use super::*; use super::*;
@@ -258,38 +192,41 @@ mod tests {
#[test] #[test]
fn estimation_good_interpolation_case() { fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>(); let data = (10..=20000_u64).collect::<Vec<_>>();
let data: VecColum = data.as_slice().into();
let linear_interpol_estimation = LinearSerializer::estimate(&data); let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.01); assert_le!(linear_interpol_estimation, 0.01);
let multi_linear_interpol_estimation = BlockwiseLinearSerializer::estimate(&data); let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
assert_le!(multi_linear_interpol_estimation, 0.2); assert_le!(multi_linear_interpol_estimation, 0.2);
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation); assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
let bitpacked_estimation = BitpackedSerializer::estimate(&data); let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, bitpacked_estimation); assert_le!(linear_interpol_estimation, bitpacked_estimation);
} }
#[test] #[test]
fn estimation_test_bad_interpolation_case() { fn estimation_test_bad_interpolation_case() {
let data = vec![200, 10, 10, 10, 10, 1000, 20]; let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
let linear_interpol_estimation = LinearSerializer::estimate(&data); let data: VecColum = data.into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.32); assert_le!(linear_interpol_estimation, 0.32);
let bitpacked_estimation = BitpackedSerializer::estimate(&data); let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
assert_le!(bitpacked_estimation, linear_interpol_estimation); assert_le!(bitpacked_estimation, linear_interpol_estimation);
} }
#[test] #[test]
fn estimation_test_bad_interpolation_case_monotonically_increasing() { fn estimation_test_bad_interpolation_case_monotonically_increasing() {
let mut data = (200..=20000_u64).collect::<Vec<_>>(); let mut data: Vec<u64> = (200..=20000_u64).collect();
data.push(1_000_000); data.push(1_000_000);
let data: VecColum = data.as_slice().into();
// in this case the linear interpolation can't in fact not be worse than bitpacking, // in this case the linear interpolation can't in fact not be worse than bitpacking,
// but the estimator adds some threshold, which leads to estimated worse behavior // but the estimator adds some threshold, which leads to estimated worse behavior
let linear_interpol_estimation = LinearSerializer::estimate(&data); let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.35); assert_le!(linear_interpol_estimation, 0.35);
let bitpacked_estimation = BitpackedSerializer::estimate(&data); let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
assert_le!(bitpacked_estimation, 0.32); assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation); assert_le!(bitpacked_estimation, linear_interpol_estimation);
} }

View File

@@ -5,9 +5,7 @@ use common::{BinarySerializable, FixedSize};
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{ use crate::{Column, FastFieldCodec, FastFieldCodecType};
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
/// Depending on the field type, a different /// Depending on the field type, a different
/// fast field is required. /// fast field is required.
@@ -59,24 +57,9 @@ impl FixedSize for LinearFooter {
const SIZE_IN_BYTES: usize = 56; const SIZE_IN_BYTES: usize = 56;
} }
impl FastFieldCodecReader for LinearReader { impl Column for LinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
let (data, mut footer) = bytes.split(footer_offset);
let footer = LinearFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearReader {
data,
bit_unpacker,
footer,
slope,
})
}
#[inline] #[inline]
fn get_u64(&self, doc: u64) -> u64 { fn get_val(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope); let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset (calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
} }
@@ -89,11 +72,15 @@ impl FastFieldCodecReader for LinearReader {
fn max_value(&self) -> u64 { fn max_value(&self) -> u64 {
self.footer.max_value self.footer.max_value
} }
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
} }
/// Fastfield serializer, which tries to guess values by linear interpolation /// Fastfield serializer, which tries to guess values by linear interpolation
/// and stores the difference bitpacked. /// and stores the difference bitpacked.
pub struct LinearSerializer {} pub struct LinearCodec;
#[inline] #[inline]
pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 { pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
@@ -128,20 +115,35 @@ fn diff(val1: u64, val2: u64) -> f64 {
#[inline] #[inline]
pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 { pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
if slope < 0.0 { if slope < 0.0 {
first_val - (pos as f32 * -slope) as u64 first_val.saturating_sub((pos as f32 * -slope) as u64)
} else { } else {
first_val + (pos as f32 * slope) as u64 first_val.saturating_add((pos as f32 * slope) as u64)
} }
} }
impl FastFieldCodecSerializer for LinearSerializer { impl FastFieldCodec for LinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear; const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear;
type Reader = LinearReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
let (data, mut footer) = bytes.split(footer_offset);
let footer = LinearFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearReader {
data,
bit_unpacker,
footer,
slope,
})
}
/// Creates a new fast field serializer. /// Creates a new fast field serializer.
fn serialize( fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
write: &mut impl Write,
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()> {
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value()); assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
let first_val = fastfield_accessor.get_val(0); let first_val = fastfield_accessor.get_val(0);
@@ -187,10 +189,15 @@ impl FastFieldCodecSerializer for LinearSerializer {
footer.serialize(write)?; footer.serialize(write)?;
Ok(()) Ok(())
} }
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 3 { if fastfield_accessor.num_vals() < 3 {
return false; // disable compressor for this case return None; // disable compressor for this case
} }
// On serialisation the offset is added to the actual value. // On serialisation the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues. // We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theroretical offset and add this to the max value. // For this we take the maximum theroretical offset and add this to the max value.
@@ -202,14 +209,9 @@ impl FastFieldCodecSerializer for LinearSerializer {
.checked_add(theorethical_maximum_offset) .checked_add(theorethical_maximum_offset)
.is_none() .is_none()
{ {
return false; return None;
} }
true
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
let first_val = fastfield_accessor.get_val(0); let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1); let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals()); let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
@@ -241,7 +243,7 @@ impl FastFieldCodecSerializer for LinearSerializer {
* fastfield_accessor.num_vals() * fastfield_accessor.num_vals()
+ LinearFooter::SIZE_IN_BYTES as u64; + LinearFooter::SIZE_IN_BYTES as u64;
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
num_bits as f32 / num_bits_uncompressed as f32 Some(num_bits as f32 / num_bits_uncompressed as f32)
} }
} }
@@ -257,10 +259,10 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::tests::get_codec_test_data_sets; use crate::tests::get_codec_test_datasets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
crate::tests::create_and_validate::<LinearSerializer, LinearReader>(data, name) crate::tests::create_and_validate::<LinearCodec>(data, name)
} }
#[test] #[test]
@@ -287,15 +289,15 @@ mod tests {
fn test_compression() { fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>(); let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) = let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large"); create_and_validate(&data, "simple monotonically large").unwrap();
assert!(actual_compression < 0.01); assert!(actual_compression < 0.01);
assert!(estimate < 0.01); assert!(estimate < 0.01);
} }
#[test] #[test]
fn test_with_codec_data_sets() { fn test_with_codec_datasets() {
let data_sets = get_codec_test_data_sets(); let data_sets = get_codec_test_datasets();
for (mut data, name) in data_sets { for (mut data, name) in data_sets {
create_and_validate(&data, name); create_and_validate(&data, name);
data.reverse(); data.reverse();
@@ -312,6 +314,13 @@ mod tests {
create_and_validate(&data, "large amplitude"); create_and_validate(&data, "large amplitude");
} }
#[test]
fn overflow_error_test() {
let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
create_and_validate(&data, "overflow test");
}
#[test] #[test]
fn linear_interpol_fast_concave_data() { fn linear_interpol_fast_concave_data() {
let data = vec![0, 1, 2, 5, 8, 10, 20, 50]; let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
@@ -332,9 +341,10 @@ mod tests {
#[test] #[test]
fn linear_interpol_fast_field_rand() { fn linear_interpol_fast_field_rand() {
for _ in 0..5000 { for _ in 0..5000 {
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>(); let mut data = (0..10_000)
.map(|_| rand::random::<u64>())
.collect::<Vec<_>>();
create_and_validate(&data, "random"); create_and_validate(&data, "random");
data.reverse(); data.reverse();
create_and_validate(&data, "random"); create_and_validate(&data, "random");
} }

View File

@@ -1,10 +1,35 @@
#[macro_use] #[macro_use]
extern crate prettytable; extern crate prettytable;
use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer; use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::linear::LinearSerializer; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldCodecType, FastFieldStats}; use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
use prettytable::{Cell, Row, Table}; use prettytable::{Cell, Row, Table};
struct Data<'a>(&'a [u64]);
impl<'a> Column for Data<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
}
fn min_value(&self) -> u64 {
*self.0.iter().min().unwrap_or(&0)
}
fn max_value(&self) -> u64 {
*self.0.iter().max().unwrap_or(&0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
}
fn main() { fn main() {
let mut table = Table::new(); let mut table = Table::new();
@@ -12,37 +37,30 @@ fn main() {
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]); table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
for (data, data_set_name) in get_codec_test_data_sets() { for (data, data_set_name) in get_codec_test_data_sets() {
let mut results = vec![]; let results: Vec<(f32, f32, FastFieldCodecType)> = [
let res = serialize_with_codec::<LinearSerializer>(&data); serialize_with_codec::<LinearCodec>(&data),
results.push(res); serialize_with_codec::<BlockwiseLinearCodec>(&data),
let res = serialize_with_codec::<BlockwiseLinearSerializer>(&data); serialize_with_codec::<BlockwiseLinearCodec>(&data),
results.push(res); serialize_with_codec::<BitpackedCodec>(&data),
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedSerializer>(&data); ]
results.push(res); .into_iter()
.flatten()
// let best_estimation_codec = results .collect();
//.iter()
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
//.unwrap();
let best_compression_ratio_codec = results let best_compression_ratio_codec = results
.iter() .iter()
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap()) .min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap())
.cloned() .cloned()
.unwrap(); .unwrap();
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")])); table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
for (is_applicable, est, comp, codec_type) in results { for (est, comp, codec_type) in results {
let (est_cell, ratio_cell) = if !is_applicable { let est_cell = est.to_string();
("Codec Disabled".to_string(), "".to_string()) let ratio_cell = comp.to_string();
} else {
(est.to_string(), comp.to_string())
};
let style = if comp == best_compression_ratio_codec.1 { let style = if comp == best_compression_ratio_codec.1 {
"Fb" "Fb"
} else { } else {
"" ""
}; };
table.add_row(Row::new(vec![ table.add_row(Row::new(vec![
Cell::new(&format!("{codec_type:?}")).style_spec("bFg"), Cell::new(&format!("{codec_type:?}")).style_spec("bFg"),
Cell::new(&ratio_cell).style_spec(style), Cell::new(&ratio_cell).style_spec(style),
@@ -89,19 +107,15 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
data_and_names data_and_names
} }
pub fn serialize_with_codec<S: FastFieldCodecSerializer>( pub fn serialize_with_codec<C: FastFieldCodec>(
data: &[u64], data: &[u64],
) -> (bool, f32, f32, FastFieldCodecType) { ) -> Option<(f32, f32, FastFieldCodecType)> {
let is_applicable = S::is_applicable(&data); let data = Data(data);
if !is_applicable { let estimation = C::estimate(&data)?;
return (false, 0.0, 0.0, S::CODEC_TYPE); let mut out = Vec::new();
} C::serialize(&mut out, &data).unwrap();
let estimation = S::estimate(&data); let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
let mut out = vec![]; Some((estimation, actual_compression, C::CODEC_TYPE))
S::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
(true, estimation, actual_compression, S::CODEC_TYPE)
} }
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -4,14 +4,14 @@ use std::rc::Rc;
use std::sync::atomic::AtomicU32; use std::sync::atomic::AtomicU32;
use std::sync::Arc; use std::sync::Arc;
use fastfield_codecs::Column;
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation}; use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation}; use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation}; use super::metric::{AverageAggregation, StatsAggregation};
use super::segment_agg_result::BucketCount; use super::segment_agg_result::BucketCount;
use super::VecWithNames; use super::VecWithNames;
use crate::fastfield::{ use crate::fastfield::{type_and_cardinality, FastType, MultiValuedFastFieldReader};
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
};
use crate::schema::{Cardinality, Type}; use crate::schema::{Cardinality, Type};
use crate::{InvertedIndexReader, SegmentReader, TantivyError}; use crate::{InvertedIndexReader, SegmentReader, TantivyError};
@@ -37,10 +37,16 @@ impl AggregationsWithAccessor {
#[derive(Clone)] #[derive(Clone)]
pub(crate) enum FastFieldAccessor { pub(crate) enum FastFieldAccessor {
Multi(MultiValuedFastFieldReader<u64>), Multi(MultiValuedFastFieldReader<u64>),
Single(DynamicFastFieldReader<u64>), Single(Arc<dyn Column<u64>>),
} }
impl FastFieldAccessor { impl FastFieldAccessor {
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> { pub fn as_single(&self) -> Option<&dyn Column<u64>> {
match self {
FastFieldAccessor::Multi(_) => None,
FastFieldAccessor::Single(reader) => Some(&**reader),
}
}
pub fn into_single(self) -> Option<Arc<dyn Column<u64>>> {
match self { match self {
FastFieldAccessor::Multi(_) => None, FastFieldAccessor::Multi(_) => None,
FastFieldAccessor::Single(reader) => Some(reader), FastFieldAccessor::Single(reader) => Some(reader),
@@ -118,7 +124,7 @@ impl BucketAggregationWithAccessor {
pub struct MetricAggregationWithAccessor { pub struct MetricAggregationWithAccessor {
pub metric: MetricAggregation, pub metric: MetricAggregation,
pub field_type: Type, pub field_type: Type,
pub accessor: DynamicFastFieldReader<u64>, pub accessor: Arc<dyn Column>,
} }
impl MetricAggregationWithAccessor { impl MetricAggregationWithAccessor {
@@ -134,9 +140,8 @@ impl MetricAggregationWithAccessor {
Ok(MetricAggregationWithAccessor { Ok(MetricAggregationWithAccessor {
accessor: accessor accessor: accessor
.as_single() .into_single()
.expect("unexpected fast field cardinality") .expect("unexpected fast field cardinality"),
.clone(),
field_type, field_type,
metric: metric.clone(), metric: metric.clone(),
}) })

View File

@@ -1,6 +1,7 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::fmt::Display; use std::fmt::Display;
use fastfield_codecs::Column;
use itertools::Itertools; use itertools::Itertools;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -14,7 +15,6 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry, IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
}; };
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector; use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::schema::Type; use crate::schema::Type;
use crate::{DocId, TantivyError}; use crate::{DocId, TantivyError};
@@ -263,7 +263,7 @@ impl SegmentHistogramCollector {
req: &HistogramAggregation, req: &HistogramAggregation,
sub_aggregation: &AggregationsWithAccessor, sub_aggregation: &AggregationsWithAccessor,
field_type: Type, field_type: Type,
accessor: &DynamicFastFieldReader<u64>, accessor: &dyn Column<u64>,
) -> crate::Result<Self> { ) -> crate::Result<Self> {
req.validate()?; req.validate()?;
let min = f64_from_fastfield_u64(accessor.min_value(), &field_type); let min = f64_from_fastfield_u64(accessor.min_value(), &field_type);
@@ -331,10 +331,10 @@ impl SegmentHistogramCollector {
.expect("unexpected fast field cardinatility"); .expect("unexpected fast field cardinatility");
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val0 = self.f64_from_fastfield_u64(accessor.get(docs[0])); let val0 = self.f64_from_fastfield_u64(accessor.get_val(docs[0] as u64));
let val1 = self.f64_from_fastfield_u64(accessor.get(docs[1])); let val1 = self.f64_from_fastfield_u64(accessor.get_val(docs[1] as u64));
let val2 = self.f64_from_fastfield_u64(accessor.get(docs[2])); let val2 = self.f64_from_fastfield_u64(accessor.get_val(docs[2] as u64));
let val3 = self.f64_from_fastfield_u64(accessor.get(docs[3])); let val3 = self.f64_from_fastfield_u64(accessor.get_val(docs[3] as u64));
let bucket_pos0 = get_bucket_num(val0); let bucket_pos0 = get_bucket_num(val0);
let bucket_pos1 = get_bucket_num(val1); let bucket_pos1 = get_bucket_num(val1);
@@ -370,8 +370,8 @@ impl SegmentHistogramCollector {
&bucket_with_accessor.sub_aggregation, &bucket_with_accessor.sub_aggregation,
)?; )?;
} }
for doc in iter.remainder() { for &doc in iter.remainder() {
let val = f64_from_fastfield_u64(accessor.get(*doc), &self.field_type); let val = f64_from_fastfield_u64(accessor.get_val(doc as u64), &self.field_type);
if !bounds.contains(val) { if !bounds.contains(val) {
continue; continue;
} }
@@ -382,7 +382,7 @@ impl SegmentHistogramCollector {
self.buckets[bucket_pos].key, self.buckets[bucket_pos].key,
get_bucket_val(val, self.interval, self.offset) as f64 get_bucket_val(val, self.interval, self.offset) as f64
); );
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?; self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
} }
if force_flush { if force_flush {
if let Some(sub_aggregations) = self.sub_aggregations.as_mut() { if let Some(sub_aggregations) = self.sub_aggregations.as_mut() {

View File

@@ -12,7 +12,6 @@ use crate::aggregation::intermediate_agg_result::{
}; };
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector}; use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey}; use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey};
use crate::fastfield::FastFieldReader;
use crate::schema::Type; use crate::schema::Type;
use crate::{DocId, TantivyError}; use crate::{DocId, TantivyError};
@@ -264,10 +263,10 @@ impl SegmentRangeCollector {
.as_single() .as_single()
.expect("unexpected fast field cardinatility"); .expect("unexpected fast field cardinatility");
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = accessor.get(docs[0]); let val1 = accessor.get_val(docs[0] as u64);
let val2 = accessor.get(docs[1]); let val2 = accessor.get_val(docs[1] as u64);
let val3 = accessor.get(docs[2]); let val3 = accessor.get_val(docs[2] as u64);
let val4 = accessor.get(docs[3]); let val4 = accessor.get_val(docs[3] as u64);
let bucket_pos1 = self.get_bucket_pos(val1); let bucket_pos1 = self.get_bucket_pos(val1);
let bucket_pos2 = self.get_bucket_pos(val2); let bucket_pos2 = self.get_bucket_pos(val2);
let bucket_pos3 = self.get_bucket_pos(val3); let bucket_pos3 = self.get_bucket_pos(val3);
@@ -278,10 +277,10 @@ impl SegmentRangeCollector {
self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?; self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?;
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?; self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
} }
for doc in iter.remainder() { for &doc in iter.remainder() {
let val = accessor.get(*doc); let val = accessor.get_val(doc as u64);
let bucket_pos = self.get_bucket_pos(val); let bucket_pos = self.get_bucket_pos(val);
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?; self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
} }
if force_flush { if force_flush {
for bucket in &mut self.buckets { for bucket in &mut self.buckets {

View File

@@ -1,9 +1,9 @@
use std::fmt::Debug; use std::fmt::Debug;
use fastfield_codecs::Column;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::aggregation::f64_from_fastfield_u64; use crate::aggregation::f64_from_fastfield_u64;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::schema::Type; use crate::schema::Type;
use crate::DocId; use crate::DocId;
@@ -57,13 +57,13 @@ impl SegmentAverageCollector {
data: Default::default(), data: Default::default(),
} }
} }
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) { pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = field.get(docs[0]); let val1 = field.get_val(docs[0] as u64);
let val2 = field.get(docs[1]); let val2 = field.get_val(docs[1] as u64);
let val3 = field.get(docs[2]); let val3 = field.get_val(docs[2] as u64);
let val4 = field.get(docs[3]); let val4 = field.get_val(docs[3] as u64);
let val1 = f64_from_fastfield_u64(val1, &self.field_type); let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type); let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type); let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -73,8 +73,8 @@ impl SegmentAverageCollector {
self.data.collect(val3); self.data.collect(val3);
self.data.collect(val4); self.data.collect(val4);
} }
for doc in iter.remainder() { for &doc in iter.remainder() {
let val = field.get(*doc); let val = field.get_val(doc as u64);
let val = f64_from_fastfield_u64(val, &self.field_type); let val = f64_from_fastfield_u64(val, &self.field_type);
self.data.collect(val); self.data.collect(val);
} }

View File

@@ -1,7 +1,7 @@
use fastfield_codecs::Column;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::aggregation::f64_from_fastfield_u64; use crate::aggregation::f64_from_fastfield_u64;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::schema::Type; use crate::schema::Type;
use crate::{DocId, TantivyError}; use crate::{DocId, TantivyError};
@@ -163,13 +163,13 @@ impl SegmentStatsCollector {
stats: IntermediateStats::default(), stats: IntermediateStats::default(),
} }
} }
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) { pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &dyn Column<u64>) {
let mut iter = doc.chunks_exact(4); let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() { for docs in iter.by_ref() {
let val1 = field.get(docs[0]); let val1 = field.get_val(docs[0] as u64);
let val2 = field.get(docs[1]); let val2 = field.get_val(docs[1] as u64);
let val3 = field.get(docs[2]); let val3 = field.get_val(docs[2] as u64);
let val4 = field.get(docs[3]); let val4 = field.get_val(docs[3] as u64);
let val1 = f64_from_fastfield_u64(val1, &self.field_type); let val1 = f64_from_fastfield_u64(val1, &self.field_type);
let val2 = f64_from_fastfield_u64(val2, &self.field_type); let val2 = f64_from_fastfield_u64(val2, &self.field_type);
let val3 = f64_from_fastfield_u64(val3, &self.field_type); let val3 = f64_from_fastfield_u64(val3, &self.field_type);
@@ -179,8 +179,8 @@ impl SegmentStatsCollector {
self.stats.collect(val3); self.stats.collect(val3);
self.stats.collect(val4); self.stats.collect(val4);
} }
for doc in iter.remainder() { for &doc in iter.remainder() {
let val = field.get(*doc); let val = field.get_val(doc as u64);
let val = f64_from_fastfield_u64(val, &self.field_type); let val = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val); self.stats.collect(val);
} }

View File

@@ -185,10 +185,10 @@ impl SegmentMetricResultCollector {
pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) { pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) {
match self { match self {
SegmentMetricResultCollector::Average(avg_collector) => { SegmentMetricResultCollector::Average(avg_collector) => {
avg_collector.collect_block(doc, &metric.accessor); avg_collector.collect_block(doc, &*metric.accessor);
} }
SegmentMetricResultCollector::Stats(stats_collector) => { SegmentMetricResultCollector::Stats(stats_collector) => {
stats_collector.collect_block(doc, &metric.accessor); stats_collector.collect_block(doc, &*metric.accessor);
} }
} }
} }

View File

@@ -10,9 +10,12 @@
// --- // ---
// Importing tantivy... // Importing tantivy...
use std::marker::PhantomData; use std::marker::PhantomData;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector}; use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue}; use crate::fastfield::FastValue;
use crate::schema::Field; use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError}; use crate::{Score, SegmentReader, TantivyError};
@@ -158,7 +161,7 @@ where
TPredicate: 'static, TPredicate: 'static,
TPredicateValue: FastValue, TPredicateValue: FastValue,
{ {
fast_field_reader: DynamicFastFieldReader<TPredicateValue>, fast_field_reader: Arc<dyn Column<TPredicateValue>>,
segment_collector: TSegmentCollector, segment_collector: TSegmentCollector,
predicate: TPredicate, predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>, t_predicate_value: PhantomData<TPredicateValue>,
@@ -174,7 +177,7 @@ where
type Fruit = TSegmentCollector::Fruit; type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) { fn collect(&mut self, doc: u32, score: Score) {
let value = self.fast_field_reader.get(doc); let value = self.fast_field_reader.get_val(doc as u64);
if (self.predicate)(value) { if (self.predicate)(value) {
self.segment_collector.collect(doc, score) self.segment_collector.collect(doc, score)
} }

View File

@@ -1,7 +1,10 @@
use std::sync::Arc;
use fastdivide::DividerU64; use fastdivide::DividerU64;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector}; use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue}; use crate::fastfield::FastValue;
use crate::schema::{Field, Type}; use crate::schema::{Field, Type};
use crate::{DocId, Score}; use crate::{DocId, Score};
@@ -84,14 +87,14 @@ impl HistogramComputer {
} }
pub struct SegmentHistogramCollector { pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer, histogram_computer: HistogramComputer,
ff_reader: DynamicFastFieldReader<u64>, ff_reader: Arc<dyn Column<u64>>,
} }
impl SegmentCollector for SegmentHistogramCollector { impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>; type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) { fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get(doc); let value = self.ff_reader.get_val(doc as u64);
self.histogram_computer.add_value(value); self.histogram_computer.add_value(value);
} }

View File

@@ -1,7 +1,11 @@
use std::sync::Arc;
use fastfield_codecs::Column;
use super::*; use super::*;
use crate::collector::{Count, FilterCollector, TopDocs}; use crate::collector::{Count, FilterCollector, TopDocs};
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader, FastFieldReader}; use crate::fastfield::BytesFastFieldReader;
use crate::query::{AllQuery, QueryParser}; use crate::query::{AllQuery, QueryParser};
use crate::schema::{Field, Schema, FAST, TEXT}; use crate::schema::{Field, Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339; use crate::time::format_description::well_known::Rfc3339;
@@ -156,7 +160,7 @@ pub struct FastFieldTestCollector {
pub struct FastFieldSegmentCollector { pub struct FastFieldSegmentCollector {
vals: Vec<u64>, vals: Vec<u64>,
reader: DynamicFastFieldReader<u64>, reader: Arc<dyn Column<u64>>,
} }
impl FastFieldTestCollector { impl FastFieldTestCollector {
@@ -197,7 +201,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
type Fruit = Vec<u64>; type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) { fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get(doc); let val = self.reader.get_val(doc as u64);
self.vals.push(val); self.vals.push(val);
} }

View File

@@ -1,6 +1,9 @@
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use std::fmt; use std::fmt;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::sync::Arc;
use fastfield_codecs::Column;
use super::Collector; use super::Collector;
use crate::collector::custom_score_top_collector::CustomScoreTopCollector; use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
@@ -9,7 +12,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{ use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector, CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
}; };
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue}; use crate::fastfield::FastValue;
use crate::query::Weight; use crate::query::Weight;
use crate::schema::Field; use crate::schema::Field;
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError}; use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
@@ -129,12 +132,12 @@ impl fmt::Debug for TopDocs {
} }
struct ScorerByFastFieldReader { struct ScorerByFastFieldReader {
ff_reader: DynamicFastFieldReader<u64>, ff_reader: Arc<dyn Column<u64>>,
} }
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader { impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 { fn score(&mut self, doc: DocId) -> u64 {
self.ff_reader.get(doc) self.ff_reader.get_val(doc as u64)
} }
} }
@@ -407,7 +410,6 @@ impl TopDocs {
/// # use tantivy::query::QueryParser; /// # use tantivy::query::QueryParser;
/// use tantivy::SegmentReader; /// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs; /// use tantivy::collector::TopDocs;
/// use tantivy::fastfield::FastFieldReader;
/// use tantivy::schema::Field; /// use tantivy::schema::Field;
/// ///
/// fn create_schema() -> Schema { /// fn create_schema() -> Schema {
@@ -456,7 +458,7 @@ impl TopDocs {
/// ///
/// // We can now define our actual scoring function /// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| { /// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get(doc); /// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// // Well.. For the sake of the example we use a simple logarithm /// // Well.. For the sake of the example we use a simple logarithm
/// // function. /// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2(); /// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -515,7 +517,6 @@ impl TopDocs {
/// use tantivy::SegmentReader; /// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs; /// use tantivy::collector::TopDocs;
/// use tantivy::schema::Field; /// use tantivy::schema::Field;
/// use tantivy::fastfield::FastFieldReader;
/// ///
/// # fn create_schema() -> Schema { /// # fn create_schema() -> Schema {
/// # let mut schema_builder = Schema::builder(); /// # let mut schema_builder = Schema::builder();
@@ -567,8 +568,8 @@ impl TopDocs {
/// ///
/// // We can now define our actual scoring function /// // We can now define our actual scoring function
/// move |doc: DocId| { /// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get(doc); /// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let boosted: u64 = boosted_reader.get(doc); /// let boosted: u64 = boosted_reader.get_val(doc as u64);
/// // Score do not have to be `f64` in tantivy. /// // Score do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order /// // Here we return a couple to get lexicographical order
/// // for free. /// // for free.

View File

@@ -16,7 +16,7 @@ use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK}; use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError}; use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN}; use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_new_metas; use crate::indexer::segment_updater::save_metas;
use crate::reader::{IndexReader, IndexReaderBuilder}; use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Field, FieldType, Schema}; use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -47,6 +47,34 @@ fn load_metas(
.map_err(From::from) .map_err(From::from)
} }
/// Save the index meta file.
/// This operation is atomic :
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// IndexBuilder can be used to create an index. /// IndexBuilder can be used to create an index.
/// ///
/// Use in conjunction with `SchemaBuilder`. Global index settings /// Use in conjunction with `SchemaBuilder`. Global index settings

View File

@@ -1,5 +1,9 @@
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::directory::{FileSlice, OwnedBytes}; use crate::directory::{FileSlice, OwnedBytes};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, MultiValueLength}; use crate::fastfield::MultiValueLength;
use crate::DocId; use crate::DocId;
/// Reader for byte array fast fields /// Reader for byte array fast fields
@@ -14,13 +18,13 @@ use crate::DocId;
/// and the start index for the next document, and keeping the bytes in between. /// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)] #[derive(Clone)]
pub struct BytesFastFieldReader { pub struct BytesFastFieldReader {
idx_reader: DynamicFastFieldReader<u64>, idx_reader: Arc<dyn Column<u64>>,
values: OwnedBytes, values: OwnedBytes,
} }
impl BytesFastFieldReader { impl BytesFastFieldReader {
pub(crate) fn open( pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>, idx_reader: Arc<dyn Column<u64>>,
values_file: FileSlice, values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> { ) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?; let values = values_file.read_bytes()?;
@@ -28,8 +32,9 @@ impl BytesFastFieldReader {
} }
fn range(&self, doc: DocId) -> (usize, usize) { fn range(&self, doc: DocId) -> (usize, usize) {
let start = self.idx_reader.get(doc) as usize; let idx = doc as u64;
let stop = self.idx_reader.get(doc + 1) as usize; let start = self.idx_reader.get_val(idx) as usize;
let stop = self.idx_reader.get_val(idx + 1) as usize;
(start, stop) (start, stop)
} }

View File

@@ -3,56 +3,58 @@ use std::num::NonZeroU64;
use common::BinarySerializable; use common::BinarySerializable;
use fastdivide::DividerU64; use fastdivide::DividerU64;
use fastfield_codecs::FastFieldCodecReader; use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec};
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;
pub const GCD_DEFAULT: u64 = 1; pub const GCD_DEFAULT: u64 = 1;
/// Wrapper for accessing a fastfield. #[derive(Debug, Clone, Copy)]
/// struct GCDParams {
/// Holds the data and the codec to the read the data.
#[derive(Clone)]
pub struct GCDFastFieldCodec<CodecReader> {
gcd: u64, gcd: u64,
min_value: u64, min_value: u64,
reader: CodecReader, num_vals: u64,
} }
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> { impl BinarySerializable for GCDParams {
/// Opens a fast field given the bytes. fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> { self.gcd.serialize(writer)?;
let footer_offset = bytes.len() - 16; self.min_value.serialize(writer)?;
let (body, mut footer) = bytes.split(footer_offset); self.num_vals.serialize(writer)?;
let gcd = u64::deserialize(&mut footer)?; Ok(())
let min_value = u64::deserialize(&mut footer)?; }
let reader = C::open_from_bytes(body)?;
Ok(GCDFastFieldCodec { fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let gcd: u64 = u64::deserialize(reader)?;
let min_value: u64 = u64::deserialize(reader)?;
let num_vals: u64 = u64::deserialize(reader)?;
Ok(Self {
gcd, gcd,
min_value, min_value,
reader, num_vals,
}) })
} }
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
let mut data = self.reader.get_u64(doc);
data *= self.gcd;
data += self.min_value;
data
}
fn min_value(&self) -> u64 {
self.min_value + self.reader.min_value() * self.gcd
}
fn max_value(&self) -> u64 {
self.min_value + self.reader.max_value() * self.gcd
}
} }
pub fn write_gcd_header<W: Write>(field_write: &mut W, min_value: u64, gcd: u64) -> io::Result<()> { pub fn open_gcd_from_bytes<WrappedCodec: FastFieldCodec>(
bytes: OwnedBytes,
) -> io::Result<impl Column> {
let footer_offset = bytes.len() - 24;
let (body, mut footer) = bytes.split(footer_offset);
let gcd_params = GCDParams::deserialize(&mut footer)?;
let gcd_remap = move |val: u64| gcd_params.min_value + gcd_params.gcd * val;
let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?;
Ok(monotonic_map_column(reader, gcd_remap))
}
pub fn write_gcd_header<W: Write>(
field_write: &mut W,
min_value: u64,
gcd: u64,
num_vals: u64,
) -> io::Result<()> {
gcd.serialize(field_write)?; gcd.serialize(field_write)?;
min_value.serialize(field_write)?; min_value.serialize(field_write)?;
num_vals.serialize(field_write)?;
Ok(()) Ok(())
} }
@@ -99,17 +101,19 @@ mod tests {
use std::collections::HashMap; use std::collections::HashMap;
use std::num::NonZeroU64; use std::num::NonZeroU64;
use std::path::Path; use std::path::Path;
use std::sync::Arc;
use std::time::{Duration, SystemTime}; use std::time::{Duration, SystemTime};
use common::HasLen; use common::HasLen;
use fastfield_codecs::Column;
use crate::directory::{CompositeFile, RamDirectory, WritePtr}; use crate::directory::{CompositeFile, RamDirectory, WritePtr};
use crate::fastfield::gcd::compute_gcd; use crate::fastfield::gcd::compute_gcd;
use crate::fastfield::reader::open_fast_field;
use crate::fastfield::serializer::FastFieldCodecEnableCheck; use crate::fastfield::serializer::FastFieldCodecEnableCheck;
use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64}; use crate::fastfield::tests::{encode_decode_fast_field, FIELD, FIELDI64, SCHEMA, SCHEMAI64};
use crate::fastfield::{ use crate::fastfield::{
find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecType, find_gcd, CompositeFastFieldSerializer, FastFieldCodecType, FastFieldsWriter, ALL_CODECS,
FastFieldReader, FastFieldsWriter, ALL_CODECS,
}; };
use crate::schema::{Cardinality, Schema}; use crate::schema::{Cardinality, Schema};
use crate::{DateOptions, DatePrecision, DateTime, Directory}; use crate::{DateOptions, DatePrecision, DateTime, Directory};
@@ -151,11 +155,10 @@ mod tests {
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap(); let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?; let fast_field_reader: Arc<dyn Column<i64>> = open_fast_field(file.read_bytes()?)?;
assert_eq!(fast_field_reader.get_val(0), -4000i64);
assert_eq!(fast_field_reader.get(0), -4000i64); assert_eq!(fast_field_reader.get_val(1), -3000i64);
assert_eq!(fast_field_reader.get(1), -3000i64); assert_eq!(fast_field_reader.get_val(2), -2000i64);
assert_eq!(fast_field_reader.get(2), -2000i64);
assert_eq!(fast_field_reader.max_value(), (num_vals as i64 - 5) * 1000); assert_eq!(fast_field_reader.max_value(), (num_vals as i64 - 5) * 1000);
assert_eq!(fast_field_reader.min_value(), -4000i64); assert_eq!(fast_field_reader.min_value(), -4000i64);
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
@@ -174,7 +177,7 @@ mod tests {
#[test] #[test]
fn test_fastfield_gcd_i64() -> crate::Result<()> { fn test_fastfield_gcd_i64() -> crate::Result<()> {
for &code_type in ALL_CODECS { for &code_type in ALL_CODECS {
test_fastfield_gcd_i64_with_codec(code_type, 5005)?; test_fastfield_gcd_i64_with_codec(code_type, 5500)?;
} }
Ok(()) Ok(())
} }
@@ -193,10 +196,10 @@ mod tests {
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap(); let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?; let fast_field_reader = open_fast_field::<u64>(file.read_bytes()?)?;
assert_eq!(fast_field_reader.get(0), 1000u64); assert_eq!(fast_field_reader.get_val(0), 1000u64);
assert_eq!(fast_field_reader.get(1), 2000u64); assert_eq!(fast_field_reader.get_val(1), 2000u64);
assert_eq!(fast_field_reader.get(2), 3000u64); assert_eq!(fast_field_reader.get_val(2), 3000u64);
assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000); assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000);
assert_eq!(fast_field_reader.min_value(), 1000u64); assert_eq!(fast_field_reader.min_value(), 1000u64);
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
@@ -215,17 +218,17 @@ mod tests {
#[test] #[test]
fn test_fastfield_gcd_u64() -> crate::Result<()> { fn test_fastfield_gcd_u64() -> crate::Result<()> {
for &code_type in ALL_CODECS { for &code_type in ALL_CODECS {
test_fastfield_gcd_u64_with_codec(code_type, 5005)?; test_fastfield_gcd_u64_with_codec(code_type, 5500)?;
} }
Ok(()) Ok(())
} }
#[test] #[test]
pub fn test_fastfield2() { pub fn test_fastfield2() {
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]); let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]);
assert_eq!(test_fastfield.get(0), 100); assert_eq!(test_fastfield.get_val(0), 100);
assert_eq!(test_fastfield.get(1), 200); assert_eq!(test_fastfield.get_val(1), 200);
assert_eq!(test_fastfield.get(2), 300); assert_eq!(test_fastfield.get_val(2), 300);
} }
#[test] #[test]
@@ -288,11 +291,11 @@ mod tests {
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap(); let file = composite_file.open_read(*FIELD).unwrap();
let len = file.len(); let len = file.len();
let test_fastfield = DynamicFastFieldReader::<DateTime>::open(file)?; let test_fastfield = open_fast_field::<DateTime>(file.read_bytes()?)?;
assert_eq!(test_fastfield.get(0), time1.truncate(precision)); assert_eq!(test_fastfield.get_val(0), time1.truncate(precision));
assert_eq!(test_fastfield.get(1), time2.truncate(precision)); assert_eq!(test_fastfield.get_val(1), time2.truncate(precision));
assert_eq!(test_fastfield.get(2), time3.truncate(precision)); assert_eq!(test_fastfield.get_val(2), time3.truncate(precision));
Ok(len) Ok(len)
} }

View File

@@ -26,12 +26,11 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader; pub use self::facet_reader::FacetReader;
pub(crate) use self::gcd::{find_gcd, GCDFastFieldCodec, GCD_DEFAULT}; pub(crate) use self::gcd::{find_gcd, GCD_DEFAULT};
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::{DynamicFastFieldReader, FastFieldReader};
pub use self::readers::FastFieldReaders; pub use self::readers::FastFieldReaders;
pub(crate) use self::readers::{type_and_cardinality, FastType}; pub(crate) use self::readers::{type_and_cardinality, FastType};
pub use self::serializer::{CompositeFastFieldSerializer, FastFieldDataAccess, FastFieldStats}; pub use self::serializer::{Column, CompositeFastFieldSerializer, FastFieldStats};
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::{Cardinality, FieldType, Type, Value}; use crate::schema::{Cardinality, FieldType, Type, Value};
use crate::{DateTime, DocId}; use crate::{DateTime, DocId};
@@ -266,6 +265,7 @@ mod tests {
use std::collections::HashMap; use std::collections::HashMap;
use std::ops::Range; use std::ops::Range;
use std::path::Path; use std::path::Path;
use std::sync::Arc;
use common::HasLen; use common::HasLen;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
@@ -275,6 +275,7 @@ mod tests {
use super::*; use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fastfield::reader::open_fast_field;
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::schema::{Document, Field, Schema, FAST, STRING, TEXT}; use crate::schema::{Document, Field, Schema, FAST, STRING, TEXT};
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
@@ -295,12 +296,54 @@ mod tests {
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap()); pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
pub static FIELDI64: Lazy<Field> = Lazy::new(|| SCHEMAI64.get_field("field").unwrap()); pub static FIELDI64: Lazy<Field> = Lazy::new(|| SCHEMAI64.get_field("field").unwrap());
/// Encode values using the most appropriate codec and and then loads it
/// right away.
///
/// This is useful in tests and bench.
pub(crate) fn encode_decode_fast_field<Item: FastValue>(
vals: &[Item],
) -> Arc<dyn Column<Item>> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("__dummy__");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer_mut(field)
.expect("With a RamDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(path).expect("Failed to open the file");
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
let field_bytes = composite_file
.open_read(field)
.expect("File component not found")
.read_bytes()
.unwrap();
open_fast_field(field_bytes).unwrap()
}
#[test] #[test]
pub fn test_fastfield() { pub fn test_fastfield() {
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]); let test_fastfield = encode_decode_fast_field(&[100u64, 200u64, 300u64]);
assert_eq!(test_fastfield.get(0), 100); assert_eq!(test_fastfield.get_val(0u64), 100);
assert_eq!(test_fastfield.get(1), 200); assert_eq!(test_fastfield.get_val(1u64), 200);
assert_eq!(test_fastfield.get(2), 300); assert_eq!(test_fastfield.get_val(2u64), 300);
} }
#[test] #[test]
@@ -326,13 +369,13 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 37); assert_eq!(file.len(), 45);
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap(); let fast_field_bytes = composite_file.open_read(*FIELD).unwrap().read_bytes()?;
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?; let fast_field_reader = open_fast_field::<u64>(fast_field_bytes)?;
assert_eq!(fast_field_reader.get(0), 13u64); assert_eq!(fast_field_reader.get_val(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64); assert_eq!(fast_field_reader.get_val(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64); assert_eq!(fast_field_reader.get_val(2), 2u64);
Ok(()) Ok(())
} }
@@ -357,20 +400,23 @@ mod tests {
serializer.close()?; serializer.close()?;
} }
let file = directory.open_read(path)?; let file = directory.open_read(path)?;
assert_eq!(file.len(), 62); assert_eq!(file.len(), 70);
{ {
let fast_fields_composite = CompositeFile::open(&file)?; let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap(); let data = fast_fields_composite
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?; .open_read(*FIELD)
assert_eq!(fast_field_reader.get(0), 4u64); .unwrap()
assert_eq!(fast_field_reader.get(1), 14_082_001u64); .read_bytes()?;
assert_eq!(fast_field_reader.get(2), 3_052u64); let fast_field_reader = open_fast_field::<u64>(data)?;
assert_eq!(fast_field_reader.get(3), 9002u64); assert_eq!(fast_field_reader.get_val(0), 4u64);
assert_eq!(fast_field_reader.get(4), 15_001u64); assert_eq!(fast_field_reader.get_val(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(5), 777u64); assert_eq!(fast_field_reader.get_val(2), 3_052u64);
assert_eq!(fast_field_reader.get(6), 1_002u64); assert_eq!(fast_field_reader.get_val(3), 9002u64);
assert_eq!(fast_field_reader.get(7), 1_501u64); assert_eq!(fast_field_reader.get_val(4), 15_001u64);
assert_eq!(fast_field_reader.get(8), 215u64); assert_eq!(fast_field_reader.get_val(5), 777u64);
assert_eq!(fast_field_reader.get_val(6), 1_002u64);
assert_eq!(fast_field_reader.get_val(7), 1_501u64);
assert_eq!(fast_field_reader.get_val(8), 215u64);
} }
Ok(()) Ok(())
} }
@@ -393,13 +439,16 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35); assert_eq!(file.len(), 43);
{ {
let fast_fields_composite = CompositeFile::open(&file).unwrap(); let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap(); let data = fast_fields_composite
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?; .open_read(*FIELD)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
for doc in 0..10_000 { for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u64); assert_eq!(fast_field_reader.get_val(doc), 100_000u64);
} }
} }
Ok(()) Ok(())
@@ -425,15 +474,18 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80043); assert_eq!(file.len(), 80051);
{ {
let fast_fields_composite = CompositeFile::open(&file)?; let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap(); let data = fast_fields_composite
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?; .open_read(*FIELD)
assert_eq!(fast_field_reader.get(0), 0u64); .unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
assert_eq!(fast_field_reader.get_val(0), 0u64);
for doc in 1..10_001 { for doc in 1..10_001 {
assert_eq!( assert_eq!(
fast_field_reader.get(doc), fast_field_reader.get_val(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64 5_000_000_000_000_000_000u64 + doc as u64 - 1u64
); );
} }
@@ -469,13 +521,16 @@ mod tests {
assert_eq!(file.len(), 75_usize); // linear interpol size after calc improvement assert_eq!(file.len(), 75_usize); // linear interpol size after calc improvement
{ {
let fast_fields_composite = CompositeFile::open(&file)?; let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap(); let data = fast_fields_composite
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?; .open_read(i64_field)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<i64>(data)?;
assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64); assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() { for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get(doc as u32), i); assert_eq!(fast_field_reader.get_val(doc as u64), i);
} }
let mut buffer = vec![0i64; 100]; let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]); fast_field_reader.get_range(53, &mut buffer[..]);
@@ -509,9 +564,12 @@ mod tests {
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
{ {
let fast_fields_composite = CompositeFile::open(&file).unwrap(); let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap(); let data = fast_fields_composite
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?; .open_read(i64_field)
assert_eq!(fast_field_reader.get(0u32), 0i64); .unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<i64>(data)?;
assert_eq!(fast_field_reader.get_val(0), 0i64);
} }
Ok(()) Ok(())
} }
@@ -547,11 +605,14 @@ mod tests {
let file = directory.open_read(path)?; let file = directory.open_read(path)?;
{ {
let fast_fields_composite = CompositeFile::open(&file)?; let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap(); let data = fast_fields_composite
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?; .open_read(*FIELD)
.unwrap()
.read_bytes()?;
let fast_field_reader = open_fast_field::<u64>(data)?;
for a in 0..n { for a in 0..n {
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]); assert_eq!(fast_field_reader.get_val(a as u64), permutation[a as usize]);
} }
} }
Ok(()) Ok(())
@@ -607,7 +668,7 @@ mod tests {
let mut all = vec![]; let mut all = vec![];
for doc in docs { for doc in docs {
let mut out = vec![]; let mut out: Vec<u64> = vec![];
ff.get_vals(doc, &mut out); ff.get_vals(doc, &mut out);
all.extend(out); all.extend(out);
} }
@@ -842,19 +903,19 @@ mod tests {
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap(); let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
let mut dates = vec![]; let mut dates = vec![];
{ {
assert_eq!(date_fast_field.get(0u32).into_timestamp_micros(), 1i64); assert_eq!(date_fast_field.get_val(0).into_timestamp_micros(), 1i64);
dates_fast_field.get_vals(0u32, &mut dates); dates_fast_field.get_vals(0u32, &mut dates);
assert_eq!(dates.len(), 2); assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 2i64); assert_eq!(dates[0].into_timestamp_micros(), 2i64);
assert_eq!(dates[1].into_timestamp_micros(), 3i64); assert_eq!(dates[1].into_timestamp_micros(), 3i64);
} }
{ {
assert_eq!(date_fast_field.get(1u32).into_timestamp_micros(), 4i64); assert_eq!(date_fast_field.get_val(1).into_timestamp_micros(), 4i64);
dates_fast_field.get_vals(1u32, &mut dates); dates_fast_field.get_vals(1u32, &mut dates);
assert!(dates.is_empty()); assert!(dates.is_empty());
} }
{ {
assert_eq!(date_fast_field.get(2u32).into_timestamp_micros(), 0i64); assert_eq!(date_fast_field.get_val(2).into_timestamp_micros(), 0i64);
dates_fast_field.get_vals(2u32, &mut dates); dates_fast_field.get_vals(2u32, &mut dates);
assert_eq!(dates.len(), 2); assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 5i64); assert_eq!(dates[0].into_timestamp_micros(), 5i64);
@@ -865,11 +926,11 @@ mod tests {
#[test] #[test]
pub fn test_fastfield_bool() { pub fn test_fastfield_bool() {
let test_fastfield = DynamicFastFieldReader::<bool>::from(vec![true, false, true, false]); let test_fastfield = encode_decode_fast_field::<bool>(&[true, false, true, false]);
assert_eq!(test_fastfield.get(0), true); assert_eq!(test_fastfield.get_val(0), true);
assert_eq!(test_fastfield.get(1), false); assert_eq!(test_fastfield.get_val(1), false);
assert_eq!(test_fastfield.get(2), true); assert_eq!(test_fastfield.get_val(2), true);
assert_eq!(test_fastfield.get(3), false); assert_eq!(test_fastfield.get_val(3), false);
} }
#[test] #[test]
@@ -896,14 +957,14 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 36); assert_eq!(file.len(), 44);
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap(); let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?; let fast_field_reader = open_fast_field::<bool>(data)?;
assert_eq!(fast_field_reader.get(0), true); assert_eq!(fast_field_reader.get_val(0), true);
assert_eq!(fast_field_reader.get(1), false); assert_eq!(fast_field_reader.get_val(1), false);
assert_eq!(fast_field_reader.get(2), true); assert_eq!(fast_field_reader.get_val(2), true);
assert_eq!(fast_field_reader.get(3), false); assert_eq!(fast_field_reader.get_val(3), false);
Ok(()) Ok(())
} }
@@ -932,13 +993,13 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 48); assert_eq!(file.len(), 56);
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap(); let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?; let fast_field_reader = open_fast_field::<bool>(data)?;
for i in 0..25 { for i in 0..25 {
assert_eq!(fast_field_reader.get(i * 2), true); assert_eq!(fast_field_reader.get_val(i * 2), true);
assert_eq!(fast_field_reader.get(i * 2 + 1), false); assert_eq!(fast_field_reader.get_val(i * 2 + 1), false);
} }
Ok(()) Ok(())
@@ -966,11 +1027,11 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35); assert_eq!(file.len(), 43);
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap(); let data = composite_file.open_read(field).unwrap().read_bytes()?;
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?; let fast_field_reader = open_fast_field::<bool>(data)?;
assert_eq!(fast_field_reader.get(0), false); assert_eq!(fast_field_reader.get_val(0), false);
Ok(()) Ok(())
} }
@@ -978,32 +1039,17 @@ mod tests {
#[cfg(all(test, feature = "unstable"))] #[cfg(all(test, feature = "unstable"))]
mod bench { mod bench {
use std::collections::HashMap; use std::sync::Arc;
use std::path::Path;
use fastfield_codecs::Column;
use test::{self, Bencher}; use test::{self, Bencher};
use super::tests::{generate_permutation, FIELD, SCHEMA}; use crate::fastfield::tests::{
use super::*; encode_decode_fast_field, generate_permutation, generate_permutation_gcd,
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; };
use crate::fastfield::tests::generate_permutation_gcd;
use crate::fastfield::FastFieldReader;
#[bench] #[bench]
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) { fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|v| v * 7) {
a ^= permutation[i as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation(); let permutation = generate_permutation();
b.iter(|| { b.iter(|| {
let n = test::black_box(1000u32); let n = test::black_box(1000u32);
@@ -1016,102 +1062,81 @@ mod bench {
} }
#[bench] #[bench]
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) { fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation(); let permutation = generate_permutation();
let directory: RamDirectory = RamDirectory::create(); let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
{ b.iter(|| {
let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let n = test::black_box(1000u32);
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); let mut a = 0u64;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for _ in 0u32..n {
for &x in &permutation { a = column.get_val(a as u64);
fast_field_writers.add_document(&doc!(*FIELD=>x));
} }
fast_field_writers a
.serialize(&mut serializer, &HashMap::new(), None) });
.unwrap(); }
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| { #[bench]
let n = test::black_box(7000u32); fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
let mut a = 0u64; let permutation = generate_permutation();
for i in (0u32..n / 7).map(|val| val * 7) { let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
a ^= fast_field_reader.get(i); b.iter(|| {
} let n = test::black_box(7000u32);
a let mut a = 0u64;
}); for i in (0..n / 7).map(|val| val * 7) {
} a += column.get_val(i as u64);
}
a
});
}
#[bench]
fn bench_intfastfield_linear_vec(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000);
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += permutation[i];
}
a
});
} }
#[bench] #[bench]
fn bench_intfastfield_fflookup(b: &mut Bencher) { fn bench_intfastfield_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation(); let permutation = generate_permutation();
let directory: RamDirectory = RamDirectory::create(); let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
{ b.iter(|| {
let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut a = 0u64;
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); for i in 0u64..permutation.len() as u64 {
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); a = column.get_val(i);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
} }
fast_field_writers a
.serialize(&mut serializer, &HashMap::new(), None) });
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get(i) as u32;
}
a
});
}
} }
#[bench] #[bench]
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) { fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation_gcd(); let permutation = generate_permutation_gcd();
let directory: RamDirectory = RamDirectory::create(); let column: Arc<dyn Column<u64>> = encode_decode_fast_field(&permutation);
{ b.iter(|| {
let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut a = 0u64;
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap(); for i in 0..permutation.len() as u64 {
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); a += column.get_val(i);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
} }
fast_field_writers a
.serialize(&mut serializer, &HashMap::new(), None) });
.unwrap(); }
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| { #[bench]
let mut a = 0u32; fn bench_intfastfield_vec(b: &mut Bencher) {
for i in 0u32..permutation.len() as u32 { let permutation = generate_permutation_gcd();
a = fast_field_reader.get(i) as u32; b.iter(|| {
} let mut a = 0u64;
a for i in 0..permutation.len() {
}); a += permutation[i as usize] as u64;
} }
a
});
} }
} }

View File

@@ -346,6 +346,7 @@ mod tests {
assert!(test_multivalued_no_panic(&ops[..]).is_ok()); assert!(test_multivalued_no_panic(&ops[..]).is_ok());
} }
} }
#[test] #[test]
fn test_multivalued_proptest_gcd() { fn test_multivalued_proptest_gcd() {
use IndexingOp::*; use IndexingOp::*;

View File

@@ -1,6 +1,9 @@
use std::ops::Range; use std::ops::Range;
use std::sync::Arc;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength}; use fastfield_codecs::Column;
use crate::fastfield::{FastValue, MultiValueLength};
use crate::DocId; use crate::DocId;
/// Reader for a multivalued `u64` fast field. /// Reader for a multivalued `u64` fast field.
@@ -12,14 +15,14 @@ use crate::DocId;
/// The `idx_reader` associated, for each document, the index of its first value. /// The `idx_reader` associated, for each document, the index of its first value.
#[derive(Clone)] #[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> { pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: DynamicFastFieldReader<u64>, idx_reader: Arc<dyn Column<u64>>,
vals_reader: DynamicFastFieldReader<Item>, vals_reader: Arc<dyn Column<Item>>,
} }
impl<Item: FastValue> MultiValuedFastFieldReader<Item> { impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open( pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>, idx_reader: Arc<dyn Column<u64>>,
vals_reader: DynamicFastFieldReader<Item>, vals_reader: Arc<dyn Column<Item>>,
) -> MultiValuedFastFieldReader<Item> { ) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader { MultiValuedFastFieldReader {
idx_reader, idx_reader,
@@ -31,8 +34,9 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// to the given document are `start..end`. /// to the given document are `start..end`.
#[inline] #[inline]
fn range(&self, doc: DocId) -> Range<u64> { fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc); let idx = doc as u64;
let end = self.idx_reader.get(doc + 1); let start = self.idx_reader.get_val(idx);
let end = self.idx_reader.get_val(idx + 1);
start..end start..end
} }

View File

@@ -1,193 +1,65 @@
use std::collections::HashMap;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::path::Path; use std::sync::Arc;
use common::BinarySerializable; use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedReader; use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearReader; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearReader; use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecType}; use fastfield_codecs::{monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType};
use super::{FastValue, GCDFastFieldCodec}; use super::gcd::open_gcd_from_bytes;
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr}; use super::FastValue;
use crate::directory::OwnedBytes;
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::{Schema, FAST};
use crate::DocId;
/// FastFieldReader is the trait to access fast field data. fn open_codec_from_bytes<C: FastFieldCodec, Item: FastValue>(
pub trait FastFieldReader<Item: FastValue>: Clone { bytes: OwnedBytes,
/// Return the value associated to the given document. ) -> crate::Result<Arc<dyn Column<Item>>> {
/// let reader = C::open_from_bytes(bytes)?;
/// This accessor should return as fast as possible. Ok(Arc::new(monotonic_map_column(reader, Item::from_u64)))
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: DocId) -> Item;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: u64, output: &mut [Item]);
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// of the actual minimum value.
fn min_value(&self) -> Item;
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item;
} }
#[derive(Clone)] fn open_codec_with_gcd<C: FastFieldCodec, Item: FastValue>(
/// DynamicFastFieldReader wraps different readers to access bytes: OwnedBytes,
/// the various encoded fastfield data ) -> crate::Result<Arc<dyn Column<Item>>> {
pub enum DynamicFastFieldReader<Item: FastValue> { let reader = open_gcd_from_bytes::<C>(bytes)?;
/// Bitpacked compressed fastfield data. Ok(Arc::new(monotonic_map_column(reader, Item::from_u64)))
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
/// Linear interpolated values + bitpacked
Linear(FastFieldReaderCodecWrapper<Item, LinearReader>),
/// Blockwise linear interpolated values + bitpacked
BlockwiseLinear(FastFieldReaderCodecWrapper<Item, BlockwiseLinearReader>),
/// GCD and Bitpacked compressed fastfield data.
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<BitpackedReader>>),
/// GCD and Linear interpolated values + bitpacked
LinearGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<LinearReader>>),
/// GCD and Blockwise linear interpolated values + bitpacked
BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<BlockwiseLinearReader>>),
} }
impl<Item: FastValue> DynamicFastFieldReader<Item> { /// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data. fn open_from_id<Item: FastValue>(
pub fn open_from_id( mut bytes: OwnedBytes,
mut bytes: OwnedBytes, codec_type: FastFieldCodecType,
codec_type: FastFieldCodecType, ) -> crate::Result<Arc<dyn Column<Item>>> {
) -> crate::Result<DynamicFastFieldReader<Item>> { match codec_type {
let reader = match codec_type { FastFieldCodecType::Bitpacked => open_codec_from_bytes::<BitpackedCodec, _>(bytes),
FastFieldCodecType::Bitpacked => { FastFieldCodecType::Linear => open_codec_from_bytes::<LinearCodec, _>(bytes),
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::< FastFieldCodecType::BlockwiseLinear => {
Item, open_codec_from_bytes::<BlockwiseLinearCodec, _>(bytes)
BitpackedReader, }
>::open_from_bytes(bytes)?) FastFieldCodecType::Gcd => {
} let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
FastFieldCodecType::Linear => DynamicFastFieldReader::Linear( match codec_type {
FastFieldReaderCodecWrapper::<Item, LinearReader>::open_from_bytes(bytes)?, FastFieldCodecType::Bitpacked => open_codec_with_gcd::<BitpackedCodec, _>(bytes),
), FastFieldCodecType::Linear => open_codec_with_gcd::<LinearCodec, _>(bytes),
FastFieldCodecType::BlockwiseLinear => { FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinear(FastFieldReaderCodecWrapper::< open_codec_with_gcd::<BlockwiseLinearCodec, _>(bytes)
Item,
BlockwiseLinearReader,
>::open_from_bytes(bytes)?)
}
FastFieldCodecType::Gcd => {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
match codec_type {
FastFieldCodecType::Bitpacked => {
DynamicFastFieldReader::BitpackedGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<BitpackedReader>,
>::open_from_bytes(
bytes
)?)
}
FastFieldCodecType::Linear => {
DynamicFastFieldReader::LinearGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<LinearReader>,
>::open_from_bytes(
bytes
)?)
}
FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinearGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<BlockwiseLinearReader>,
>::open_from_bytes(
bytes
)?)
}
FastFieldCodecType::Gcd => {
return Err(DataCorruption::comment_only(
"Gcd codec wrapped into another gcd codec. This combination is not \
allowed.",
)
.into())
}
} }
FastFieldCodecType::Gcd => Err(DataCorruption::comment_only(
"Gcd codec wrapped into another gcd codec. This combination is not allowed.",
)
.into()),
} }
}; }
Ok(reader)
}
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
let mut bytes = file.read_bytes()?;
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
Self::open_from_id(bytes, codec_type)
} }
} }
impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> { /// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
#[inline] pub fn open_fast_field<Item: FastValue>(
fn get(&self, doc: DocId) -> Item { mut bytes: OwnedBytes,
match self { ) -> crate::Result<Arc<dyn Column<Item>>> {
Self::Bitpacked(reader) => reader.get(doc), let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
Self::Linear(reader) => reader.get(doc), open_from_id(bytes, codec_type)
Self::BlockwiseLinear(reader) => reader.get(doc),
Self::BitpackedGCD(reader) => reader.get(doc),
Self::LinearGCD(reader) => reader.get(doc),
Self::BlockwiseLinearGCD(reader) => reader.get(doc),
}
}
#[inline]
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::Linear(reader) => reader.get_range(start, output),
Self::BlockwiseLinear(reader) => reader.get_range(start, output),
Self::BitpackedGCD(reader) => reader.get_range(start, output),
Self::LinearGCD(reader) => reader.get_range(start, output),
Self::BlockwiseLinearGCD(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.min_value(),
Self::Linear(reader) => reader.min_value(),
Self::BlockwiseLinear(reader) => reader.min_value(),
Self::BitpackedGCD(reader) => reader.min_value(),
Self::LinearGCD(reader) => reader.min_value(),
Self::BlockwiseLinearGCD(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.max_value(),
Self::Linear(reader) => reader.max_value(),
Self::BlockwiseLinear(reader) => reader.max_value(),
Self::BitpackedGCD(reader) => reader.max_value(),
Self::LinearGCD(reader) => reader.max_value(),
Self::BlockwiseLinearGCD(reader) => reader.max_value(),
}
}
} }
/// Wrapper for accessing a fastfield. /// Wrapper for accessing a fastfield.
@@ -199,34 +71,21 @@ pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
_phantom: PhantomData<Item>, _phantom: PhantomData<Item>,
} }
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> { impl<Item: FastValue, CodecReader> From<CodecReader>
/// Opens a fast field given a file. for FastFieldReaderCodecWrapper<Item, CodecReader>
pub fn open(file: FileSlice) -> crate::Result<Self> { {
let mut bytes = file.read_bytes()?; fn from(reader: CodecReader) -> Self {
let codec_code = bytes.read_u8(); FastFieldReaderCodecWrapper {
let codec_type = FastFieldCodecType::from_code(codec_code).ok_or_else(|| {
DataCorruption::comment_only("Unknown codec code does not exist `{codec_code}`")
})?;
assert_eq!(
FastFieldCodecType::Bitpacked,
codec_type,
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with \
different id"
);
Self::open_from_bytes(bytes)
}
/// Opens a fast field given the bytes.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = C::open_from_bytes(bytes)?;
Ok(FastFieldReaderCodecWrapper {
reader, reader,
_phantom: PhantomData, _phantom: PhantomData,
}) }
} }
}
impl<Item: FastValue, D: Column> FastFieldReaderCodecWrapper<Item, D> {
#[inline] #[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item { pub(crate) fn get_u64(&self, idx: u64) -> Item {
let data = self.reader.get_u64(doc); let data = self.reader.get_val(idx);
Item::from_u64(data) Item::from_u64(data)
} }
@@ -249,9 +108,7 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
} }
} }
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item> impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWrapper<Item, C> {
for FastFieldReaderCodecWrapper<Item, C>
{
/// Return the value associated to the given document. /// Return the value associated to the given document.
/// ///
/// This accessor should return as fast as possible. /// This accessor should return as fast as possible.
@@ -260,8 +117,8 @@ impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
/// ///
/// May panic if `doc` is greater than the segment /// May panic if `doc` is greater than the segment
// `maxdoc`. // `maxdoc`.
fn get(&self, doc: DocId) -> Item { fn get_val(&self, idx: u64) -> Item {
self.get_u64(u64::from(doc)) self.get_u64(idx)
} }
/// Fills an output buffer with the fast field values /// Fills an output buffer with the fast field values
@@ -298,41 +155,8 @@ impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
fn max_value(&self) -> Item { fn max_value(&self) -> Item {
Item::from_u64(self.reader.max_value()) Item::from_u64(self.reader.max_value())
} }
}
impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> { fn num_vals(&self) -> u64 {
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> { self.reader.num_vals()
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("__dummy__");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer_mut(field)
.expect("With a RamDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(path).expect("Failed to open the file");
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
let field_file = composite_file
.open_read(field)
.expect("File component not found");
DynamicFastFieldReader::open(field_file).unwrap()
} }
} }

View File

@@ -1,5 +1,9 @@
use super::reader::DynamicFastFieldReader; use std::sync::Arc;
use fastfield_codecs::Column;
use crate::directory::{CompositeFile, FileSlice}; use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::reader::open_fast_field;
use crate::fastfield::{ use crate::fastfield::{
BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader, BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader,
}; };
@@ -109,14 +113,16 @@ impl FastFieldReaders {
&self, &self,
field: Field, field: Field,
index: usize, index: usize,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> { ) -> crate::Result<Arc<dyn Column<TFastValue>>> {
let fast_field_slice = self.fast_field_data(field, index)?; let fast_field_slice = self.fast_field_data(field, index)?;
DynamicFastFieldReader::open(fast_field_slice) let bytes = fast_field_slice.read_bytes()?;
open_fast_field(bytes)
} }
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>( pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self, &self,
field: Field, field: Field,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> { ) -> crate::Result<Arc<dyn Column<TFastValue>>> {
self.typed_fast_field_reader_with_idx(field, 0) self.typed_fast_field_reader_with_idx(field, 0)
} }
@@ -132,7 +138,7 @@ impl FastFieldReaders {
/// Returns the `u64` fast field reader reader associated to `field`. /// Returns the `u64` fast field reader reader associated to `field`.
/// ///
/// If `field` is not a u64 fast field, this method returns an Error. /// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> { pub fn u64(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
self.check_type(field, FastType::U64, Cardinality::SingleValue)?; self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -142,14 +148,14 @@ impl FastFieldReaders {
/// ///
/// If not, the fastfield reader will returns the u64-value associated to the original /// If not, the fastfield reader will returns the u64-value associated to the original
/// FastValue. /// FastValue.
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> { pub fn u64_lenient(&self, field: Field) -> crate::Result<Arc<dyn Column<u64>>> {
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
/// Returns the `i64` fast field reader reader associated to `field`. /// Returns the `i64` fast field reader reader associated to `field`.
/// ///
/// If `field` is not a i64 fast field, this method returns an Error. /// If `field` is not a i64 fast field, this method returns an Error.
pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> { pub fn i64(&self, field: Field) -> crate::Result<Arc<dyn Column<i64>>> {
self.check_type(field, FastType::I64, Cardinality::SingleValue)?; self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -157,7 +163,7 @@ impl FastFieldReaders {
/// Returns the `date` fast field reader reader associated to `field`. /// Returns the `date` fast field reader reader associated to `field`.
/// ///
/// If `field` is not a date fast field, this method returns an Error. /// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<DateTime>> { pub fn date(&self, field: Field) -> crate::Result<Arc<dyn Column<DateTime>>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?; self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -165,7 +171,7 @@ impl FastFieldReaders {
/// Returns the `f64` fast field reader reader associated to `field`. /// Returns the `f64` fast field reader reader associated to `field`.
/// ///
/// If `field` is not a f64 fast field, this method returns an Error. /// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> { pub fn f64(&self, field: Field) -> crate::Result<Arc<dyn Column<f64>>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?; self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -173,7 +179,7 @@ impl FastFieldReaders {
/// Returns the `bool` fast field reader reader associated to `field`. /// Returns the `bool` fast field reader reader associated to `field`.
/// ///
/// If `field` is not a bool fast field, this method returns an Error. /// If `field` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field: Field) -> crate::Result<DynamicFastFieldReader<bool>> { pub fn bool(&self, field: Field) -> crate::Result<Arc<dyn Column<bool>>> {
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?; self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field) self.typed_fast_field_reader(field)
} }
@@ -241,7 +247,8 @@ impl FastFieldReaders {
))); )));
} }
let fast_field_idx_file = self.fast_field_data(field, 0)?; let fast_field_idx_file = self.fast_field_data(field, 0)?;
let idx_reader = DynamicFastFieldReader::open(fast_field_idx_file)?; let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
let idx_reader = open_fast_field(fast_field_idx_bytes)?;
let data = self.fast_field_data(field, 1)?; let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data) BytesFastFieldReader::open(idx_reader, data)
} else { } else {

View File

@@ -3,11 +3,11 @@ use std::num::NonZeroU64;
use common::{BinarySerializable, CountingWriter}; use common::{BinarySerializable, CountingWriter};
use fastdivide::DividerU64; use fastdivide::DividerU64;
pub use fastfield_codecs::bitpacked::{BitpackedSerializer, BitpackedSerializerLegacy}; pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy};
use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearSerializer; use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::FastFieldCodecType; use fastfield_codecs::{monotonic_map_column, FastFieldCodecType};
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats};
use super::{find_gcd, ALL_CODECS, GCD_DEFAULT}; use super::{find_gcd, ALL_CODECS, GCD_DEFAULT};
use crate::directory::{CompositeWrite, WritePtr}; use crate::directory::{CompositeWrite, WritePtr};
@@ -64,15 +64,13 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
// https://github.com/rust-lang/rust/pull/86176 // https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>( fn codec_estimation<C: FastFieldCodec, D: Column>(
fastfield_accessor: &A, fastfield_accessor: &D,
estimations: &mut Vec<(f32, FastFieldCodecType)>, estimations: &mut Vec<(f32, FastFieldCodecType)>,
) { ) {
if !T::is_applicable(fastfield_accessor) { if let Some(ratio) = C::estimate(fastfield_accessor) {
return; estimations.push((ratio, C::CODEC_TYPE));
} }
let ratio = T::estimate(fastfield_accessor);
estimations.push((ratio, T::CODEC_TYPE));
} }
impl CompositeFastFieldSerializer { impl CompositeFastFieldSerializer {
@@ -99,7 +97,7 @@ impl CompositeFastFieldSerializer {
pub fn create_auto_detect_u64_fast_field( pub fn create_auto_detect_u64_fast_field(
&mut self, &mut self,
field: Field, field: Field,
fastfield_accessor: impl FastFieldDataAccess, fastfield_accessor: impl Column,
) -> io::Result<()> { ) -> io::Result<()> {
self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0) self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0)
} }
@@ -119,7 +117,7 @@ impl CompositeFastFieldSerializer {
pub fn create_auto_detect_u64_fast_field_with_idx( pub fn create_auto_detect_u64_fast_field_with_idx(
&mut self, &mut self,
field: Field, field: Field,
fastfield_accessor: impl FastFieldDataAccess, fastfield_accessor: impl Column,
idx: usize, idx: usize,
) -> io::Result<()> { ) -> io::Result<()> {
let min_value = fastfield_accessor.min_value(); let min_value = fastfield_accessor.min_value();
@@ -138,58 +136,24 @@ impl CompositeFastFieldSerializer {
} }
Self::write_header(field_write, FastFieldCodecType::Gcd)?; Self::write_header(field_write, FastFieldCodecType::Gcd)?;
struct GCDWrappedFFAccess<T: FastFieldDataAccess> {
fastfield_accessor: T,
base_value: u64,
max_value: u64,
num_vals: u64,
gcd: DividerU64,
}
impl<T: FastFieldDataAccess> FastFieldDataAccess for GCDWrappedFFAccess<T> {
fn get_val(&self, position: u64) -> u64 {
self.gcd
.divide(self.fastfield_accessor.get_val(position) - self.base_value)
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(
self.fastfield_accessor
.iter()
.map(|val| self.gcd.divide(val - self.base_value)),
)
}
fn min_value(&self) -> u64 {
0
}
fn max_value(&self) -> u64 {
self.max_value
}
fn num_vals(&self) -> u64 {
self.num_vals
}
}
let num_vals = fastfield_accessor.num_vals();
let base_value = fastfield_accessor.min_value(); let base_value = fastfield_accessor.min_value();
let max_value = (fastfield_accessor.max_value() - fastfield_accessor.min_value()) / gcd;
let fastfield_accessor = GCDWrappedFFAccess { let gcd_divider = DividerU64::divide_by(gcd);
fastfield_accessor,
base_value, let divided_fastfield_accessor = monotonic_map_column(fastfield_accessor, |val: u64| {
max_value, gcd_divider.divide(val - base_value)
num_vals, });
gcd: DividerU64::divide_by(gcd),
}; let num_vals = divided_fastfield_accessor.num_vals();
Self::create_auto_detect_u64_fast_field_with_idx_gcd( Self::create_auto_detect_u64_fast_field_with_idx_gcd(
self.codec_enable_checker.clone(), self.codec_enable_checker.clone(),
field, field,
field_write, field_write,
fastfield_accessor, divided_fastfield_accessor,
)?; )?;
write_gcd_header(field_write, base_value, gcd)?; write_gcd_header(field_write, base_value, gcd, num_vals)?;
Ok(()) Ok(())
} }
@@ -199,18 +163,18 @@ impl CompositeFastFieldSerializer {
codec_enable_checker: FastFieldCodecEnableCheck, codec_enable_checker: FastFieldCodecEnableCheck,
field: Field, field: Field,
field_write: &mut CountingWriter<W>, field_write: &mut CountingWriter<W>,
fastfield_accessor: impl FastFieldDataAccess, fastfield_accessor: impl Column,
) -> io::Result<()> { ) -> io::Result<()> {
let mut estimations = vec![]; let mut estimations = vec![];
if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) { if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
codec_estimation::<BitpackedSerializer, _>(&fastfield_accessor, &mut estimations); codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
} }
if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) { if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
codec_estimation::<LinearSerializer, _>(&fastfield_accessor, &mut estimations); codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
} }
if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) { if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
codec_estimation::<BlockwiseLinearSerializer, _>(&fastfield_accessor, &mut estimations); codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
} }
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{ {
@@ -229,13 +193,13 @@ impl CompositeFastFieldSerializer {
Self::write_header(field_write, codec_type)?; Self::write_header(field_write, codec_type)?;
match codec_type { match codec_type {
FastFieldCodecType::Bitpacked => { FastFieldCodecType::Bitpacked => {
BitpackedSerializer::serialize(field_write, &fastfield_accessor)?; BitpackedCodec::serialize(field_write, &fastfield_accessor)?;
} }
FastFieldCodecType::Linear => { FastFieldCodecType::Linear => {
LinearSerializer::serialize(field_write, &fastfield_accessor)?; LinearCodec::serialize(field_write, &fastfield_accessor)?;
} }
FastFieldCodecType::BlockwiseLinear => { FastFieldCodecType::BlockwiseLinear => {
BlockwiseLinearSerializer::serialize(field_write, &fastfield_accessor)?; BlockwiseLinearCodec::serialize(field_write, &fastfield_accessor)?;
} }
FastFieldCodecType::Gcd => { FastFieldCodecType::Gcd => {
return Err(io::Error::new( return Err(io::Error::new(

View File

@@ -2,12 +2,13 @@ use std::collections::HashMap;
use std::io; use std::io;
use common; use common;
use fastfield_codecs::Column;
use fnv::FnvHashMap; use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker; use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::MultiValuedFastFieldWriter; use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats; use super::serializer::FastFieldStats;
use super::{FastFieldDataAccess, FastFieldType, FastValue}; use super::{FastFieldType, FastValue};
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer}; use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId; use crate::postings::UnorderedTermId;
@@ -383,7 +384,7 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker, vals: &'bitp BlockedBitpacker,
stats: FastFieldStats, stats: FastFieldStats,
} }
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> { impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated to the given doc. /// Return the value associated to the given doc.
/// ///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance /// Whenever possible use the Iterator passed to the fastfield creation instead, for performance

View File

@@ -144,7 +144,6 @@ pub(crate) fn get_doc_id_mapping_from_field(
#[cfg(test)] #[cfg(test)]
mod tests_indexsorting { mod tests_indexsorting {
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::fastfield::FastFieldReader;
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::{Schema, *}; use crate::schema::{Schema, *};
@@ -464,9 +463,9 @@ mod tests_indexsorting {
let my_number = index.schema().get_field("my_number").unwrap(); let my_number = index.schema().get_field("my_number").unwrap();
let fast_field = fast_fields.u64(my_number).unwrap(); let fast_field = fast_fields.u64(my_number).unwrap();
assert_eq!(fast_field.get(0u32), 10u64); assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get(1u32), 20u64); assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get(2u32), 30u64); assert_eq!(fast_field.get_val(2), 30u64);
let multi_numbers = index.schema().get_field("multi_numbers").unwrap(); let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
let multifield = fast_fields.u64s(multi_numbers).unwrap(); let multifield = fast_fields.u64s(multi_numbers).unwrap();

View File

@@ -174,9 +174,7 @@ fn index_documents(
segment_updater: &mut SegmentUpdater, segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor, mut delete_cursor: DeleteCursor,
) -> crate::Result<()> { ) -> crate::Result<()> {
let schema = segment.schema(); let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
for document_group in grouped_document_iterator { for document_group in grouped_document_iterator {
for doc in document_group { for doc in document_group {
segment_writer.add_document(doc)?; segment_writer.add_document(doc)?;
@@ -785,7 +783,6 @@ mod tests {
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::directory::error::LockError; use crate::directory::error::LockError;
use crate::error::*; use crate::error::*;
use crate::fastfield::FastFieldReader;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::{QueryParser, TermQuery}; use crate::query::{QueryParser, TermQuery};
use crate::schema::{ use crate::schema::{
@@ -1327,7 +1324,7 @@ mod tests {
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?; let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(|doc| fast_field_reader.get(doc)) .map(|doc| fast_field_reader.get_val(doc as u64))
.collect(); .collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]); assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(()) Ok(())
@@ -1493,7 +1490,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap(); let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(move |doc| ff_reader.get(doc)) .map(move |doc| ff_reader.get_val(doc as u64))
}) })
.collect(); .collect();
@@ -1504,7 +1501,7 @@ mod tests {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap(); let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader segment_reader
.doc_ids_alive() .doc_ids_alive()
.map(move |doc| ff_reader.get(doc)) .map(move |doc| ff_reader.get_val(doc as u64))
}) })
.collect(); .collect();
@@ -1622,7 +1619,7 @@ mod tests {
facet_reader facet_reader
.facet_from_ord(facet_ords[0], &mut facet) .facet_from_ord(facet_ords[0], &mut facet)
.unwrap(); .unwrap();
let id = ff_reader.get(doc_id); let id = ff_reader.get_val(doc_id as u64);
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string())); let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
assert_eq!(facet, facet_expected); assert_eq!(facet, facet_expected);

View File

@@ -4,14 +4,13 @@ use std::sync::Arc;
use itertools::Itertools; use itertools::Itertools;
use measure_time::debug_time; use measure_time::debug_time;
use tantivy_bitpacker::minmax;
use crate::core::{Segment, SegmentReader}; use crate::core::{Segment, SegmentReader};
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{ use crate::fastfield::{
AliveBitSet, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldDataAccess, AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldStats, MultiValueLength,
FastFieldReader, FastFieldStats, MultiValueLength, MultiValuedFastFieldReader, MultiValuedFastFieldReader,
}; };
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
@@ -88,7 +87,7 @@ pub struct IndexMerger {
} }
fn compute_min_max_val( fn compute_min_max_val(
u64_reader: &impl FastFieldReader<u64>, u64_reader: &dyn Column<u64>,
segment_reader: &SegmentReader, segment_reader: &SegmentReader,
) -> Option<(u64, u64)> { ) -> Option<(u64, u64)> {
if segment_reader.max_doc() == 0 { if segment_reader.max_doc() == 0 {
@@ -102,11 +101,11 @@ fn compute_min_max_val(
} }
// some deleted documents, // some deleted documents,
// we need to recompute the max / min // we need to recompute the max / min
minmax( segment_reader
segment_reader .doc_ids_alive()
.doc_ids_alive() .map(|doc_id| u64_reader.get_val(doc_id as u64))
.map(|doc_id| u64_reader.get(doc_id)), .minmax()
) .into_option()
} }
struct TermOrdinalMapping { struct TermOrdinalMapping {
@@ -134,7 +133,7 @@ impl TermOrdinalMapping {
fn max_term_ord(&self) -> TermOrdinal { fn max_term_ord(&self) -> TermOrdinal {
self.per_segment_new_term_ordinals self.per_segment_new_term_ordinals
.iter() .iter()
.flat_map(|term_ordinals| term_ordinals.iter().max()) .flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
.max() .max()
.unwrap_or_default() .unwrap_or_default()
} }
@@ -342,12 +341,12 @@ impl IndexMerger {
.readers .readers
.iter() .iter()
.filter_map(|reader| { .filter_map(|reader| {
let u64_reader: DynamicFastFieldReader<u64> = let u64_reader: Arc<dyn Column<u64>> =
reader.fast_fields().typed_fast_field_reader(field).expect( reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \ "Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.", it should never happen.",
); );
compute_min_max_val(&u64_reader, reader) compute_min_max_val(&*u64_reader, reader)
}) })
.reduce(|a, b| (a.0.min(b.0), a.1.max(b.1))) .reduce(|a, b| (a.0.min(b.0), a.1.max(b.1)))
.expect("Unexpected error, empty readers in IndexMerger"); .expect("Unexpected error, empty readers in IndexMerger");
@@ -356,7 +355,7 @@ impl IndexMerger {
.readers .readers
.iter() .iter()
.map(|reader| { .map(|reader| {
let u64_reader: DynamicFastFieldReader<u64> = let u64_reader: Arc<dyn Column<u64>> =
reader.fast_fields().typed_fast_field_reader(field).expect( reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \ "Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.", it should never happen.",
@@ -373,16 +372,16 @@ impl IndexMerger {
#[derive(Clone)] #[derive(Clone)]
struct SortedDocIdFieldAccessProvider<'a> { struct SortedDocIdFieldAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocIdMapping, doc_id_mapping: &'a SegmentDocIdMapping,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>, fast_field_readers: &'a Vec<Arc<dyn Column<u64>>>,
stats: FastFieldStats, stats: FastFieldStats,
} }
impl<'a> FastFieldDataAccess for SortedDocIdFieldAccessProvider<'a> { impl<'a> Column for SortedDocIdFieldAccessProvider<'a> {
fn get_val(&self, doc: u64) -> u64 { fn get_val(&self, doc: u64) -> u64 {
let DocAddress { let DocAddress {
doc_id, doc_id,
segment_ord, segment_ord,
} = self.doc_id_mapping.get_old_doc_addr(doc as u32); } = self.doc_id_mapping.get_old_doc_addr(doc as u32);
self.fast_field_readers[segment_ord as usize].get(doc_id) self.fast_field_readers[segment_ord as usize].get_val(doc_id as u64)
} }
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> { fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
@@ -392,7 +391,7 @@ impl IndexMerger {
.map(|old_doc_addr| { .map(|old_doc_addr| {
let fast_field_reader = let fast_field_reader =
&self.fast_field_readers[old_doc_addr.segment_ord as usize]; &self.fast_field_readers[old_doc_addr.segment_ord as usize];
fast_field_reader.get(old_doc_addr.doc_id) fast_field_reader.get_val(old_doc_addr.doc_id as u64)
}), }),
) )
} }
@@ -429,7 +428,7 @@ impl IndexMerger {
let everything_is_in_order = reader_ordinal_and_field_accessors let everything_is_in_order = reader_ordinal_and_field_accessors
.into_iter() .into_iter()
.map(|reader| reader.1) .map(|(_, col)| Arc::new(col))
.tuple_windows() .tuple_windows()
.all(|(field_accessor1, field_accessor2)| { .all(|(field_accessor1, field_accessor2)| {
if sort_by_field.order.is_asc() { if sort_by_field.order.is_asc() {
@@ -444,7 +443,7 @@ impl IndexMerger {
pub(crate) fn get_sort_field_accessor( pub(crate) fn get_sort_field_accessor(
reader: &SegmentReader, reader: &SegmentReader,
sort_by_field: &IndexSortByField, sort_by_field: &IndexSortByField,
) -> crate::Result<impl FastFieldReader<u64>> { ) -> crate::Result<Arc<dyn Column>> {
let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?; let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Ok(value_accessor) Ok(value_accessor)
@@ -453,7 +452,7 @@ impl IndexMerger {
pub(crate) fn get_reader_with_sort_field_accessor( pub(crate) fn get_reader_with_sort_field_accessor(
&self, &self,
sort_by_field: &IndexSortByField, sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(SegmentOrdinal, impl FastFieldReader<u64> + Clone)>> { ) -> crate::Result<Vec<(SegmentOrdinal, Arc<dyn Column>)>> {
let reader_ordinal_and_field_accessors = self let reader_ordinal_and_field_accessors = self
.readers .readers
.iter() .iter()
@@ -506,8 +505,8 @@ impl IndexMerger {
doc_id_reader_pair doc_id_reader_pair
.into_iter() .into_iter()
.kmerge_by(|a, b| { .kmerge_by(|a, b| {
let val1 = a.2.get(a.0); let val1 = a.2.get_val(a.0 as u64);
let val2 = b.2.get(b.0); let val2 = b.2.get_val(b.0 as u64);
if sort_by_field.order == Order::Asc { if sort_by_field.order == Order::Asc {
val1 < val2 val1 < val2
} else { } else {
@@ -578,7 +577,7 @@ impl IndexMerger {
offsets: &'a [u64], offsets: &'a [u64],
stats: FastFieldStats, stats: FastFieldStats,
} }
impl<'a> FastFieldDataAccess for FieldIndexAccessProvider<'a> { impl<'a> Column for FieldIndexAccessProvider<'a> {
fn get_val(&self, doc: u64) -> u64 { fn get_val(&self, doc: u64) -> u64 {
self.offsets[doc as usize] self.offsets[doc as usize]
} }
@@ -619,7 +618,7 @@ impl IndexMerger {
.map(|reader| { .map(|reader| {
let u64s_reader: MultiValuedFastFieldReader<u64> = reader let u64s_reader: MultiValuedFastFieldReader<u64> = reader
.fast_fields() .fast_fields()
.typed_fast_field_multi_reader(field) .typed_fast_field_multi_reader::<u64>(field)
.expect( .expect(
"Failed to find index for multivalued field. This is a bug in tantivy, \ "Failed to find index for multivalued field. This is a bug in tantivy, \
please report.", please report.",
@@ -669,7 +668,7 @@ impl IndexMerger {
{ {
let mut serialize_vals = let mut serialize_vals =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?; fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
let mut vals = Vec::with_capacity(100); let mut vals: Vec<u64> = Vec::with_capacity(100);
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() { for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let term_ordinal_mapping: &[TermOrdinal] = let term_ordinal_mapping: &[TermOrdinal] =
@@ -743,7 +742,7 @@ impl IndexMerger {
for reader in &self.readers { for reader in &self.readers {
let ff_reader: MultiValuedFastFieldReader<u64> = reader let ff_reader: MultiValuedFastFieldReader<u64> = reader
.fast_fields() .fast_fields()
.typed_fast_field_multi_reader(field) .typed_fast_field_multi_reader::<u64>(field)
.expect( .expect(
"Failed to find multivalued fast field reader. This is a bug in tantivy. \ "Failed to find multivalued fast field reader. This is a bug in tantivy. \
Please report.", Please report.",
@@ -778,14 +777,14 @@ impl IndexMerger {
offsets: Vec<u64>, offsets: Vec<u64>,
stats: FastFieldStats, stats: FastFieldStats,
} }
impl<'a> FastFieldDataAccess for SortedDocIdMultiValueAccessProvider<'a> { impl<'a> Column for SortedDocIdMultiValueAccessProvider<'a> {
fn get_val(&self, pos: u64) -> u64 { fn get_val(&self, pos: u64) -> u64 {
// use the offsets index to find the doc_id which will contain the position. // use the offsets index to find the doc_id which will contain the position.
// the offsets are strictly increasing so we can do a simple search on it. // the offsets are strictly increasing so we can do a simple search on it.
let new_doc_id: DocId = let new_doc_id: DocId =
self.offsets self.offsets
.iter() .iter()
.position(|offset| offset > pos) .position(|&offset| offset > pos)
.expect("pos is out of bounds") as DocId .expect("pos is out of bounds") as DocId
- 1u32; - 1u32;
@@ -1207,7 +1206,6 @@ mod tests {
}; };
use crate::collector::{Count, FacetCollector}; use crate::collector::{Count, FacetCollector};
use crate::core::Index; use crate::core::Index;
use crate::fastfield::FastFieldReader;
use crate::query::{AllQuery, BooleanQuery, Scorer, TermQuery}; use crate::query::{AllQuery, BooleanQuery, Scorer, TermQuery};
use crate::schema::{ use crate::schema::{
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term, Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,

View File

@@ -2,7 +2,7 @@
mod tests { mod tests {
use crate::collector::TopDocs; use crate::collector::TopDocs;
use crate::core::Index; use crate::core::Index;
use crate::fastfield::{AliveBitSet, FastFieldReader, MultiValuedFastFieldReader}; use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::{ use crate::schema::{
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions, self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
@@ -186,17 +186,17 @@ mod tests {
let fast_fields = segment_reader.fast_fields(); let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64(int_field).unwrap(); let fast_field = fast_fields.u64(int_field).unwrap();
assert_eq!(fast_field.get(5u32), 1u64); assert_eq!(fast_field.get_val(5), 1u64);
assert_eq!(fast_field.get(4u32), 2u64); assert_eq!(fast_field.get_val(4), 2u64);
assert_eq!(fast_field.get(3u32), 3u64); assert_eq!(fast_field.get_val(3), 3u64);
if force_disjunct_segment_sort_values { if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get(2u32), 20u64); assert_eq!(fast_field.get_val(2u64), 20u64);
assert_eq!(fast_field.get(1u32), 100u64); assert_eq!(fast_field.get_val(1u64), 100u64);
} else { } else {
assert_eq!(fast_field.get(2u32), 10u64); assert_eq!(fast_field.get_val(2u64), 10u64);
assert_eq!(fast_field.get(1u32), 20u64); assert_eq!(fast_field.get_val(1u64), 20u64);
} }
assert_eq!(fast_field.get(0u32), 1_000u64); assert_eq!(fast_field.get_val(0u64), 1_000u64);
// test new field norm mapping // test new field norm mapping
{ {
@@ -373,12 +373,12 @@ mod tests {
let fast_fields = segment_reader.fast_fields(); let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64(int_field).unwrap(); let fast_field = fast_fields.u64(int_field).unwrap();
assert_eq!(fast_field.get(0u32), 1u64); assert_eq!(fast_field.get_val(0), 1u64);
assert_eq!(fast_field.get(1u32), 2u64); assert_eq!(fast_field.get_val(1), 2u64);
assert_eq!(fast_field.get(2u32), 3u64); assert_eq!(fast_field.get_val(2), 3u64);
assert_eq!(fast_field.get(3u32), 10u64); assert_eq!(fast_field.get_val(3), 10u64);
assert_eq!(fast_field.get(4u32), 20u64); assert_eq!(fast_field.get_val(4), 20u64);
assert_eq!(fast_field.get(5u32), 1_000u64); assert_eq!(fast_field.get_val(5), 1_000u64);
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> { let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
let mut vals = vec![]; let mut vals = vec![];
@@ -478,11 +478,12 @@ mod tests {
#[cfg(all(test, feature = "unstable"))] #[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge { mod bench_sorted_index_merge {
use std::sync::Arc;
use fastfield_codecs::Column;
use test::{self, Bencher}; use test::{self, Bencher};
use crate::core::Index; use crate::core::Index;
// use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::indexer::merger::IndexMerger; use crate::indexer::merger::IndexMerger;
use crate::schema::{Cardinality, NumericOptions, Schema}; use crate::schema::{Cardinality, NumericOptions, Schema};
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order}; use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
@@ -534,7 +535,7 @@ mod bench_sorted_index_merge {
b.iter(|| { b.iter(|| {
let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
let reader = &merger.readers[doc_addr.segment_ord as usize]; let reader = &merger.readers[doc_addr.segment_ord as usize];
let u64_reader: DynamicFastFieldReader<u64> = let u64_reader: Arc<dyn Column<u64>> =
reader.fast_fields().typed_fast_field_reader(field).expect( reader.fast_fields().typed_fast_field_reader(field).expect(
"Failed to find a reader for single fast field. This is a tantivy bug and \ "Failed to find a reader for single fast field. This is a tantivy bug and \
it should never happen.", it should never happen.",
@@ -544,7 +545,7 @@ mod bench_sorted_index_merge {
// add values in order of the new doc_ids // add values in order of the new doc_ids
let mut val = 0; let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids { for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get(doc_id); val = field_reader.get_val(doc_id as u64);
} }
val val

View File

@@ -25,39 +25,10 @@ use crate::indexer::{
DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry, DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
SegmentSerializer, SegmentSerializer,
}; };
use crate::schema::Schema;
use crate::{FutureResult, Opstamp}; use crate::{FutureResult, Opstamp};
const NUM_MERGE_THREADS: usize = 4; const NUM_MERGE_THREADS: usize = 4;
/// Save the index meta file.
/// This operation is atomic :
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// Save the index meta file. /// Save the index meta file.
/// This operation is atomic: /// This operation is atomic:
/// Either /// Either
@@ -67,7 +38,7 @@ pub fn save_new_metas(
/// and flushed. /// and flushed.
/// ///
/// This method is not part of tantivy's public API /// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> { pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
info!("save metas"); info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?; let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer. // Just adding a new line at the end of the buffer.

View File

@@ -80,8 +80,8 @@ impl SegmentWriter {
pub fn for_segment( pub fn for_segment(
memory_budget_in_bytes: usize, memory_budget_in_bytes: usize,
segment: Segment, segment: Segment,
schema: Schema,
) -> crate::Result<SegmentWriter> { ) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone(); let tokenizer_manager = segment.index().tokenizers().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?; let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?; let segment_serializer = SegmentSerializer::for_segment(segment, false)?;

View File

@@ -429,7 +429,6 @@ pub mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::FastFieldReader;
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery; use crate::query::BooleanQuery;
use crate::schema::*; use crate::schema::*;
@@ -1036,21 +1035,21 @@ pub mod tests {
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned); let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
assert!(fast_field_reader_opt.is_ok()); assert!(fast_field_reader_opt.is_ok());
let fast_field_reader = fast_field_reader_opt.unwrap(); let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4u64) assert_eq!(fast_field_reader.get_val(0), 4u64)
} }
{ {
let fast_field_reader_res = segment_reader.fast_fields().i64(fast_field_signed); let fast_field_reader_res = segment_reader.fast_fields().i64(fast_field_signed);
assert!(fast_field_reader_res.is_ok()); assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap(); let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64) assert_eq!(fast_field_reader.get_val(0), 4i64)
} }
{ {
let fast_field_reader_res = segment_reader.fast_fields().f64(fast_field_float); let fast_field_reader_res = segment_reader.fast_fields().f64(fast_field_float);
assert!(fast_field_reader_res.is_ok()); assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap(); let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4f64) assert_eq!(fast_field_reader.get_val(0), 4f64)
} }
Ok(()) Ok(())
} }

View File

@@ -227,7 +227,7 @@ pub mod tests {
{ {
let mut segment_writer = let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap(); SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
{ {
// checking that position works if the field has two values // checking that position works if the field has two values
let op = AddOperation { let op = AddOperation {

View File

@@ -339,7 +339,7 @@ impl StoreReader {
async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult<Block> { async fn read_block_async(&self, checkpoint: &Checkpoint) -> crate::AsyncIoResult<Block> {
let cache_key = checkpoint.byte_range.start; let cache_key = checkpoint.byte_range.start;
if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) { if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) {
return Ok(block.clone()); return Ok(block);
} }
let compressed_block = self let compressed_block = self

View File

@@ -172,8 +172,7 @@ where TValueReader: value::ValueReader
} }
pub fn suffix(&self) -> &[u8] { pub fn suffix(&self) -> &[u8] {
&self self.block_reader
.block_reader
.buffer_from_to(self.suffix_start, self.suffix_end) .buffer_from_to(self.suffix_start, self.suffix_end)
} }

View File

@@ -50,7 +50,7 @@ pub struct SSTableIndexBuilder {
/// matches `left <= left' < right`. /// matches `left <= left' < right`.
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) { fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
assert!(&left[..] < right); assert!(&left[..] < right);
let common_len = common_prefix_len(&left, right); let common_len = common_prefix_len(left, right);
if left.len() == common_len { if left.len() == common_len {
return; return;
} }